def main():
    '''
    Command-line entry point: fills the containers, populates the databases
    of the referenced species and then runs the requested alignment
    population step(s).

    Exactly one mode argument is expected: blastn, tblastn, SW_gene, SW_exon
    or all ("all" runs the four steps in that order).

    The mode is validated before any work is started. When it is missing or
    unknown, the usage message is printed and the process exits with
    status 1 (raises SystemExit).
    '''
    alignment_modes = ("blastn", "tblastn", "SW_gene", "SW_exon")

    # sys.argv[0] is always present, so a mode was supplied only when the
    # list has at least two entries.
    if len(sys.argv) < 2 or sys.argv[1] not in alignment_modes + ("all",):
        print("Usage: {0} <blastn | tblastn | SW_gene | SW_exon | all> \n".format(sys.argv[0]))
        sys.exit(1)
    mode = sys.argv[1]

    referenced_species = "Homo_sapiens"
    # 'ENSP00000253108', 'Ailuropoda_melanoleuca', 'ensembl'

    # The first element of every raw entry is used as the protein id.
    protein_list = [protein_tuple[0]
                    for protein_tuple in FileUtilities.get_protein_list()]

    fill_all_containers(False)
    populate_referenced_species_databases(protein_list, referenced_species)

    # Built only after validation so that a usage error never touches the
    # pipeline functions.
    populate_step = {
        "blastn": populate_blastn_alignments,
        "tblastn": populate_tblastn_alignments,
        "SW_gene": populate_SW_gene_alignments,
        "SW_exon": populate_SW_exon_alignments,
    }
    selected_modes = alignment_modes if mode == "all" else (mode,)
    for selected in selected_modes:
        populate_step[selected](protein_list)
def fill_all_containers(load_alignments):
    '''
    Fills all the containers with their corresponding data.
    The containers are: data maps, proteins, genes, transcripts, ensembl
    exons, and all the alignment exons.

    load_alignments -- when true, the blastn, tblastn, sw_gene and sw_exon
                       exons are loaded as well and the alignment
                       post-processing steps are run afterwards.

    Nothing beyond the directory tree generation happens when the protein
    configuration batch load returns a false value.
    '''
    crawler = DirectoryCrawler()
    alignment_algorithms = ["blastn", "tblastn", "sw_gene", "sw_exon"]

    # The raw list holds tuples; flattening them and keeping every second
    # value leaves only the protein ids.
    flattened = list(chain.from_iterable(FileUtilities.get_protein_list()))
    protein_list = flattened[0::2]

    for protein_id in protein_list:
        crawler.generate_directory_tree(protein_id)

    if not load_protein_configuration_batch(protein_list):
        return

    for exon_source in ("ensembl", "genewise"):
        load_exon_configuration_batch(protein_list, exon_source)

    # NOTE(review): the three post-processing calls below are treated as
    # part of the load_alignments branch -- confirm this is intended.
    if not load_alignments:
        return

    for algorithm in alignment_algorithms:
        load_exon_configuration_batch(protein_list, algorithm)

    set_frames_to_coding_exons_batch(protein_list)
    remove_overlapping_alignments_batch(protein_list, ["blastn", "tblastn"])
    annotate_spurious_alignments_batch(protein_list, alignment_algorithms)
def populate_exon_table():
    '''
    Collects exons for every (protein, species, exon type) combination and
    passes the collected list to the database manager, first to update the
    exon table and then to update the alignment table.

    Objects whose type is exactly Exon are collected only when their
    viability attribute is true; every other object returned by the exon
    container is collected unconditionally. A KeyError raised while handling
    a combination is ignored and the rest of that combination is skipped.
    '''
    db_manager = DBManager.Instance()
    exon_container = ExonContainer.Instance()
    DataMapContainer.Instance()  # the instance is not used below; only the call is kept

    protein_entries = FileUtilities.get_protein_list()
    species_names = FileUtilities.get_default_species_list()
    exon_types = ("ensembl", "genewise", "blastn", "tblastn", "sw_gene")

    collected = []
    for protein_entry in protein_entries:
        for species in species_names:
            for exon_type in exon_types:
                key = (protein_entry[0], species, exon_type)
                try:
                    for exon in exon_container.get(key).get_ordered_exons():
                        # Only genuine Exon objects are filtered by viability.
                        if type(exon) is not Exon or exon.viability:
                            collected.append(exon)
                except KeyError:
                    continue

    db_manager.update_exon_table(collected)
    db_manager.update_alignment_table(collected)
def populate_ortholog_table():
    '''
    Looks up the data map of every (protein, species) pair in the data map
    container; pairs whose lookup raises KeyError are skipped.

    NOTE(review): the gathered data maps are neither returned nor handed to
    the database manager, so this function currently has no visible effect
    on any table -- confirm whether a DBManager update call is missing.
    '''
    dbm = DBManager.Instance()  # not used below
    data_maps = DataMapContainer.Instance()
    protein_entries = FileUtilities.get_protein_list()
    species_names = FileUtilities.get_default_species_list()

    gathered = []
    for protein_entry in protein_entries:
        for species in species_names:
            try:
                data_map = data_maps.get((protein_entry[0], species))
            except KeyError:
                continue
            gathered.append(data_map)
def populate_gene_table():
    '''
    Looks up the data map of every (protein, species) pair, prints the
    resulting list and passes it to the database manager to update the
    gene table.

    When a lookup raises KeyError, an error line naming the protein id is
    printed and the pair is left out.
    '''
    db_manager = DBManager.Instance()
    data_maps = DataMapContainer.Instance()
    protein_entries = FileUtilities.get_protein_list()
    species_names = FileUtilities.get_default_species_list()

    gene_data_maps = []
    for protein_entry in protein_entries:
        for species in species_names:
            try:
                gene_data_maps.append(data_maps.get((protein_entry[0], species)))
            except KeyError:
                print("PROTEIN_ID %s ERROR" % (protein_entry[0]))

    print(gene_data_maps)
    db_manager.update_gene_table(gene_data_maps)
def populate_protein_table():
    '''
    For every (protein, species) pair, fetches the data map, uses its
    protein_id attribute to fetch the protein from the protein container,
    and finally passes all fetched proteins to the database manager to
    update the protein table.

    When either lookup raises KeyError, an error line naming the reference
    protein id is printed and the pair is left out.
    '''
    db_manager = DBManager.Instance()
    protein_container = ProteinContainer.Instance()
    data_maps = DataMapContainer.Instance()
    protein_entries = FileUtilities.get_protein_list()
    species_names = FileUtilities.get_default_species_list()

    proteins = []
    for protein_entry in protein_entries:
        for species in species_names:
            try:
                data_map = data_maps.get((protein_entry[0], species))
                proteins.append(protein_container.get(data_map.protein_id))
            except KeyError:
                print("PROTEIN_ID %s ERROR" % (protein_entry[0]))

    db_manager.update_protein_table(proteins)
def populate_exon_alignment_piece_table():
    '''
    Gathers [reference exon id, species, alignment piece] triples for the
    coding exons of every reference protein ('Homo_sapiens', 'ensembl').

    For each coding exon and species, the best exon alignment is fetched;
    when it exists and has an sw_gene_alignment, its alignment pieces of
    type 'coding' or 'insertion' are gathered. A KeyError raised while
    handling a (protein, species) pair is printed and the pair is abandoned.

    NOTE(review): the gathered list is neither returned nor handed to the
    database manager -- confirm whether a DBManager update call is missing.
    '''
    dbm = DBManager.Instance()  # not used below
    exon_container = ExonContainer.Instance()
    best_alignments = BestExonAlignmentContainer.Instance()
    protein_entries = FileUtilities.get_protein_list()
    species_names = FileUtilities.get_default_species_list()
    wanted_piece_types = ('coding', 'insertion')

    gathered = []
    for ref_protein_id, _exon_num in protein_entries:
        for species in species_names:
            try:
                reference_exons = exon_container.get(
                    (ref_protein_id, 'Homo_sapiens', 'ensembl'))
                for ref_exon in reference_exons.get_coding_exons():
                    best = best_alignments.get(ref_exon.exon_id, species)
                    if not best:
                        continue
                    sw_gene = best.sw_gene_alignment
                    if not sw_gene:
                        continue
                    for piece in sw_gene.alignment_pieces:
                        if piece.type in wanted_piece_types:
                            gathered.append([ref_exon.exon_id, species, piece])
            except KeyError as e:
                print(e)