def main():
    """Reconstruct the alignment for one exon given on the command line.

    Command line: ``<species> <exon_id> <exon_known> <output_name_root>``.
    ``exon_id`` and ``exon_known`` are converted to ``int``.  Prints a usage
    message and exits with status 1 when fewer than four arguments are given.

    Returns:
        True once ``reconstruct_alignment`` has returned.
    """
    if len(sys.argv) < 5:
        print("Usage: %s <species> <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is absent under `python -S`
        sys.exit(1)

    species = sys.argv[1]
    exon_id = int(sys.argv[2])
    exon_known = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    # Flip to True to use the default (local) connection settings.
    local_db = False
    if local_db:
        conn_args = {}
    else:
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to a configuration file kept out of version control.
        conn_args = dict(user="******", passwd="sqljupitersql",
                         host="jupiter.private.bii", port=3307)

    # the same settings are used for the connection and the configuration reader
    db = connect_to_mysql(**conn_args)
    cfg = ConfigurationReader(**conn_args)
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)
        sorted_species = species_sort(cursor, all_species, species)
        reconstruct_alignment(cursor, cfg, ensembl_db_name, species, exon_id,
                              exon_known, sorted_species, output_fnm_root)
    finally:
        # release the database handles even if the reconstruction raises
        cursor.close()
        db.close()

    return True
def main():
    """Reconstruct the alignment for one exon given on the command line.

    Command line: ``<species> <exon_id> <exon_known> <output_name_root>``.
    ``exon_id`` and ``exon_known`` are converted to ``int``.  Prints a usage
    message and exits with status 1 when fewer than four arguments are given.

    Returns:
        True once ``reconstruct_alignment`` has returned.
    """
    if len(sys.argv) < 5:
        print("Usage: %s <species> <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is absent under `python -S`
        sys.exit(1)

    species = sys.argv[1]
    exon_id = int(sys.argv[2])
    exon_known = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)
        sorted_species = species_sort(cursor, all_species, species)
        reconstruct_alignment(cursor, cfg, ensembl_db_name, species, exon_id,
                              exon_known, sorted_species, output_fnm_root)
    finally:
        # release the database handles even if the reconstruction raises
        cursor.close()
        db.close()

    return True
def annotate(gene_list, db_info):
    """Write an annotated FASTA entry for every gene in *gene_list*.

    For each gene id the function looks up the stable id, asks
    ``find_annotation`` for an annotation (species are tried in the order of
    ``species_list``), fetches the canonical translation and writes a header
    line followed by that translation to ``temp_out.fasta``.

    Args:
        gene_list: gene ids to annotate; progress is printed every 100 genes.
        db_info: four-item sequence unpacked as
            ``[local_db, all_species, ensembl_db_name, species]``
            (``local_db`` is not used here).

    Exits with status 1 unless ``species`` is ``oryctolagus_cuniculus``,
    because the preferred-species list below is hard-coded for the rabbit.
    """
    import sys  # local import: this block does not otherwise rely on sys

    [local_db, all_species, ensembl_db_name, species] = db_info

    db = connect_to_mysql()
    # cfg is not used below; the constructor call is kept because its side
    # effects cannot be ruled out from this block alone
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    out = None
    try:
        if verbose:
            print("thread %s annotating %s " % (get_thread_name(), species))

        if not species == "oryctolagus_cuniculus":
            print("The preferred list of species is hardcoded for the rabbit. Consider modifying.")
            sys.exit(1)

        preferred_species = [species, "mus_musculus", "rattus_norvegicus", "homo_sapiens"]
        nearest_species_list = species_sort(cursor, all_species, species)
        # a list comprehension (rather than list + filter(...)) also works
        # where filter() returns an iterator
        species_list = preferred_species + [
            name for name in nearest_species_list if name not in preferred_species
        ]

        # NOTE(review): fixed file name opened in "w" mode; if annotate runs in
        # several threads (as the message above suggests) they would overwrite
        # each other's output - confirm with the caller.
        # Assumes erropen returns an open, writable file object - TODO confirm.
        out = erropen("temp_out.fasta", "w")

        total = len(gene_list)
        # enumerate replaces gene_list.index(gene_id), which rescanned the list
        # on every iteration and reported the wrong position for repeated ids
        for position, gene_id in enumerate(gene_list):
            switch_to_db(cursor, ensembl_db_name[species])

            # get stable id and description of this gene
            stable_id = gene2stable(cursor, gene_id)
            if position % 100 == 0:
                print("%d out of %d" % (position, total))
            if verbose:
                print("=============================================")
                print("%s %s" % (gene_id, stable_id))

            # find the annotation from the preferred source organism
            [annot_source, orthology_type, annotation, ortho_stable_ids] = find_annotation(
                cursor, ensembl_db_name, species_list, gene_id
            )
            if verbose:
                print("%s ** %s ** %s" % (annot_source, orthology_type, annotation))

            # find splices (for now find the canonical splice);
            # find_annotation is followed by a switch back to this species' db
            switch_to_db(cursor, ensembl_db_name[species])
            canonical_splice = get_canonical_transl(acg, cursor, gene_id, species)

            # output
            if orthology_type == "self" or annotation == "none":
                header = ">{0} {1}".format(stable_id, annotation)
            else:
                header = ">{0} {1} [by sim to {2}, {3}]".format(
                    stable_id, annotation, annot_source, ortho_stable_ids
                )
            out.write(str(header) + "\n")
            out.write(str(canonical_splice) + "\n")
    finally:
        # close the output so buffered entries reach the disk, then release
        # the database handles, on every exit path
        if out is not None:
            out.close()
        cursor.close()
        db.close()
def annotate(gene_list, db_info):
    """Write an annotated FASTA entry for every gene in *gene_list*.

    For each gene id the function looks up the stable id, asks
    ``find_annotation`` for an annotation (species are tried in the order of
    ``species_list``), fetches the canonical translation and writes a header
    line followed by that translation to ``temp_out.fasta``.

    Args:
        gene_list: gene ids to annotate; progress is printed every 100 genes.
        db_info: four-item sequence unpacked as
            ``[local_db, all_species, ensembl_db_name, species]``
            (``local_db`` is not used here).

    Exits with status 1 unless ``species`` is ``oryctolagus_cuniculus``,
    because the preferred-species list below is hard-coded for the rabbit.
    """
    import sys  # local import: this block does not otherwise rely on sys

    [local_db, all_species, ensembl_db_name, species] = db_info

    db = connect_to_mysql()
    # cfg is not used below; the constructor call is kept because its side
    # effects cannot be ruled out from this block alone
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    out = None
    try:
        if verbose:
            print("thread %s annotating %s " % (get_thread_name(), species))

        if not species == 'oryctolagus_cuniculus':
            print('The preferred list of species is hardcoded for the rabbit. Consider modifying.')
            sys.exit(1)

        preferred_species = [
            species, 'mus_musculus', 'rattus_norvegicus', 'homo_sapiens'
        ]
        nearest_species_list = species_sort(cursor, all_species, species)
        # a list comprehension (rather than list + filter(...)) also works
        # where filter() returns an iterator
        species_list = preferred_species + [
            name for name in nearest_species_list
            if name not in preferred_species
        ]

        # NOTE(review): fixed file name opened in "w" mode; if annotate runs in
        # several threads (as the message above suggests) they would overwrite
        # each other's output - confirm with the caller.
        # Assumes erropen returns an open, writable file object - TODO confirm.
        out = erropen("temp_out.fasta", "w")

        total = len(gene_list)
        # enumerate replaces gene_list.index(gene_id), which rescanned the list
        # on every iteration and reported the wrong position for repeated ids
        for position, gene_id in enumerate(gene_list):
            switch_to_db(cursor, ensembl_db_name[species])

            # get stable id and description of this gene
            stable_id = gene2stable(cursor, gene_id)
            if position % 100 == 0:
                print("%d out of %d" % (position, total))
            if verbose:
                print("=============================================")
                print("%s %s" % (gene_id, stable_id))

            # find the annotation from the preferred source organism
            [annot_source, orthology_type, annotation,
             ortho_stable_ids] = find_annotation(cursor, ensembl_db_name,
                                                 species_list, gene_id)
            if verbose:
                print("%s ** %s ** %s" % (annot_source, orthology_type, annotation))

            # find splices (for now find the canonical splice);
            # find_annotation is followed by a switch back to this species' db
            switch_to_db(cursor, ensembl_db_name[species])
            canonical_splice = get_canonical_transl(acg, cursor, gene_id, species)

            # output
            if orthology_type == 'self' or annotation == 'none':
                header = ">{0} {1}".format(stable_id, annotation)
            else:
                header = ">{0} {1} [by sim to {2}, {3}]".format(
                    stable_id, annotation, annot_source, ortho_stable_ids)
            out.write(str(header) + "\n")
            out.write(str(canonical_splice) + "\n")
    finally:
        # close the output so buffered entries reach the disk, then release
        # the database handles, on every exit path
        if out is not None:
            out.close()
        cursor.close()
        db.close()
def get_template(cursor, ensembl_db_name, map_table, species, he):
    """Choose a template exon for human exon *he* from another species.

    Candidate species are the keys of *map_table*, ordered by ``species_sort``
    relative to *species* with the first entry of that ordering dropped.  The
    first candidate whose mapped exon passes every filter in the loop is used.

    Returns:
        ``[template_species, exon_seq_id, dna_seq, protein_seq, similarity]``
        for the chosen candidate, or ``None`` when no candidate qualifies.
    """
    candidates = species_sort(cursor, list(map_table.keys()), species)[1:]
    # the lamprey is an outlier to everything else: walk its list backwards
    if species == 'petromyzon_marinus':
        candidates.reverse()

    Exon()  # result unused; call retained from the original implementation
    human_len = 1.0 * len(he.pepseq)

    chosen = None
    for candidate in candidates:
        mapping = map_table[candidate][he]
        # skip species with no map for this exon, or with a flagged map
        if not mapping or mapping.warning:
            continue

        seqs = get_exon_seqs(cursor, mapping.exon_id_2, mapping.exon_known_2,
                             ensembl_db_name[candidate])
        if not seqs:
            continue
        (exon_seq_id, protein_seq, _transl_start, _transl_end,
         left_flank, right_flank, dna_seq) = seqs

        # length ratio to the human peptide must be at least 0.3 both ways
        if len(protein_seq) / human_len < 0.3:
            continue
        if human_len / len(protein_seq) < 0.3:
            continue
        # both flanking sequences are required
        if not (left_flank and right_flank):
            continue
        if "XX" in protein_seq:
            continue

        chosen = candidate
        similarity_to_human = mapping.similarity
        break

    if not chosen:
        return None
    return [chosen, exon_seq_id, dna_seq, protein_seq, similarity_to_human]
def get_template(cursor, ensembl_db_name, map_table, species, he):
    """Choose a template exon for human exon *he* from another species.

    Candidate species are the keys of *map_table*, ordered by ``species_sort``
    relative to *species* with the first entry of that ordering dropped.  The
    first candidate whose mapped exon passes every filter in the loop is used.

    Returns:
        ``[template_species, exon_seq_id, dna_seq, protein_seq, similarity]``
        for the chosen candidate, or ``None`` when no candidate qualifies.
    """
    candidates = species_sort(cursor, map_table.keys(), species)[1:]
    # the lamprey is an outlier to everything else: walk its list backwards
    if species == 'petromyzon_marinus':
        candidates.reverse()

    Exon()  # result unused; call retained from the original implementation
    human_len = 1.0 * len(he.pepseq)

    chosen = None
    for candidate in candidates:
        mapping = map_table[candidate][he]
        # skip species with no map for this exon, or with a flagged map
        if not mapping or mapping.warning:
            continue

        seqs = get_exon_seqs(cursor, mapping.exon_id_2, mapping.exon_known_2,
                             ensembl_db_name[candidate])
        if not seqs:
            continue
        (exon_seq_id, protein_seq, _transl_start, _transl_end,
         left_flank, right_flank, dna_seq) = seqs

        # length ratio to the human peptide must be at least 0.3 both ways
        if len(protein_seq) / human_len < 0.3:
            continue
        if human_len / len(protein_seq) < 0.3:
            continue
        # both flanking sequences are required
        if not (left_flank and right_flank):
            continue
        if "XX" in protein_seq:
            continue

        chosen = candidate
        similarity_to_human = mapping.similarity
        break

    if not chosen:
        return None
    return [chosen, exon_seq_id, dna_seq, protein_seq, similarity_to_human]
def main():
    """Reconstruct the alignment for one exon given on the command line.

    Command line: ``<species> <exon_id> <exon_known> <output_name_root>``.
    ``exon_id`` and ``exon_known`` are converted to ``int``.  Prints a usage
    message and exits with status 1 when fewer than four arguments are given.

    Returns:
        True once ``reconstruct_alignment`` has returned.
    """
    if len(sys.argv) < 5:
        print("Usage: %s <species> <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is absent under `python -S`
        sys.exit(1)

    species = sys.argv[1]
    exon_id = int(sys.argv[2])
    exon_known = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)
        sorted_species = species_sort(cursor, all_species, species)
        reconstruct_alignment(cursor, cfg, ensembl_db_name, species, exon_id,
                              exon_known, sorted_species, output_fnm_root)
    finally:
        # release the database handles even if the reconstruction raises
        cursor.close()
        db.close()

    return True
def find_missing_exons(human_gene_list, db_info):
    """Search other species for exons that have no usable map to a human exon.

    For every gene id in *human_gene_list* the function collects the tracked
    human exons, builds ``map_table[species][human_exon]`` holding the most
    similar map above the configured threshold (or None), and then, species by
    species, calls ``find_NNN`` for each human exon that lacks a map without a
    warning.  The search window passed to ``find_NNN`` is framed by the
    neighbouring regions (both neighbours known), or by one known neighbour
    plus a window of ``MAX_SEARCH_LENGTH`` (only one side known).

    Args:
        human_gene_list: human gene ids to process.
        db_info: three-item sequence unpacked as
            ``[local_db, ensembl_db_name, method]``; ``method`` is forwarded to
            ``find_NNN``, ``ensembl_db_name`` is immediately replaced by the
            value returned from ``get_species``, ``local_db`` is not used.

    Returns nothing; progress and counters are printed.
    """
    #
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # NOTE(review): cursor and db are not closed anywhere in the visible body.

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    # NOTE(review): `found` is printed below but never incremented in this function.
    found = 0
    sought = 0
    unsequenced = 0
    #human_gene_list.reverse()
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose: print(human_gene_id, human_stable, human_description)

        # progress counter
        gene_ct += 1
        if (not gene_ct % 10):
            print("processed ", gene_ct, " out of ", len(human_gene_list), "genes")
            print("exons found: ", found, " out of ", sought, "sought")

        # find all human exons for this gene that we are tracking in the database
        human_exons = [
            e for e in gene2exon_list(cursor, human_gene_id)
            if e.covering_exon < 0 and e.is_canonical and e.is_known
        ]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        ##################################################################################
        ##################################################################################
        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        #################
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known)  # exon data
            for m in maps_for_exon[he]:
                #if m.source == 'usearch': continue
                #if m.source == 'sw_sharp': continue
                #if m.source == 'sw_sharp':
                #    print 'sw_sharp'
                #if m.source == 'usearch':
                #    print 'usearch', m.similarity, m.species_2, m.exon_id_1, m.exon_id_2
                if m.similarity < min_similarity: continue
                # keep only the most similar map per (species, human exon)
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            one_exon_found = False
            for he in human_exons:
                if map_table[species][he]:
                    one_exon_found = True
                    break
            if not one_exon_found:
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq = get_exon_pepseq(cursor, he, ensembl_db_name['homo_sapiens'])
            if len(he.pepseq) < 3:  # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in list(map_table.keys()):
                if species == 'homo_sapiens': continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found: bad_he.append(he)
        human_exons = [he for he in human_exons if not he in bad_he]

        # keep track of nearest neighbors for each human exon
        # (note: the local name `next` shadows the builtin inside this function)
        previous = {}
        next = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            if prev: next[prev] = he
            prev = he
        next[he] = None

        # fill, starting from the species that are nearest to the human
        if not list(map_table.keys()):
            continue  # whatever

        # NOTE(review): `species` here is whatever value the loops above left
        # behind, not necessarily 'homo_sapiens' as the variable name and the
        # comment above suggest - confirm the intended reference species.
        species_sorted_from_human = species_sort(cursor, list(map_table.keys()), species)[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            #if verbose: print he.exon_id, species
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:  # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                # exon is missing (or flagged) in this species: classify it by
                # which of its neighbouring human exons have a map here
                prev = previous[he]
                nxt = next[he]
                if prev and nxt and map_table[species][prev] and map_table[species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map: continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right: continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species], exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id, ensembl_db_name[species])
            if not gene_coords: continue
            [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file
                # get previous region
                prev_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # get following region
                next_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he])
                if not next_seq_region: continue

                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region,
                                 next_seq_region, template_info, mitochondrial, method)
                if reply == 'NNN': unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if m and m.warning: continue

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # get following region
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he])
                if not next_seq_region: continue
                # otherwise it is the last thing we found

                # the previous region is eyeballed from the next one
                # the previous and the next region frame the search region
                prev_seq_region = left_region(next_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region,
                                 next_seq_region, template_info, mitochondrial, method)
                if reply == 'NNN': unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if m and m.warning: continue

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # get preceding region
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # otherwise it is the last thing we found

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region,
                                 next_seq_region, template_info, mitochondrial, method)
                if reply == 'NNN': unsequenced += 1

            print(species, "sought", sought, " unseq", unsequenced)
def find_missing_exons(human_gene_list, db_info):
    """Search other species for exons that have no usable map to a human exon.

    For every gene id in *human_gene_list* the function collects the tracked
    human exons, builds ``map_table[species][human_exon]`` holding the most
    similar map above the configured threshold (or None), and then, species by
    species, calls ``find_NNN`` for each human exon that lacks a map without a
    warning.  The search window passed to ``find_NNN`` is framed by the
    neighbouring regions (both neighbours known), or by one known neighbour
    plus a window of ``MAX_SEARCH_LENGTH`` (only one side known).

    Args:
        human_gene_list: human gene ids to process.
        db_info: three-item sequence unpacked as
            ``[local_db, ensembl_db_name, method]``; ``method`` is forwarded to
            ``find_NNN``, ``ensembl_db_name`` is immediately replaced by the
            value returned from ``get_species``, ``local_db`` is not used.

    Returns nothing; progress and counters are printed.
    """
    #
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # NOTE(review): cursor and db are not closed anywhere in the visible body.

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species (cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db (cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    # NOTE(review): `found` is printed below but never incremented in this function.
    found = 0
    sought = 0
    unsequenced = 0
    #human_gene_list.reverse()
    for human_gene_id in human_gene_list:

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable (cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose: print human_gene_id, human_stable, human_description

        # progress counter
        gene_ct += 1
        if (not gene_ct%10):
            print "processed ", gene_ct, " out of ", len(human_gene_list), "genes"
            print "exons found: ", found, " out of ", sought, "sought"

        # find all human exons for this gene that we are tracking in the database
        human_exons = [e for e in gene2exon_list(cursor, human_gene_id)
                       if e.covering_exon < 0 and e.is_canonical and e.is_known]
        if not human_exons:
            print "\t\t no exons found"
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable (cursor, he.exon_id)

        ##################################################################################
        ##################################################################################
        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        #################
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known) # exon data
            for m in maps_for_exon[he]:
                #if m.source == 'usearch': continue
                #if m.source == 'sw_sharp': continue
                #if m.source == 'sw_sharp':
                #    print 'sw_sharp'
                #if m.source == 'usearch':
                #    print 'usearch', m.similarity, m.species_2, m.exon_id_1, m.exon_id_2
                if m.similarity < min_similarity: continue
                # keep only the most similar map per (species, human exon)
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            one_exon_found = False
            for he in human_exons:
                if map_table[species][he]:
                    one_exon_found = True
                    break
            if not one_exon_found:
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq = get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens'])
            if len (he.pepseq) < 3: # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in map_table.keys():
                if species =='homo_sapiens': continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found: bad_he.append(he)
        human_exons = filter (lambda he: not he in bad_he, human_exons)

        # keep track of nearest neighbors for each human exon
        # (note: the local name `next` shadows the builtin inside this function)
        previous = {}
        next = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            if prev: next[prev] = he
            prev = he
        next[he] = None

        # fill, starting from the species that are nearest to the human
        if not map_table.keys():
            continue # whatever

        # NOTE(review): `species` here is whatever value the loops above left
        # behind, not necessarily 'homo_sapiens' as the variable name and the
        # comment above suggest - confirm the intended reference species.
        species_sorted_from_human = species_sort(cursor,map_table.keys(),species)[1:]

        for species in species_sorted_from_human:
            print species
            # see which exons have which neighbors
            #if verbose: print he.exon_id, species
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning: # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                # exon is missing (or flagged) in this species: classify it by
                # which of its neighbouring human exons have a map here
                prev = previous[he]
                nxt = next[he]
                if prev and nxt and map_table[species][prev] and map_table[species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map: continue # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right: continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id (cursor, ensembl_db_name[species], exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates (cursor, gene_id, ensembl_db_name[species])
            if not gene_coords: continue
            [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file
                # get previous region
                prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # get following region
                next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he])
                if not next_seq_region: continue

                sought += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                  species, gene_id, gene_coords, prev_seq_region,
                                  next_seq_region, template_info, mitochondrial, method)
                if reply=='NNN': unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if m and m.warning: continue

                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # get following region
                if not next_seq_region:
                    next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he])
                if not next_seq_region: continue
                # otherwise it is the last thing we found

                # the previous region is eyeballed from the next one
                # the previous and the next region frame the search region
                prev_seq_region = left_region (next_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                  species, gene_id, gene_coords, prev_seq_region,
                                  next_seq_region, template_info, mitochondrial, method)
                if reply=='NNN': unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if m and m.warning: continue

                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # get preceding region
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # otherwise it is the last thing we found

                # the following region is eyeballed from the previous
                next_seq_region = right_region (prev_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                  species, gene_id, gene_coords, prev_seq_region,
                                  next_seq_region, template_info, mitochondrial, method)
                if reply=='NNN': unsequenced += 1

            print species, "sought", sought, " unseq", unsequenced
def main():
    """Write a FASTA alignment for one exon given on the command line.

    Command line: ``<exon_id> <exon_known> <species> <output name> [nt]``.
    The exon is mapped to its human cognate (unless the species is already
    human) and ``make_exon_alignment`` is asked for the alignment, which is
    written with the species sorted by ``species_sort``.  When no alignment
    is available the output holds the exon's own sequence only; when that is
    missing too, or the exon is non-coding and ``nt`` was not requested, an
    empty output file is created.

    Exits with status 1 on a short argument list; otherwise returns None.
    """
    # exon_id comes from the command line
    if len(sys.argv) < 5:
        print("Usage: %s <exon_id> <exon_known> <species> <output name> [nt]" % sys.argv[0])
        sys.exit(1)

    # int() promotes to an arbitrary-size integer where needed, so the
    # Python 2-only long() is not required
    exon_id = int(sys.argv[1])
    exon_known = int(sys.argv[2])
    species = sys.argv[3]
    afa_name = sys.argv[4]
    nt = len(sys.argv) > 5 and sys.argv[5] == 'nt'

    def make_empty_file():
        # Same effect as the former `touch <name>` for a missing file, without
        # passing a command-line argument through a shell; an existing file
        # keeps its contents.
        open(afa_name, "a").close()

    ######################################
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to a configuration file kept out of version control.
    db = connect_to_mysql(user="******", passwd="sqljupitersql",
                          host="jupiter.private.bii", port=3307)
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)

        if not is_coding_exon(cursor, exon_id, exon_known, ensembl_db_name[species]) and not nt:
            make_empty_file()
            return

        ######################################
        human_exon_id = None
        human_exon_known = None
        if species == 'homo_sapiens':
            [human_exon_id, human_exon_known] = [exon_id, exon_known]
            ok = True
        else:
            # find the human exon this guy maps to
            ok = False
            species_db_id = species2genome_db_id(cursor, species)
            if species_db_id:
                [human_exon_id, human_exon_known] = find_human_cognate(
                    cursor, ensembl_db_name, exon_id, exon_known, species_db_id)
                # bool() first, so a missing (None) cognate never reaches the
                # numeric comparison
                ok = species_db_id > 0 and bool(human_exon_id) and human_exon_id > 0

        ######################################
        alignment = None
        if ok:
            alignment = make_exon_alignment(cursor, ensembl_db_name, human_exon_id,
                                            human_exon_known, nt)

        if alignment:
            # sort the remaining species taxonomically
            sorted_species = species_sort(cursor, all_species, species)
            sorted_names = sort_names(sorted_species, alignment)
            output_fasta(afa_name, sorted_names, alignment)
        else:
            # make file consisting of the original sequence only
            if nt:
                # the nucleotide sequence is the last item returned; guard
                # against an empty result instead of indexing it blindly
                exon_seqs = get_exon_seqs(cursor, exon_id, exon_known, ensembl_db_name[species])
                seq = exon_seqs[-1] if exon_seqs else None
            else:
                seq = get_exon_pepseq(cursor, exon_id, exon_known, ensembl_db_name[species])

            if seq:
                sequence_name = make_seq_name(cursor, ensembl_db_name, species,
                                              exon_id, exon_known, [])
                alignment = {sequence_name: seq}
                # fixed: this used the undefined name `seq_name`
                output_fasta(afa_name, [sequence_name], alignment)
            else:
                # if not even the original sequence can be found, it's
                # definitely somebody else's fault; make an empty file
                make_empty_file()
    finally:
        # single cleanup point for every exit path, including exceptions
        cursor.close()
        db.close()

    return
def main():
    """Build a protein alignment of a human gene and its orthologues.

    Command line: ``<gene symbol> [trivial] [prepend]``.  The canonical
    translation of the human gene and of each cognate gene listed in the
    ``orthologues`` table is written to ``<gene>.orthos.fasta``; the file is
    aligned with the program named by ``Config.muscle`` and the aligned
    sequences are reordered by ``species_sort`` into ``<gene>.orthos.afa``.

    ``trivial`` switches the output to trivial species names, ``prepend``
    prefixes sequence names with the gene symbol.

    Returns:
        True on completion.  Prints usage and exits when no gene symbol is given.
    """
    import shlex  # local import: only needed to quote a file name for bash

    if len(sys.argv) < 2:
        print("usage: %s <gene symbol> [trivial] [prepend]" % sys.argv[0])
        print(
            "trivial = use trivial species name; prepend = prepend gene name")
        # sys.exit rather than the site-provided exit(); exit status unchanged
        sys.exit()

    gene_name = sys.argv[1]
    trivial = "trivial" in sys.argv
    prepend = "prepend" in sys.argv  # prepends gene symbol to gene name
    ref_species = 'homo_sapiens'  # the orthologue table is filled only here, for the moment

    out_fasta = f"{gene_name}.orthos.fasta"
    out_afa = f"{gene_name}.orthos.afa"
    tmpfile = "tmp.fa"
    tmp_afa = "tmp.afa"
    logfile = "tmp.log"
    # start from a clean slate: drop leftovers of a previous run
    for fnm in [out_fasta, out_afa, tmpfile, logfile]:
        if os.path.exists(fnm):
            os.remove(fnm)
    home = os.getcwd()

    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    try:
        # NOTE(review): the gene symbol from the command line is interpolated
        # into SQL unescaped; switch to a parameterized query if
        # hard_landing_search supports one.
        qry = "select ensembl_gene_id from identifier_maps.hgnc where approved_symbol='%s'" % gene_name
        ensembl_stable_gene_id = hard_landing_search(cursor, qry)[0][0]

        [all_species, ensembl_db_name] = get_species(cursor)

        switch_to_db(cursor, ensembl_db_name[ref_species])
        qry = "select gene_id from gene where stable_id='%s'" % ensembl_stable_gene_id
        gene_id = hard_landing_search(cursor, qry)[0][0]
        ref_stable_transl_id = gene2stable_canon_transl_id(
            cursor, gene_id, ensembl_db_name[ref_species])
        write_to_fasta(home, ref_species, ref_stable_transl_id, tmpfile, logfile,
                       out_fasta)
        print(gene_name, ensembl_stable_gene_id, gene_id, ref_stable_transl_id)

        species_in_the_almt = [ref_species]
        qry = "select cognate_gene_id, cognate_genome_db_id from orthologues where gene_id=%d" % gene_id
        # `or []`: tolerate an empty/None result when the gene has no
        # orthologues (assumes the helper may return a falsy value - TODO confirm)
        for line in error_intolerant_search(cursor, qry) or []:
            [cognate_gene_id, cognate_genome_db_id] = line
            qry = f"select db_name from exolocator_meta.db_names where genome_db_id={cognate_genome_db_id}"
            db_name = hard_landing_search(cursor, qry)[0][0]
            stable_transl_id = gene2stable_canon_transl_id(cursor, cognate_gene_id,
                                                           db_name)
            # species name is the part of the database name preceding "core"
            species = db_name.split("core")[0].rstrip("_")
            if species not in all_species:
                continue
            print(db_name, species, cognate_gene_id, stable_transl_id)
            ok = write_to_fasta(home, species, stable_transl_id, tmpfile, logfile,
                                out_fasta)
            if ok:
                species_in_the_almt.append(species)

        if os.path.exists(tmpfile):
            os.remove(tmpfile)

        # the output name is derived from a command-line argument, so quote it
        # before handing the command line to bash
        cmd = f"{Config.muscle} -in {shlex.quote(out_fasta)} -out {tmp_afa}"
        subprocess.call(["bash", "-c", cmd])

        species_sorted = species_sort(cursor, species_in_the_almt, ref_species)
        trivial_names = get_trivial(cursor, species_sorted) if trivial else None
        name_prefix = gene_name if prepend else None
        reorder_seqs(tmp_afa, species_sorted, out_afa, trivial_names, name_prefix)
        if os.path.exists(tmp_afa):
            os.remove(tmp_afa)
    finally:
        # release the database handles even if a lookup or the alignment fails
        cursor.close()
        db.close()

    return True