def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) cursor.close() db.close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format( host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format( credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): no_threads = 1 db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) # afs is killing me here ... in_path = cfg.get_path('afs_dumps')+"/exons" if (not os.path.exists(in_path)): print in_path, "not found" cursor.close() db .close() ############### os.chdir(in_path) filenames = glob.glob("*exon_dump.txt") parallelize (no_threads, load_from_infiles, filenames, in_path)
def main (): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) cursor.close() db .close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format(host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def main (): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): no_threads = 1 db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('resources') if (not os.path.exists(in_path)): print in_path, "not found" ############### if not check_table_exists (cursor, db_name, 'name_resolution'): make_name_resolution_table (cursor) ############### os.chdir(in_path) filenames = glob.glob("*name_resolution.txt") for infile in filenames: store (cursor, in_path, infile) ############### cursor.close() db .close()
def main(): db = connect_to_mysql() cr = ConfigurationReader() cursor = db.cursor() fasta_path = cr.get_path('ensembl_fasta') [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: #for species in ['danio_rerio']: print species dna_path = "{0}/{1}/dna".format(fasta_path, species) if (not os.path.exists(dna_path)): print "problem:", dna_path, "not found" exit(1) fasta_files = [] for r,d,files in os.walk(dna_path): for file in files: if (not file[-3:] == ".fa"): continue fasta_files.append(file) name2file = {} for file in fasta_files: print dna_path, file cmd = "grep '>' {0}/{1}".format(dna_path, file) ret = commands.getoutput(cmd) headers = ret.split("\n") print "number of headers: ", len(headers) for hdr in headers: fields = hdr.split(" ") name = fields[0].replace (">", "") #print name if (not name2file.has_key(name)): name2file[name] = [] name2file[name].append(file) qry = "use "+ensembl_db_name[species] search_db (cursor, qry) for name in name2file.keys(): file_names = "" for file in name2file[name]: if file_names: file_names += " " file_names += file store_seq_filenames (cursor, name, file_names) cursor.close() db .close()
def main(): if len(sys.argv) < 5: print "Usage: %s <species> <exon_id> <exon_known> <output_name_root>" % sys.argv[ 0] exit(1) species = sys.argv[1] exon_id = int(sys.argv[2]) exon_known = int(sys.argv[3]) output_fnm_root = sys.argv[4] db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) sorted_species = species_sort(cursor, all_species, species) reconstruct_alignment(cursor, cfg, ensembl_db_name, species, exon_id, exon_known, sorted_species, output_fnm_root) cursor.close() db.close() return True
def annotate(gene_list, db_info): # [local_db, all_species, ensembl_db_name, species] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() if verbose: print "thread %s annotating %s " % (get_thread_name(), species) if not species == 'oryctolagus_cuniculus': print 'The preferred list of species is hardcoded for the rabbit. Consider modifying.' exit(1) preferred_species = [ species, 'mus_musculus', 'rattus_norvegicus', 'homo_sapiens' ] nearest_species_list = species_sort(cursor, all_species, species) species_list = preferred_species + filter( lambda x: x not in preferred_species, nearest_species_list) inf = erropen("temp_out.fasta", "w") for gene_id in gene_list: #for gene_id in [90020]: switch_to_db(cursor, ensembl_db_name[species]) #################### # get stable id and description of this gene stable_id = gene2stable(cursor, gene_id) if not gene_list.index(gene_id) % 100: print gene_list.index(gene_id), "out of", len(gene_list) if verbose: print "=============================================" if verbose: print gene_id, stable_id #################### # find the annotation from the preferred source organism [annot_source, orthology_type, annotation, ortho_stable_ids] = find_annotation(cursor, ensembl_db_name, species_list, gene_id) if verbose: print annot_source, "**", orthology_type, '**', annotation ################### # find splices (for now find the canonical splice) switch_to_db(cursor, ensembl_db_name[species]) canonical_splice = get_canonical_transl(acg, cursor, gene_id, species) # output if orthology_type == 'self' or annotation == 'none': header = ">{0} {1}".format(stable_id, annotation) else: header = ">{0} {1} [by sim to {2}, {3}]".format( stable_id, annotation, annot_source, ortho_stable_ids) print >> inf, header print >> inf, canonical_splice cursor.close() db.close()
def main():
    """Driver for ``find_missing_exons``: pick a gene set and dispatch it.

    Command line (optional, all three or none):
        <set name> <number of threads> <method>

    ``set name`` is lower-cased; 'none' selects all known protein coding
    human genes, 'complement' selects ``get_complement_ids`` and any other
    name goes to ``get_theme_ids``.  ``method`` must be 'usearch' or
    'sw_sharp'; 'sw_sharp' forces a single thread.

    Returns:
        True after ``parallelize`` returns.
    """
    special = 'test'
    no_threads = 10
    method = 'usearch'
    # `local_db` was passed to parallelize() below without ever being
    # assigned in this function; the sibling drivers set it to False.
    local_db = False

    if len(sys.argv) > 1 and len(sys.argv) < 4:
        print("usage: %s <set name> <number of threads> <method>" % sys.argv[0])
        exit(1)
    elif len(sys.argv) == 4:
        special = sys.argv[1]
        special = special.lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])
        method = sys.argv[3]

    if not (method == 'usearch' or method == 'sw_sharp'):
        print("unrecognized method: ", method)
        exit(1)

    # sw_sharps chokes if there is only one graphics card
    if method == 'sw_sharp':
        no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    print('=======================================')
    print(sys.argv[0])
    if special:
        print("using", special, "set")
        if special == 'complement':
            gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
        else:
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print("using all protein coding genes")
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    cursor.close()
    db.close()

    parallelize(no_threads, find_missing_exons, gene_list,
                [local_db, ensembl_db_name, method])

    return True
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db (cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db (cursor, qry) create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db .close()
def dump_orthos (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) # in the afa headers use 'trivial' names for the species: cow, dog, pig, ... trivial_name = translate_to_trivial(cursor, all_species) out_path = cfg.get_path('afs_dumps') outfile = "{0}/orthologue_dump.txt".format(out_path) print outfile of = erropen (outfile,"w") species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) qry = "select * from orthologue" rows = search_db (cursor, qry) for row in rows: [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] = row species = genome_db_id2species (cursor, genome_db_id) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable_id = gene2stable(cursor, human_gene_id) switch_to_db (cursor, ensembl_db_name[species]) cognate_stable_id = gene2stable(cursor, cognate_gene_id) print >>of, orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]]) of.close() cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db(cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db(cursor, qry) create_index(cursor, db_name, 'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db.close()
def main():
    """Run ``alt_splice_almt`` for human and mouse."""
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    # human and mouse are the only two species that have CCDs info
    species_with_ccds = ('homo_sapiens', 'mus_musculus')
    for species in species_with_ccds:
        alt_splice_almt(cursor, cfg, acg, species, ensembl_db_name)

    cursor.close()
    db.close()
def main(): no_threads = 1 special = 'test' if len(sys.argv) > 1 and len(sys.argv) < 3: print "usage: %s <set name> <number of threads>" % sys.argv[0] exit(1) elif len(sys.argv) == 3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db(cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize(no_threads, exon_cleanup, gene_list, [local_db, ensembl_db_name]) return True
def make_alignments(species_list, db_info):
    """Delete files older than 60 days from each species' "pep" directory.

    For every species in ``species_list`` the directory returned by
    ``check_directory(cfg, species, shorthand, "pep")`` is walked and every
    file whose modification time is more than ``max_days`` in the past is
    removed.  A per-species summary of removed and remaining files is
    printed.

    Args:
        species_list: species names to process.
        db_info: ``[local_db, ensembl_db_name]``; the database names are
            re-read from ``get_species`` afterwards.

    Fixes relative to the original: ``removed`` is now actually counted
    (it was always reported as 0), and the cursor and connection are
    closed at the end.  The unused ``verbose`` / ``flank_length`` locals
    were dropped.
    """
    [local_db, ensembl_db_name] = db_info

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    # unused below; the constructor call is kept in case it has side effects
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    max_days = 60
    seconds_per_day = 60*60*24

    for species in species_list:
        species_shorthand = get_species_shorthand(cursor, species)
        print(species, species_shorthand)
        directory = check_directory(cfg, species, species_shorthand, "pep")
        if not directory:
            continue

        removed = 0
        remaining = 0
        for dirname, dirnames, filenames in os.walk(directory):
            for filename in filenames:
                full_name = os.path.join(dirname, filename)
                time_modified = os.path.getmtime(full_name)
                number_of_days_since_modified = \
                    (time.time() - time_modified)/seconds_per_day
                if number_of_days_since_modified > max_days:
                    os.remove(full_name)
                    removed += 1
                else:
                    remaining += 1

        print(species, "done, removed", removed, "files, remaining", remaining)

    cursor.close()
    db.close()
def main(): no_threads = 1 local_db = False db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) outdir = "{0}/para_dump".format(cfg.dir_path['afs_dumps']) print outdir if not os.path.exists(outdir): print outdir, "not found" exit(1) # exit after dir existence check cursor.close() db.close() parallelize(no_threads, dump_paralogues, all_species, [local_db, ensembl_db_name, outdir]) return True
def main(): species = 'oryctolagus_cuniculus' no_threads = 1 if len(sys.argv) > 1 and len(sys.argv) < 3: print "usage: %s <species> <number of threads>" % sys.argv[0] exit(1) elif len(sys.argv) == 3: species = sys.argv[1].lower() no_threads = int(sys.argv[2]) local_db = False db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) ensembl_db_name['compara'] = get_compara_name(cursor) print print "running %s for %s " % (sys.argv[0], species) print switch_to_db(cursor, ensembl_db_name[species]) gene_list = get_gene_ids(cursor, biotype='protein_coding') cursor.close() db.close() parallelize(no_threads, annotate, gene_list, [local_db, all_species, ensembl_db_name, species]) return True
def dump_exons (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit (1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen (outfile,"w") if not of: continue switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct%1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known==2: analysis = 'sw_sharp' elif exon.is_known==3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor,gene_id) if ( exon.is_known == 1): exon_stable_id = exon2stable(cursor,exon.exon_id) elif ( exon.is_known == 2): exon_stable_id = 'sw_sharp_'+str(exon.exon_id) elif ( exon.is_known == 3): exon_stable_id = 'usearch_'+str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring (exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db .close()
def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) mammals = [ 'ailuropoda_melanoleuca', 'bos_taurus', 'callithrix_jacchus', 'canis_familiaris', 'cavia_porcellus', 'choloepus_hoffmanni', 'dasypus_novemcinctus', 'dipodomys_ordii', 'echinops_telfairi', 'equus_caballus', 'erinaceus_europaeus', 'felis_catus', 'gorilla_gorilla', 'ictidomys_tridecemlineatus', 'loxodonta_africana', 'macaca_mulatta', 'macropus_eugenii', 'microcebus_murinus', 'monodelphis_domestica', 'mus_musculus', 'mustela_putorius_furo', 'myotis_lucifugus', 'nomascus_leucogenys', 'ochotona_princeps', 'ornithorhynchus_anatinus', 'oryctolagus_cuniculus', 'otolemur_garnettii', 'pan_troglodytes', 'pongo_abelii', 'procavia_capensis', 'pteropus_vampyrus', 'rattus_norvegicus', 'sarcophilus_harrisii', 'sorex_araneus', 'sus_scrofa', 'tarsius_syrichta', 'tupaia_belangeri', 'tursiops_truncatus', 'vicugna_pacos' ] for species in all_species: switch_to_db(cursor, ensembl_db_name[species]) print print "** species: ", species known_genes = get_gene_ids(cursor, biotype='protein_coding', is_known=1) print "\t known genes: ", len(known_genes) predicted_genes = get_gene_ids(cursor, biotype='protein_coding', is_known=0) print "\t predicted genes: ", len(predicted_genes) # sanity check: all_genes = get_gene_ids(cursor, biotype='protein_coding') print "\t all genes: ", len(all_genes) # alternative alelles crap (it is crap bcs these are nto # always alleles, but may be different version of a gene on a 'patch' qry = "select count(distinct alt_allele_group_id) from alt_allele" rows = search_db(cursor, qry) print "\t number of allele groups: ", rows[0][0] ortho_stats(cursor, ensembl_db_name[species], species, all_genes) # how often does it happen that one exon does not have # a map while the others do # how many of those can actually be found, and how many are gaps in the seqeunces # (are the gaps in the sequence commesurat withe the coverage?) 
# ow many are at the scaffold boundary? # how many of those cane be patched by brute force? # which sequences are more patchable? # what are the lessons learned (1) about biology, (2) about tchnology/tools that # we need to find the missing exons? # cursor.close() db.close()
def main():
    """Report how current the per-gene afa files are and why some are stale.

    First pass: every known, reference, protein coding human gene is
    classified by calling ``check_afa_age`` with increasing ``max_days``
    (30, 300, 1000).  Genes matching at 300 or 1000 days are collected in
    ``failed_afas``; genes matching none of the three are not counted.

    Second pass: for each gene in ``failed_afas`` the canonical coding
    exons are fetched; genes without any are recorded in ``no_exon_ids``.
    The orthologue check that follows is switched off (``if False``), so
    ``cases_with_no_orthos`` stays 0.

    Finally the counters are printed, followed by the exons of every gene
    in ``no_exon_ids``.
    """
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    print "using all protein coding genes"
    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    # only used inside the disabled block below
    min_similarity = cfg.get_value('min_accptbl_exon_sim')
    flank_length = 10
    gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True)

    new_afas = 0
    old_afas = 0
    ancient_afas = 0
    failed_afas = []
    for gene_id in gene_list:
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        stable_id = gene2stable(cursor, gene_id)
        # presumably "new" means the afa file is younger than max_days -
        # TODO confirm against check_afa_age
        if check_afa_age(cfg, stable_id, max_days=30) == "new":
            new_afas += 1
            continue
        elif check_afa_age(cfg, stable_id, max_days=300) == "new":
            old_afas += 1
            failed_afas.append(gene_id)
            continue
        elif check_afa_age(cfg, stable_id, max_days=1000) == "new":
            ancient_afas += 1
            failed_afas.append(gene_id)
            continue

    no_exons = 0
    cases_with_no_orthos = 0
    no_exon_ids = []
    for gene_id in failed_afas:
        # progress report every 10th gene; the trailing comma keeps both
        # prints on one output line
        if ((failed_afas.index(gene_id)) % 10 == 0):
            print failed_afas.index(gene_id), "out of ", len(failed_afas),
            print " no orthos: ", cases_with_no_orthos

        canonical_human_exons = get_canonical_coding_exons(cursor, gene_id, ensembl_db_name['homo_sapiens'])
        if not canonical_human_exons:
            no_exon_ids.append(gene_id)
            no_exons += 1
            continue

        # NOTE(review): disabled; the block extent is reconstructed from the
        # data flow (alnmt_pep is only defined inside it) - confirm.
        if False:
            # reconstruct per-exon alignments with orthologues
            mitochondrial = is_mitochondrial(cursor, gene_id)
            [alnmt_pep, alnmt_dna] = make_exon_alignments(cursor, ensembl_db_name, canonical_human_exons, mitochondrial, min_similarity, flank_length)
            no_orthos = True
            for human_exon, almt in alnmt_pep.iteritems():
                # a string entry or a single-key alignment is skipped;
                # anything else clears no_orthos
                if (type(almt) is str or len(almt.keys()) <= 1):
                    continue
                no_orthos = False
                break
            if no_orthos:
                cases_with_no_orthos += 1
                continue

    print
    print "total genes", len(gene_list)
    print "new afas", new_afas
    print "old afas", old_afas
    print "ancient afas", ancient_afas
    print
    print "failure cases"
    print "\t no exons", no_exons
    print "\t no orthologues ", cases_with_no_orthos
    print
    # list the exons of every gene for which no canonical coding exon was found
    for gene_id in no_exon_ids:
        print gene_id
        for exon in gene2exon_list(cursor, gene_id):
            print "\t", exon.is_canonical, exon.is_coding

    cursor.close()
    db.close()
def main():
    """Summarize the age of the afa files and list genes lacking coding exons.

    Pass 1 walks all known, reference, protein coding human genes and
    calls ``check_afa_age`` with ``max_days`` of 30, 300 and 1000 in turn;
    the first call that returns "new" decides the bucket (new / old /
    ancient).  Old and ancient genes go into ``failed_afas``; a gene for
    which none of the calls returns "new" is not counted in any bucket.

    Pass 2 fetches the canonical coding exons of every gene in
    ``failed_afas`` and records those that have none.  The orthologue
    analysis after that is disabled with ``if False``, so the
    "no orthologues" counter is always printed as 0.
    """
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    print "using all protein coding genes"
    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    # min_similarity and flank_length feed only the disabled block below
    min_similarity = cfg.get_value('min_accptbl_exon_sim')
    flank_length = 10
    gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True)

    new_afas = 0
    old_afas = 0
    ancient_afas = 0
    failed_afas = []
    for gene_id in gene_list:
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        stable_id = gene2stable(cursor, gene_id)
        # NOTE(review): "new" presumably means "modified within max_days" -
        # verify in check_afa_age
        if check_afa_age(cfg, stable_id, max_days=30) == "new":
            new_afas += 1
            continue
        elif check_afa_age(cfg, stable_id, max_days=300) == "new":
            old_afas += 1
            failed_afas.append(gene_id)
            continue
        elif check_afa_age(cfg, stable_id, max_days=1000) == "new":
            ancient_afas += 1
            failed_afas.append(gene_id)
            continue

    no_exons = 0
    cases_with_no_orthos = 0
    no_exon_ids = []
    for gene_id in failed_afas:
        # progress line for every 10th position in failed_afas
        if ((failed_afas.index(gene_id)) % 10 == 0):
            print failed_afas.index(gene_id), "out of ", len(failed_afas),
            print " no orthos: ", cases_with_no_orthos

        canonical_human_exons = get_canonical_coding_exons(cursor, gene_id, ensembl_db_name['homo_sapiens'])
        if not canonical_human_exons:
            no_exon_ids.append(gene_id)
            no_exons += 1
            continue

        # NOTE(review): switched off; everything down to the final
        # `continue` is assumed to belong to this branch because alnmt_pep
        # exists only here - confirm against the original layout.
        if False:
            # reconstruct per-exon alignments with orthologues
            mitochondrial = is_mitochondrial(cursor, gene_id)
            [alnmt_pep, alnmt_dna] = make_exon_alignments(cursor, ensembl_db_name, canonical_human_exons, mitochondrial, min_similarity, flank_length)
            no_orthos = True
            for human_exon, almt in alnmt_pep.iteritems():
                # skip string entries and alignments with at most one key
                if (type(almt) is str or len(almt.keys()) <= 1):
                    continue
                no_orthos = False
                break
            if no_orthos:
                cases_with_no_orthos += 1
                continue

    print
    print "total genes", len(gene_list)
    print "new afas", new_afas
    print "old afas", old_afas
    print "ancient afas", ancient_afas
    print
    print "failure cases"
    print "\t no exons", no_exons
    print "\t no orthologues ", cases_with_no_orthos
    print
    # per-gene detail: canonical/coding flags of all exons of the genes
    # that had no canonical coding exon
    for gene_id in no_exon_ids:
        print gene_id
        for exon in gene2exon_list(cursor, gene_id):
            print "\t", exon.is_canonical, exon.is_coding

    cursor.close()
    db.close()
def exon_cleanup(gene_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids and common names for each species db all_species, ensembl_db_name = get_species(cursor) mammals = [ 'ailuropoda_melanoleuca', 'bos_taurus', 'callithrix_jacchus', 'canis_familiaris', 'cavia_porcellus', 'choloepus_hoffmanni', 'dasypus_novemcinctus', 'dipodomys_ordii', 'echinops_telfairi', 'equus_caballus', 'erinaceus_europaeus', 'felis_catus', 'gorilla_gorilla', 'ictidomys_tridecemlineatus', 'loxodonta_africana', 'macaca_mulatta', 'macropus_eugenii', 'microcebus_murinus', 'monodelphis_domestica', 'mus_musculus', 'mustela_putorius_furo', 'myotis_lucifugus', 'nomascus_leucogenys', 'ochotona_princeps', 'ornithorhynchus_anatinus', 'oryctolagus_cuniculus', 'otolemur_garnettii', 'pan_troglodytes', 'papio_anubis', 'pongo_abelii', 'procavia_capensis', 'pteropus_vampyrus', 'rattus_norvegicus', 'sarcophilus_harrisii', 'sorex_araneus', 'sus_scrofa', 'tarsius_syrichta', 'tupaia_belangeri', 'tursiops_truncatus', 'vicugna_pacos' ] tot = 0 tot_ok = 0 for human_gene_id in gene_list: switch_to_db(cursor, ensembl_db_name['homo_sapiens']) stable_id = gene2stable(cursor, human_gene_id) description = get_description(cursor, human_gene_id) mitochondrial = is_mitochondrial(cursor, human_gene_id) #print "#############################################" #print human_gene_id, stable_id, get_description (cursor, human_gene_id) human_exons = get_ok_human_exons(cursor, ensembl_db_name, human_gene_id) for human_exon in human_exons: [ exon_seq_id, human_protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq ] = get_exon_seqs(cursor, human_exon.exon_id, 1, ensembl_db_name['homo_sapiens']) human_exon_phase = get_exon_phase(cursor, human_exon.exon_id, 1) first_exon = (human_exons.index(human_exon) == 0) for species in mammals: # maxentscan does not work for fish for table in 
['sw_exon', 'usearch_exon']: switch_to_db(cursor, ensembl_db_name[species]) qry = "select * from %s where maps_to_human_exon_id = %d " % ( table, human_exon.exon_id) novel_exons = search_db(cursor, qry) if not novel_exons: #print "human_exon: ", human_exon.exon_id, "no", table, "for", species continue ct = 0 ok = 0 for novel_exon in novel_exons: print "%s: novel exon found in table %s, mapping to human exon %s" % \ (species, table, exon2stable (cursor, human_exon.exon_id, ensembl_db_name['homo_sapiens']) ) ct += 1 has_stop = False has_NNN = False [ novel_exon_id, gene_id, start_in_gene, end_in_gene, maps_to_human_exon_id, exon_seq_id, template_exon_seq_id, template_species, strand, phase, end_phase, has_NNN, has_stop, has_3p_ss, has_5p_ss ] = novel_exon tot += 1 exon_seqs = get_exon_seq_by_db_id( cursor, exon_seq_id, ensembl_db_name[species]) if not exon_seqs: print "exon seqs not found" continue [ exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq ] = exon_seqs len_ok = (pepseq_transl_end - pepseq_transl_start) == len(dna_seq) if not len_ok: # if it is not the case, then make it be so left_flank += dna_seq[:pepseq_transl_start] right_flank = dna_seq[ pepseq_transl_end:] + right_flank dna_seq = dna_seq[ pepseq_transl_start:pepseq_transl_end] pepseq_transl_start = 0 pepseq_transl_end = len(dna_seq) phase_ok = (len(dna_seq) % 3 == 0) if not phase_ok: phase = len(dna_seq) % 3 cds = dna_seq[phase:] pepseq_corrected = Seq(cds).translate().tostring() if pepseq_corrected == protein_seq: left_flank += dna_seq[:phase] dna_seq = dna_seq[phase:] else: cds = dna_seq[:-phase] pepseq_corrected = Seq( cds).translate().tostring() if pepseq_corrected == protein_seq: right_flank += dna_seq[ -phase:] + right_flank dna_seq = dna_seq[:-phase] else: print "no match ..." 
continue # don't want to shut-off the pipeline here pepseq_transl_start = 0 pepseq_transl_end = len(dna_seq) # retrieve the template template_db_id = species2genome_db_id( cursor, template_species) [templ_exon_seq_id, templ_protein_seq, templ_pepseq_transl_start, templ_pepseq_transl_end, templ_left_flank, templ_right_flank, templ_dna_seq] \ = get_exon_seq_by_db_id (cursor, template_exon_seq_id, ensembl_db_name[template_species]) correction = 0 phase = 0 end_phase = 0 # if this is the first exon, check if we are starting from methionine if first_exon: [left_flank_ok, correction, phase] = \ check_translation_start (mitochondrial, left_flank, dna_seq, templ_dna_seq, templ_protein_seq) # see if the left splice site is ok else: [left_flank_ok, correction, phase, max_score] = \ check_left_flank (acg, left_flank, dna_seq, templ_dna_seq) ######################## # # see if the right splice site is ok [right_flank_ok, end_correction, end_phase, end_max_score] = \ check_right_flank(acg, right_flank, dna_seq, templ_dna_seq) pepseq_corrected = "" new_left_flank = "" new_right_flank = "" new_dna_seq = "" if left_flank_ok: offset = (3 - phase) % 3 if correction: if correction > 0: new_dna_seq = dna_seq[correction:] new_left_flank = left_flank + dna_seq[: correction] else: # correction is negative, therefore left_flank[correction:] is the tail of left_flank new_dna_seq = left_flank[ correction:] + dna_seq new_left_flank = left_flank[:correction] else: new_left_flank = left_flank pepseq_transl_start = offset else: new_left_flank = left_flank if right_flank_ok: if not new_dna_seq: new_dna_seq = dna_seq if end_correction: if end_correction < 0: new_right_flank = new_dna_seq[ end_correction:] + right_flank new_dna_seq = new_dna_seq[:end_correction] else: # correction is negative, therefore right_flank[correction:] is the tail of right_flank new_right_flank = right_flank[ end_correction:] new_dna_seq += right_flank[:end_correction] else: new_right_flank = right_flank pepseq_transl_end 
= len(new_dna_seq) pepseq_transl_end -= end_phase else: new_right_flank = right_flank # if only one flank is ok, use that side to decide if there is a phase on the other if left_flank_ok and not right_flank_ok: end_phase = (pepseq_transl_end - pepseq_transl_start) % 3 pepseq_transl_end -= end_phase if right_flank_ok and not left_flank_ok: phase = (pepseq_transl_end - pepseq_transl_start) % 3 pepseq_transl_start += phase # check that the lengths match has_stop = None if new_dna_seq: len_old = len(left_flank + dna_seq + right_flank) len_new = len(new_left_flank + new_dna_seq + new_right_flank) if not len_old == len_new: print len_old, len_new print correction, end_correction print map(len, [left_flank, dna_seq, right_flank]) print map(len, [ new_left_flank, new_dna_seq, new_right_flank ]) continue cds = new_dna_seq[ pepseq_transl_start:pepseq_transl_end] if mitochondrial: pepseq_corrected = Seq(cds).translate( table="Vertebrate Mitochondrial").tostring( ) else: pepseq_corrected = Seq( cds).translate().tostring() if '*' in pepseq_corrected: has_stop = 1 else: has_stop = 0 if has_stop and not '*' in protein_seq: continue # abort, abort if True: print "#############################################" print human_gene_id, stable_id, "exo no:", human_exons.index( human_exon), " ", description print species, table print "\t template", template_exon_seq_id, template_species, template_db_id print "\t template left flank", templ_left_flank, templ_dna_seq[ 0:3] print "\t left flank", left_flank, dna_seq[ 0:3] print "\t ", left_flank_ok, correction, phase, if not first_exon: print max_score else: print print "\t template right flank", templ_dna_seq[ -3:], templ_right_flank print "\t right flank", dna_seq[ -3:], right_flank print "\t ", right_flank_ok, end_correction, end_phase, end_max_score print "\t human", human_protein_seq, human_exon.exon_id, human_exon_phase print "\t template", templ_protein_seq print "\t deposited", protein_seq if pepseq_corrected: print "\t corrected", 
pepseq_corrected if new_dna_seq: if (pepseq_transl_end - pepseq_transl_start) % 3: print "length not divisible by 3 " print pepseq_transl_start, pepseq_transl_end print phase, end_phase print len(new_dna_seq) print "%%%%% " continue else: new_dna_seq = dna_seq ######################################################### # 18_find_exons is sometimes messing up the coordinates # I do not know why ret = check_coordinates_in_the_gene( cursor, cfg, acg, ensembl_db_name, species, novel_exon, new_dna_seq) if not ret: print "\t coordinate check failed" continue [start_in_gene_corrected, end_in_gene_corrected] = ret ######################################################### # update the *_exon and exon_seq tables accordingly switch_to_db(cursor, ensembl_db_name[species]) qry = "update %s set " % table set_fields = "" if not start_in_gene_corrected == start_in_gene: if set_fields: set_fields += ", " set_fields += " start_in_gene = %d " % start_in_gene_corrected if not end_in_gene_corrected == end_in_gene: if set_fields: set_fields += ", " set_fields += " end_in_gene = %d " % end_in_gene_corrected if not has_stop is None: if set_fields: set_fields += ", " set_fields += " has_stop = %d" % has_stop if left_flank_ok: if set_fields: set_fields += ", " set_fields += " phase = %d, " % phase if first_exon: set_fields += " has_3p_ss = '%s' " % ( "first exon; starts with M") else: set_fields += " has_3p_ss = '%s' " % ( "me_score=" + str(max_score)) if right_flank_ok: if set_fields: set_fields += ", " set_fields += " end_phase = %d, " % end_phase set_fields += " has_5p_ss = '%s' " % ( "me_score=" + str(end_max_score)) qry += set_fields + " where exon_id=%d" % novel_exon_id if set_fields: search_db(cursor, qry) # update exon sequence if pepseq_corrected: # we might have changed our mind as to what is the cDNA seq, and what is flanking qry = "update exon_seq set " qry += " protein_seq = '%s', " % pepseq_corrected qry += " dna_seq = '%s', " % new_dna_seq qry += " left_flank = '%s', " % 
new_left_flank qry += " right_flank = '%s', " % new_right_flank qry += " pepseq_transl_start = %d, " % pepseq_transl_start qry += " pepseq_transl_end = %d " % pepseq_transl_end table_id = 2 if table == 'novel_exon' else 3 qry += " where exon_id=%d and is_known=%d" % ( novel_exon_id, table_id) search_db(cursor, qry) # gene2exon --> have to go back to 07_gene2exon for that tot_ok += 1 print "gene list done" cursor.close() db.close()
def dump_exons(species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit(1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen(outfile, "w") if not of: continue switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct % 1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known == 2: analysis = 'sw_sharp' elif exon.is_known == 3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor, gene_id) if (exon.is_known == 1): exon_stable_id = exon2stable(cursor, exon.exon_id) elif (exon.is_known == 2): exon_stable_id = 'sw_sharp_' + str(exon.exon_id) elif (exon.is_known == 3): exon_stable_id = 'usearch_' + str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring(exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db.close()
def find_missing_exons(human_gene_list, db_info):
    """Look for exons that are mapped in human but missing in other species.

    For every gene in *human_gene_list* the canonical, known human exons
    are collected and, per species, the most similar map with similarity of
    at least ``min_accptbl_exon_sim`` is tabulated. For each species, every
    human exon that lacks a warning-free map is then passed to ``find_NNN``
    together with the two sequence regions that bracket the search.

    Args:
        human_gene_list: ids of the human genes to process.
        db_info: not used at present; the database names are re-read with
            ``get_species``.

    NOTE(review): ``verbose``, ``method`` and ``MAX_SEARCH_LENGTH`` are not
    defined in this function and are presumably module-level names. ``method``
    used to be unpacked from ``db_info`` -- confirm it is still defined.
    """
    # [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found = 0  # NOTE(review): never incremented, the progress line always shows 0
    sought = 0
    unsequenced = 0
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose:
            print(human_gene_id, human_stable, human_description)

        # progress counter
        gene_ct += 1
        if not gene_ct % 10:
            print("processed ", gene_ct, " out of ", len(human_gene_list), "genes")
            print("exons found: ", found, " out of ", sought, "sought")

        # find all human exons for this gene that we are tracking in the database
        human_exons = [
            e for e in gene2exon_list(cursor, human_gene_id)
            if e.covering_exon < 0 and e.is_canonical and e.is_known
        ]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        ##################################################################################
        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        # per species and human exon keep the map with the highest similarity
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name,
                                         he.exon_id, he.is_known)  # exon data
            for m in maps_for_exon[he]:
                if m.similarity < min_similarity:
                    continue
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            if not any(map_table[species][he] for he in human_exons):
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            he.pepseq = get_exon_pepseq(cursor, he, ensembl_db_name['homo_sapiens'])
            # a missing or very short peptide is treated as Ensembl noise
            if not he.pepseq or len(he.pepseq) < 3:
                bad_he.append(he)
                continue
            one_species_found = False
            for species in list(map_table.keys()):
                if species == 'homo_sapiens':
                    continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found:
                bad_he.append(he)
        human_exons = [he for he in human_exons if he not in bad_he]

        # keep track of nearest neighbors for each human exon
        previous = {}
        following = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            following[he] = None  # overwritten below unless he is the last exon
            if prev:
                following[prev] = he
            prev = he

        # fill, starting from the species that are nearest to the human
        if not map_table:
            continue  # whatever

        # The reference species is spelled out here: the loop variable
        # `species` that used to be passed held whatever species an earlier
        # loop happened to stop on, so the order was not guaranteed to be
        # relative to human.
        # NOTE(review): the [1:] presumably drops the reference species
        # itself -- confirm against species_sort.
        species_sorted_from_human = species_sort(
            cursor, list(map_table.keys()), 'homo_sapiens')[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:
                    # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                prev = previous[he]
                nxt = following[he]
                if prev and nxt and map_table[species][prev] and map_table[species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map:
                continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right:
                continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species],
                                      exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id,
                                             ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id,
                                               ensembl_db_name[species])
            if not gene_coords:
                continue
            # only gene_coords as a whole is used below; the unpacking is kept
            # as a check that four values came back
            [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # previous_ and next_seq_region are of the type Seq_Region
                # defined on the top of the file
                # get previous region
                prev_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species,
                    gene_coords, he, previous[he])
                if not prev_seq_region:
                    continue
                # get following region
                next_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species,
                    gene_coords, he, following[he])
                if not next_seq_region:
                    continue

                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # get following region; once one is found it is reused for
                # the remaining exons of this pass
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, following[he])
                if not next_seq_region:
                    continue

                # the previous region is eyeballed from the next one;
                # the previous and the next region frame the search region
                prev_seq_region = left_region(next_seq_region, MAX_SEARCH_LENGTH)

                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # get previous region; once one is found it is reused for
                # the remaining exons of this pass
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, previous[he])
                if not prev_seq_region:
                    continue

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region, MAX_SEARCH_LENGTH)

                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            print(species, "sought", sought, " unseq", unsequenced)

    # release the connection opened at the top of the function
    cursor.close()
    db.close()
def main():
    """Spot-check the exon maps of 1000 sampled human genes.

    Command line: either no arguments, or ``<set name> <number of threads>``.
    A set name of 'none' (any case) selects all known protein coding genes.
    For each sampled gene the canonical coding exons are counted, together
    with how many of them have at least one map; genes with unmapped exons
    are reported. With the module-level ``verbose`` flag set, every map is
    printed along with the peptide sequence rebuilt from its bitmap.
    """
    no_threads = 1  # parsed from the command line but not used below
    special = None

    # exactly one argument is rejected; three or more fall through to defaults
    if len(sys.argv) > 1 and len(sys.argv) < 3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv) == 3:
        special = sys.argv[1]
        special = special.lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    species = 'homo_sapiens'
    switch_to_db(cursor, ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print "using all protein coding genes"
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list:
    #for gene_id in [743609]:
    # presumably random.choice: the same gene may be drawn more than once,
    # and an empty gene_list would raise here -- TODO confirm the import
    for sampling_count in range(1000):
        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot = 0
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        print gene2stable(cursor, gene_id), get_description(cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False  # set below but never read
        for human_exon in human_exons:
            # only canonical coding exons are counted
            if (not human_exon.is_canonical or not human_exon.is_coding):
                continue
            if verbose:
                print
                print "\t human", human_exon.exon_id, human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon,
                                             ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                            human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print "no maps for exon", human_exon.exon_id
                continue

            if verbose:
                for map in maps:
                    species = map.species_2
                    exon = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(
                        cursor, exon, ensembl_db_name[species])
                    if (map.similarity):
                        print "\t", species, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        # trailing comma: the source is printed on the same line
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            # the number of set bits has to match the number
                            # of residues, otherwise nothing can be rebuilt
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"
                            else:
                                # replay the bitmap: '0' emits a gap, any other
                                # bit consumes the next residue
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0':
                                        reconst_pepseq += '-'
                                    else:
                                        reconst_pepseq += next(usi)
                                print "\tbinary : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print

        # tot is nonzero here: tot == with_map whenever tot is 0
        if not tot == with_map:
            print "#### gene id: %d total exons: %d with map: %d ( = %d%%) " % \
                (gene_id, tot, with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d, incomplete: %d" % (genes_checked, incomplete)
    cursor.close()
    db.close()
    # counts of the last sampled gene only
    print tot, with_map
def find_missing_exons(human_gene_list, db_info): # [local_db, ensembl_db_name, method] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids and common names for each species db all_species, ensembl_db_name = get_species (cursor) # minimal acceptable similarity between exons min_similarity = cfg.get_value('min_accptbl_exon_sim') switch_to_db (cursor, ensembl_db_name['homo_sapiens']) ################################################################################## # loop over human genes gene_ct = 0 found = 0 sought = 0 unsequenced = 0 #human_gene_list.reverse() for human_gene_id in human_gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) # Get stable id and description of this gene -- DEBUG human_stable = gene2stable (cursor, human_gene_id) human_description = get_description(cursor, human_gene_id) if verbose: print human_gene_id, human_stable, human_description # progress counter gene_ct += 1 if (not gene_ct%10): print "processed ", gene_ct, " out of ", len(human_gene_list), "genes" print "exons found: ", found, " out of ", sought, "sought" # find all human exons for this gene that we are tracking in the database human_exons = [e for e in gene2exon_list(cursor, human_gene_id) if e.covering_exon < 0 and e.is_canonical and e.is_known] if not human_exons: print "\t\t no exons found" continue human_exons.sort(key=lambda exon: exon.start_in_gene) for he in human_exons: he.stable_id = exon2stable (cursor, he.exon_id) ################################################################################## ################################################################################## # make 'table' of maps, which is either pointer to the map if it exists, or None map_table = {} for species in all_species: map_table[species] = {} for he in human_exons: map_table[species][he] = None ################# maps_for_exon = {} for he in human_exons: maps_for_exon[he] = get_maps(cursor, 
ensembl_db_name, he.exon_id, he.is_known) # exon data for m in maps_for_exon[he]: #if m.source == 'usearch': continue #if m.source == 'sw_sharp': continue #if m.source == 'sw_sharp': # print 'sw_sharp' #if m.source == 'usearch': # print 'usearch', m.similarity, m.species_2, m.exon_id_1, m.exon_id_2 if m.similarity < min_similarity: continue m_previous = map_table[m.species_2][he] if m_previous and m_previous.similarity > m.similarity: continue map_table[m.species_2][he] = m # get rid of species that do not have the gene at all for species in all_species: one_exon_found = False for he in human_exons: if map_table[species][he]: one_exon_found = True break if not one_exon_found: del map_table[species] # fill in the peptide sequence field for each human exon # get rid of exons that appear in no other species but human (?) bad_he = [] for he in human_exons: one_species_found = False he.pepseq = get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens']) if len (he.pepseq) < 3: # can I ever get rid of all the nonsense I find in Ensembl? 
bad_he.append(he) continue for species in map_table.keys(): if species =='homo_sapiens': continue if map_table[species][he]: one_species_found = True break if not one_species_found: bad_he.append(he) human_exons = filter (lambda he: not he in bad_he, human_exons) # keep track of nearest neighbors for each human exon previous = {} next = {} prev = None for he in human_exons: previous[he] = prev if prev: next[prev] = he prev = he next[he] = None # fill, starting from the species that are nearest to the human if not map_table.keys(): continue # whatever species_sorted_from_human = species_sort(cursor,map_table.keys(),species)[1:] for species in species_sorted_from_human: print species # see which exons have which neighbors #if verbose: print he.exon_id, species no_left = [] no_right = [] has_both_neighbors = [] one_existing_map = None for he in human_exons: m = map_table[species][he] if m and not m.warning: # the one existing map should not be a problematic one one_existing_map = m continue prev = previous[he] nxt = next[he] if prev and nxt and map_table[species][prev] and map_table[species][nxt]: has_both_neighbors.append(he) elif not prev or not map_table[species][prev]: no_left.append(he) elif not nxt or not map_table[species][nxt]: no_right.append(he) if not one_existing_map: continue # this shouldn't happen if not has_both_neighbors and not no_left and not no_right: continue # what is the gene that we are talking about? exon_id = one_existing_map.exon_id_2 is_known = one_existing_map.exon_known_2 gene_id = exon_id2gene_id (cursor, ensembl_db_name[species], exon_id, is_known) # is it mitochondrial? 
mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species]) # where is the gene origin (position on the sequence) gene_coords = get_gene_coordinates (cursor, gene_id, ensembl_db_name[species]) if not gene_coords: continue [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords # fill in exons that have both neighbors: # human exon functions as a coordinate here for he in has_both_neighbors: # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file # get previous region prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he]) if not prev_seq_region: continue # get following region next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he]) if not next_seq_region: continue sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 # work backwards # use the last known region on the left as the bound no_left.reverse() next_seq_region = None for he in no_left: m = map_table[species][he] # check first if we haave already looked into this, and found incomplete region #if m and m.warning: continue # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # get following region if not next_seq_region: next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he]) if not next_seq_region: continue # otherwise it is the last thing we found # the previous region is eyeballed from the next on # the previous and the next region 
frame the search region prev_seq_region = left_region (next_seq_region, MAX_SEARCH_LENGTH) sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 # repeat the whole procedure on the right prev_seq_region = None for he in no_right: m = map_table[species][he] # check first if we haave already looked into this, and found incomplete region #if m and m.warning: continue # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # get following region if not prev_seq_region: prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he]) if not prev_seq_region: continue # otherwise it is the last thing we found # the following region is eyeballed from the previous next_seq_region = right_region (prev_seq_region, MAX_SEARCH_LENGTH) sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 print species, "sought", sought, " unseq", unsequenced
def multiple_exon_alnmt(gene_list, db_info): print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list)) [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # for each human gene gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_orthologues = 0 min_similarity = cfg.get_value('min_accptbl_exon_sim') #gene_list.reverse() for gene_id in gene_list: start = time() gene_ct += 1 if not gene_ct%10: print gene_ct, "genes out of", len(gene_list) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) print gene_ct, len(gene_ids), gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id)) human_exons.sort(key=lambda exon: exon.start_in_gene) ################################################################## for human_exon in human_exons: tot += 1 # find all orthologous exons the human exon maps to maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) if verbose: print "\texon no.", tot, " id", human_exon.exon_id, if not maps: print " no maps" print human_exon print if not maps: no_maps += 1 continue # human sequence to fasta: seqname = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known) if (not pepseq): if verbose and human_exon.is_coding and human_exon.covering_exon <0: # this should be a master exon print "no 
pep seq for", human_exon.exon_id, "coding ", human_exon.is_coding, print "canonical: ", human_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue # collect seq from all maps, and output them in fasta format hassw = False headers = [] sequences = {} exons_per_species = {} for map in maps: switch_to_db (cursor, ensembl_db_name[map.species_2]) if map.similarity < min_similarity: continue exon = map2exon(cursor, ensembl_db_name, map) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue if map.source == 'sw_sharp': exon_known_code = 2 hassw = True elif map.source == 'usearch': exon_known_code = 3 hassw = True else: exon_known_code = map.exon_known_2 seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code) headers.append(seqname) sequences[seqname] = pepseq # for split exon concatenation (see below) if not map.species_2 in exons_per_species.keys(): exons_per_species[map.species_2] = [] exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]); if (len(headers) <=1 ): if verbose: print "single species in the alignment" no_orthologues += 1 continue # concatenate exons from the same gene - the alignment program might go wrong otherwise concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species) fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id) output_fasta (fasta_fnm, sequences.keys(), sequences) # align afa_fnm = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) if (verbose): print 'almt to', afa_fnm # read in the alignment inf = erropen(afa_fnm, "r") aligned_seqs = {} for record in SeqIO.parse(inf, "fasta"): aligned_seqs[record.id] = str(record.seq) inf.close() # split back the concatenated exons if concatenated: split_concatenated_exons (aligned_seqs, concatenated) human_seq_seen = False for seq_name, sequence in 
aligned_seqs.iteritems(): # if this is one of the concatenated seqs, split them back to two ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0'))) # The returned value of tobytes() will be padded at the end # with between zero and seven 0 bits to make it byte aligned. # I will end up with something that looks like extra alignment gaps, that I'll have to return msa_bitmap = bs.tobytes() # Retrieve information on the cognate cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':') if cognate_exon_known == '2': source = 'sw_sharp' elif cognate_exon_known == '3': source = 'usearch' else: source = 'ensembl' if (cognate_species == 'homo_sapiens'): human_seq_seen = True cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens # Write the bitmap to the database #if (cognate_species == 'homo_sapiens'): if verbose: # and (source=='sw_sharp' or source=='usearch'): print "storing" print human_exon.exon_id, human_exon.is_known print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source print sequence if not msa_bitmap: print "no msa_bitmap" continue store_or_update(cursor, "exon_map", {"cognate_genome_db_id":cognate_genome_db_id, "cognate_exon_id":cognate_exon_id ,"cognate_exon_known" :cognate_exon_known, "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); print "tot: ", tot, "ok: ", ok print "no maps ", no_pepseq print "no pepseq ", no_pepseq print "no orthologues ", no_orthologues print
def main(): special = None no_threads = 1 db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) if special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # loop over all genes sw_count = 0 tot_count = 0 for human_gene_id in gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable = gene2stable (cursor, human_gene_id) human_description = get_description(cursor, human_gene_id) tot_count += 1 #print human_gene_id, human_stable, human_description human_exons = [e for e in gene2exon_list(cursor, human_gene_id, verbose=True) if e.covering_exon < 0 and e.is_canonical and e.is_known] if not human_exons: #print "\t\t", human_stable, "no exons found" continue human_exons.sort(key=lambda exon: exon.start_in_gene) # loop over all exons in this gene maps_for_exon = {} for he in human_exons: he.stable_id = exon2stable (cursor, he.exon_id, ensembl_db_name['homo_sapiens']) he.pepseq = get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens']) # maps cleanup: get rid of maps that have "none" as similarity maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known) # exon data if not maps_for_exon[he]: continue #maps_for_exon[he] = filter (lambda m: m.source == 'sw_sharp' or m.source == 'usearch', # maps_for_exon[he]) maps_for_exon[he] = filter (lambda m: m.source == 'usearch', maps_for_exon[he]) if not maps_for_exon[he]: #print "\t\t", human_stable, "no maps found" continue sw_count += len(maps_for_exon[he]) #break print "tot count: ", tot_count print "sw count: ", sw_count #print "tot count: ", tot_count #print "sw count: ", sw_count cursor.close() db.close()