def collect_orthologues(gene_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) db_human = connect_to_mysql() cursor_human = db_human.cursor() switch_to_db (cursor_human, ensembl_db_name['homo_sapiens']) ensembl_compara_name = get_compara_name(cursor) print ensembl_compara_name db_compara = connect_to_mysql() cursor_compara = db_compara.cursor() switch_to_db (cursor_compara, ensembl_compara_name) ortho_table = {} ortho_table ['ortholog_one2one'] = 'orthologue' ortho_table ['apparent_ortholog_one2one'] = 'orthologue' ortho_table ['possible_ortholog'] = 'unresolved_ortho' ortho_table ['ortholog_one2many'] = 'unresolved_ortho' ortho_table ['ortholog_many2many'] = 'unresolved_ortho' ct = 0 for gene_id in gene_list: ct += 1 # find stable stable_id = gene2stable(cursor_human, gene_id=gene_id) # memebr id refers to entries in compara db member_id = stable2member(cursor_compara, stable_id) #print gene_id, stable_id, member_id if ( not ct%100): print ct , "out of ", len(gene_list) # in compara table, get everything that homology has to say about # the possible orthologues # find all orthologous pairs suggested for this gene for ortho_type in ['ortholog_one2one','possible_ortholog', 'apparent_ortholog_one2one', 'ortholog_one2many','ortholog_many2many']: orthos = get_orthologues(cursor_compara, ortho_type, member_id) if ( orthos): store_orthologues (cursor_human, ortho_table[ortho_type], cursor, all_species, ensembl_db_name, gene_id, orthos) cursor.close() db.close() cursor_human.close() db_human.close() cursor_compara.close() db_compara.close()
def main (): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: print species switch_to_db (cursor, ensembl_db_name[species]) qry = "select seq_region.name, seq_region.file_name from seq_region, gene " qry += " where gene.biotype='protein_coding' and gene.seq_region_id = seq_region.seq_region_id " rows = search_db (cursor, qry) if (not rows): print "\t no seq region info found " continue tot = 0 no_file = 0 for row in rows: [name, file_name] = row #print name, file_name tot += 1 if (not file_name): no_file += 1 print name, file_name #exit (1) print "\t tot seq_regions: ", tot, " no file: ", no_file cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def all_species_all_genes_loop(species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cursor = db.cursor() ##################################### for species in species_list: print print "############################" print species sys.stdout.flush() if not switch_to_db(cursor, ensembl_db_name[species]): return False if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') #for all protein coding genes in a species #for gene_id in [10093105]: for gene_id in gene_ids: # for all exons in the gene exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for gene', gene_id continue #################################### pep_seqs(cursor, gene_id, exons) #################################### if not gene_ids.index(gene_id)%1000: print "%50s: %5.1f%% " % (species, 100*(float( gene_ids.index(gene_id) +1 )/len(gene_ids)) ) sys.stdout.flush() print species, "done" cursor.close() db.close()
def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) cursor.close() db.close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format( host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format( credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def ortologues_for_given_genes_loop (gene_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cursor = db.cursor() ##################################### for gene_id in gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) orthologues = get_orthos (cursor, gene_id, 'orthologue') # get_orthos changes the db pointer switch_to_db (cursor, ensembl_db_name['homo_sapiens']) orthologues += get_orthos (cursor, gene_id, 'unresolved_ortho') for [ortho_gene_id, ortho_species] in [[gene_id,'homo_sapiens']] + orthologues: print ">>> ", ortho_species, ortho_gene_id switch_to_db (cursor, ensembl_db_name[ortho_species]) # for all exons in the gene exons = gene2exon_list(cursor, ortho_gene_id) if (not exons): if verbose: print 'no exons for gene', ortho_gene_id continue ############################## pep_seqs(cursor, ortho_gene_id, exons) #################################### if not gene_list.index(gene_id)%1000: print "%5.1f%% " % (100*(float( gene_list.index(gene_id) +1 )/len(gene_list)) ) sys.stdout.flush() cursor.close() db.close()
def main(): special = None no_threads = 1 db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) total = 0 for species in all_species: print species switch_to_db (cursor, ensembl_db_name[species]) qry = "select count(1) from usearch_exon" rows = search_db (cursor, qry) count = int(rows[0][0]) print "\t usearch exons: ", count total += count qry = "select count(1) from sw_exon" rows = search_db (cursor, qry) count = int(rows[0][0]) print "\t sw exons: ", count total += count print print 'total: ', total cursor.close() db.close()
def main(): special = None no_threads = 1 db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) total = 0 for species in all_species: print species switch_to_db(cursor, ensembl_db_name[species]) qry = "select count(1) from usearch_exon" rows = search_db(cursor, qry) count = int(rows[0][0]) print "\t usearch exons: ", count total += count qry = "select count(1) from sw_exon" rows = search_db(cursor, qry) count = int(rows[0][0]) print "\t sw exons: ", count total += count print print 'total: ', total cursor.close() db.close()
def main (): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) cursor.close() db .close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format(host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def main():
    """Hand the list of all species to dump_orthos() via parallelize()."""
    no_threads = 1

    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    # the connection is only needed for the species lookup; release it
    # before the workers start (it was previously left open)
    cursor.close()
    db.close()

    # 'local_db' is not assigned in this function - presumably a
    # module-level setting; TODO confirm
    parallelize(no_threads, dump_orthos, all_species, [local_db, ensembl_db_name])
def main():
    """Run check_alt_splices() for homo_sapiens."""
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    # human and mouse are the only two species that have CCDs info;
    # only human is checked here
    species = 'homo_sapiens'
    check_alt_splices(cursor, species, ensembl_db_name)

    cursor.close()
    db.close()
def main(): db = connect_to_mysql() cr = ConfigurationReader() cursor = db.cursor() fasta_path = cr.get_path('ensembl_fasta') [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: #for species in ['danio_rerio']: print species dna_path = "{0}/{1}/dna".format(fasta_path, species) if (not os.path.exists(dna_path)): print "problem:", dna_path, "not found" exit(1) fasta_files = [] for r,d,files in os.walk(dna_path): for file in files: if (not file[-3:] == ".fa"): continue fasta_files.append(file) name2file = {} for file in fasta_files: print dna_path, file cmd = "grep '>' {0}/{1}".format(dna_path, file) ret = commands.getoutput(cmd) headers = ret.split("\n") print "number of headers: ", len(headers) for hdr in headers: fields = hdr.split(" ") name = fields[0].replace (">", "") #print name if (not name2file.has_key(name)): name2file[name] = [] name2file[name].append(file) qry = "use "+ensembl_db_name[species] search_db (cursor, qry) for name in name2file.keys(): file_names = "" for file in name2file[name]: if file_names: file_names += " " file_names += file store_seq_filenames (cursor, name, file_names) cursor.close() db .close()
def collect_paralogues(species_list, db_info): [local_db, ensembl_db_name] = db_info db_species = connect_to_mysql() cursor_species = db_species.cursor() ensembl_compara_name = get_compara_name(cursor_species) print ensembl_compara_name db_compara = connect_to_mysql() cursor_compara = db_compara.cursor() switch_to_db (cursor_compara, ensembl_compara_name) for species in species_list: switch_to_db (cursor_species, ensembl_db_name[species]) # it looks I cannot demand that the gene is known, because for many species # most of the genes still have 'predicted' status gene_list = get_gene_ids (cursor_species, biotype='protein_coding') ct = 0 for gene_id in gene_list: ct += 1 # find stable stable_id = gene2stable(cursor_species, gene_id=gene_id) # memebr id refers to entries in compara db member_id = stable2member(cursor_compara, stable_id) #print gene_id, stable_id, member_id if (not ct%100): print species, ct , "out of ", len(gene_list) # find all paralogue pairs suggested for this gene ortho_type = 'within_species_paralog' paralogues = get_orthologues(cursor_compara, ortho_type, member_id) if not paralogues: continue store_paralogues (cursor_species, gene_id, paralogues) print species, 'done' cursor_species.close() db_species.close() cursor_compara.close() db_compara.close()
def main():
    """Call check_alt_splices() on the human database."""
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    # human and mouse are the only two species that have CCDs info
    species_to_check = ('homo_sapiens',)
    for species in species_to_check:
        check_alt_splices(cursor, species, ensembl_db_name)

    cursor.close()
    db.close()
def main():
    """Distribute all species over 10 workers running collect_paralogues()."""
    no_threads = 10

    # the connection is only used to look up the species and database names
    db = connect_to_mysql()
    cursor = db.cursor()
    all_species, ensembl_db_name = get_species(cursor)
    cursor.close()
    db.close()

    db_info = [local_db, ensembl_db_name]
    parallelize(no_threads, collect_paralogues, all_species, db_info)

    return True
def main():
    """Run make_alignments() over all species through parallelize()."""
    no_threads = 1

    # species and database names are all that is needed from the server here
    db = connect_to_mysql()
    cursor = db.cursor()
    all_species, ensembl_db_name = get_species(cursor)
    cursor.close()
    db.close()

    db_info = [local_db, ensembl_db_name]
    parallelize(no_threads, make_alignments, all_species, db_info)

    return True
def main():
    """Run the genome size check; the table size check is switched off."""
    # switches for the two available checks
    do_genome_sizes = True
    do_table_sizes = False

    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    if do_genome_sizes:
        check_genome_sizes(cursor, all_species, ensembl_db_name)
    if do_table_sizes:
        check_table_sizes(cursor, all_species, ensembl_db_name)

    cursor.close()
    db.close()
def main():
    """Check genome sizes for all species (table size check is disabled)."""
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    # (enabled?, check) pairs, run in this order
    checks = ((1, check_genome_sizes),
              (0, check_table_sizes))
    for enabled, check in checks:
        if enabled:
            check(cursor, all_species, ensembl_db_name)

    cursor.close()
    db.close()
def main():
    """Distribute the known human protein-coding genes over 10 workers
    running collect_orthologues()."""
    no_threads = 10

    db = connect_to_mysql()
    cursor = db.cursor()
    all_species, ensembl_db_name = get_species(cursor)

    # the gene list comes from the human database
    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    cursor.close()
    db.close()

    db_info = [local_db, ensembl_db_name]
    parallelize(no_threads, collect_orthologues, gene_list, db_info)

    return True
def main():
    """Run alt_splice_almt() for human and mouse."""
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    # human and mouse are the only two species that have CCDs info
    species_with_ccds = ('homo_sapiens', 'mus_musculus')
    for species in species_with_ccds:
        alt_splice_almt(cursor, cfg, acg, species, ensembl_db_name)

    cursor.close()
    db.close()
def main(): no_threads = 12 special = None if len(sys.argv) > 1 and len(sys.argv)<3: print "usage: %s <set name> <no of processes>" % sys.argv[0] exit(1) # after usage statement elif len(sys.argv)>=3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_processes = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print "running ", sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize (no_processes, multiple_exon_alnmt, gene_list, [local_db, ensembl_db_name]) return True
def main():
    """Build a tree with one leaf per species and print its NHX string."""
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()

    all_species, ensembl_db_name = get_species(cursor)

    # one leaf node per species, in the order get_species() returned them
    tree = Tree()
    for species in all_species:
        tree.leafs.append(Node(species))

    tree.build(cursor)

    # the NHX string is framed by blank lines
    print()
    print(tree.nhx_string())
    print()

    cursor.close()
    db.close()
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) tree = Tree() for species in all_species: leaf = Node(species) tree.leafs.append(leaf) tree.build(cursor) print print tree.nhx_string() print cursor.close() db.close()
def main(): no_threads = 1 special = '' db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) species = '' if len(sys.argv) > 1 and len(sys.argv)<3 or len(sys.argv) >= 2 and sys.argv[1]=="-h": print "usage: %s <set name/species> <number of processes>" % sys.argv[0] exit(1) # after usage statement elif len(sys.argv)==3: special = sys.argv[1].lower() if special == 'none': special = None elif special in all_species: species = special no_threads = int(sys.argv[2]) print '=======================================' print sys.argv[0] if species: print species, "only" switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') parallelize_args = [no_threads, one_species_all_genes_loop, gene_ids, [local_db, ensembl_db_name, species]] elif special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) parallelize_args = [no_threads, ortologues_for_given_genes_loop, gene_list, [local_db, ensembl_db_name]] else: parallelize_args = [no_threads, all_species_all_genes_loop, all_species, [local_db, ensembl_db_name]] cursor.close() db .close() parallelize (*parallelize_args)
def make_alignments(species_list, db_info): [local_db, ensembl_db_name] = db_info verbose = False flank_length = 10 db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species(cursor) max_days = 60 for species in species_list: if species == "homo_sapiens": species_shorthand = "HSA" else: species_shorthand = get_species_shorthand(cursor, species) print species, species_shorthand directory = check_directory(cfg, species, species_shorthand, "pep") if not directory: continue removed = 0 remaining = 0 for dirname, dirnames, filenames in os.walk(directory): for filename in filenames: full_name = os.path.join(dirname, filename) time_modified = os.path.getmtime(full_name) number_of_days_since_modified = (time.time() - time_modified) / (60 * 60 * 24) if number_of_days_since_modified > max_days: # print "removing", filename, "made", number_of_days_since_modified, "ago" os.remove(full_name) else: remaining += 1 print species, "done, removed", removed, "files, remaining", remaining
def dump_orthos (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) # in the afa headers use 'trivial' names for the species: cow, dog, pig, ... trivial_name = translate_to_trivial(cursor, all_species) out_path = cfg.get_path('afs_dumps') outfile = "{0}/orthologue_dump.txt".format(out_path) print outfile of = erropen (outfile,"w") species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) qry = "select * from orthologue" rows = search_db (cursor, qry) for row in rows: [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] = row species = genome_db_id2species (cursor, genome_db_id) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable_id = gene2stable(cursor, human_gene_id) switch_to_db (cursor, ensembl_db_name[species]) cognate_stable_id = gene2stable(cursor, cognate_gene_id) print >>of, orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]]) of.close() cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db (cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db (cursor, qry) create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db .close()
def make_alignments(species_list, db_info):
    """Remove files older than 60 days from each species' "pep" directory.

    species_list -- species whose directories are cleaned
    db_info      -- [local_db, ensembl_db_name]; ensembl_db_name is
                    re-read from get_species() below

    Species for which check_directory() returns nothing are skipped. A
    summary of removed and remaining files is printed per species.
    """
    [local_db, ensembl_db_name] = db_info

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    # instantiated as in the original; not used below
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    max_days = 60
    seconds_per_day = 60 * 60 * 24

    for species in species_list:

        species_shorthand = get_species_shorthand(cursor, species)
        print(species, species_shorthand)

        directory = check_directory(cfg, species, species_shorthand, "pep")
        if not directory:
            continue

        removed = 0
        remaining = 0
        for dirname, dirnames, filenames in os.walk(directory):
            for filename in filenames:
                full_name = os.path.join(dirname, filename)
                time_modified = os.path.getmtime(full_name)
                number_of_days_since_modified = (time.time() - time_modified) / seconds_per_day
                if number_of_days_since_modified > max_days:
                    os.remove(full_name)
                    # fixed: 'removed' was never incremented, so the
                    # summary always reported 0
                    removed += 1
                else:
                    remaining += 1

        print(species, "done, removed", removed, "files, remaining", remaining)

    # fixed: the cursor and connection were previously left open
    cursor.close()
    db.close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db(cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db(cursor, qry) create_index(cursor, db_name, 'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db.close()
def one_species_all_genes_loop(gene_ids, db_info): [local_db, ensembl_db_name, species] = db_info db = connect_to_mysql() cursor = db.cursor() switch_to_db (cursor, ensembl_db_name[species]) #for gene_id in [10092907]: for gene_id in gene_ids: # for all exons in the gene exons = gene2exon_list(cursor, gene_id) if (not exons): if verbose: print 'no exons for gene', gene_id continue #################################### pep_seqs(cursor, gene_id, exons) #################################### if not gene_ids.index(gene_id) % 100: print "\t done with %d out of %d (%5.1f%%) " % (gene_ids.index(gene_id) + 1, len(gene_ids), 100 * (float(gene_ids.index(gene_id) + 1) / len(gene_ids))) sys.stdout.flush() cursor.close() db.close()
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) switch_to_db(cursor, ensembl_db_name['homo_sapiens']) magical_list = ['APC', 'BUB1', 'BUB1B', 'BUB3', 'C11orf51', 'CDC20', 'CDC27', 'CENPF', 'TERF1', 'TPR', 'TTK', 'UBE2C', 'UBE2D1', 'UBE2E1', 'TP53', 'BCL', ' RAS', ' MIC ', 'actin'] for gene_name in magical_list: description = "" gene_id = gene_name2gene_id(cursor, gene_name) if (not gene_id): [gene_id, description] = search_description (cursor, gene_name) if (not gene_id): continue print gene_name, " ** ", gene_id, description cursor.close() db .close()
def main():
    """Print the human gene id found for each name in a fixed list.

    gene_name2gene_id() is tried first, then search_description(); a name
    for which both come back empty produces no output.
    """
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    magical_list = ['APC', 'BUB1', 'BUB1B', 'BUB3', 'C11orf51', 'CDC20',
                    'CDC27', 'CENPF', 'TERF1', 'TPR', 'TTK', 'UBE2C',
                    'UBE2D1', 'UBE2E1', 'TP53', 'BCL', ' RAS', ' MIC ',
                    'actin']

    for gene_name in magical_list:
        description = ""
        gene_id = gene_name2gene_id(cursor, gene_name)
        if not gene_id:
            # no hit by name - try the description search instead
            [gene_id, description] = search_description(cursor, gene_name)
        if gene_id:
            print(gene_name, " ** ", gene_id, description)

    cursor.close()
    db.close()
def main():
    """Check every known human protein-coding gene for a canonical translation
    and canonical coding exons.

    Genes without a canonical translation are reported and skipped. The
    first gene without canonical coding exons is reported and the script
    exits with status 1.
    """
    local_db = False

    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    species = 'homo_sapiens'
    # fixed: select the human database before querying its genes, as the
    # other drivers that call get_gene_ids() do
    switch_to_db(cursor, ensembl_db_name[species])
    gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    # fixed: both counters were used below without ever being initialised
    ct = 0   # genes for which no canonical coding exons were found
    tot = 0  # genes examined so far - NOTE(review): intended meaning of
             # 'tot' is not visible in the original; confirm

    for gene_id in gene_ids:
        tot += 1
        stable_id = gene2stable(cursor, gene_id=gene_id)
        print(stable_id, end=' ')

        # the canonical translation according to Ensembl
        canonical_translation = get_canonical_transl(acg, cursor, gene_id, species, strip_X=False)
        if not canonical_translation:
            print("no canonical transl found for ", stable_id)
            continue

        # all canonical coding exons associated with the gene id
        exons = get_canonical_coding_exons(cursor, gene_id)
        if not exons:
            ct += 1
            print(gene_id, stable_id, " no exons found ", ct, tot)
            exit(1)

    cursor.close()
    db.close()
def main(): local_db = False db = connect_to_mysql() acg = AlignmentCommandGenerator() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) for gene_id in gene_ids: print gene2stable (cursor, gene_id = gene_id), # what is the length of the canonical transcript according to Ensembl canonical_translation = get_canonical_transl (acg, cursor, gene_id, species, strip_X=False) if ( not canonical_translation): print "no canonical transl found for ", gene2stable (cursor, gene_id = gene_id) continue # find all canonical coding exons associated with the gene id exons = get_canonical_coding_exons (cursor, gene_id) if (not exons): ct +=1 print gene_id, gene2stable (cursor, gene_id = gene_id), " no exons found ", ct, tot exit(1) cursor.close() db.close()
def main():
    """Sample 1000 random genes and report those with unmapped exons.

    Command line: <set name> <number of threads>; both or neither must be
    given. A set name other than 'none' selects get_theme_ids(); otherwise
    all known human protein-coding genes are used. For each sampled gene,
    every canonical coding exon is checked with get_maps(); genes for
    which some such exon has no map are counted as incomplete.
    """
    no_threads = 1
    special = None
    if len(sys.argv) > 1 and len(sys.argv)<3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv)==3:
        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    species = 'homo_sapiens'
    switch_to_db (cursor, ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special )
    else:
        print "using all protein coding genes"
        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list:
    #for gene_id in [743609]:
    # genes are drawn at random, so the same gene may be checked more than once
    for sampling_count in range(1000):

        gene_id = choice(gene_list)
        genes_checked += 1
        # per-gene counters: exons with at least one map / exons examined
        with_map = 0
        tot = 0

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            # only canonical coding exons are checked
            if (not human_exon.is_canonical or not human_exon.is_coding):
                continue
            # 'verbose' is not assigned in this function - presumably a
            # module-level flag; TODO confirm
            if verbose:
                print
                print "\t human", human_exon.exon_id, human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."

            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print"no maps for exon", human_exon.exon_id
                continue

            if verbose:
                for map in maps:
                    species = map.species_2
                    exon = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
                    if ( map.similarity):
                        print "\t", species, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            # rebuild the aligned sequence: a '0' bit becomes
                            # a gap, any other bit consumes the next residue
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"
                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else: reconst_pepseq += next(usi)
                            print "\tbinary   : ", bs.bin
                            print "\talnd seq: ", reconst_pepseq
                        print

        # tot == 0 implies with_map == 0, so the division below is only
        # reached with tot > 0
        if not tot== with_map:
            print "#### gene id: %d  total exons: %d  with map: %d ( = %d%%) " % \
                (gene_id, tot, with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d, incomplete: %d" % (genes_checked, incomplete)

    cursor.close()
    db.close()

    # values left over from the last gene sampled
    print tot, with_map
def main(): if (len(sys.argv) < 2): print "Usage: %s <stable gene id> [<exon1> <exon2> ... ]" % sys.argv[0] exit(1) stable_id = sys.argv[1] species = 'homo_sapiens' selected_exons = sys.argv[2:] db = connect_to_mysql() cursor = db.cursor() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) print species, stable_id, ensembl_db_name[species] switch_to_db(cursor, ensembl_db_name[species]) gene_id = stable2gene(cursor, stable_id) print get_description(cursor, gene_id) print "gene id:", gene_id # find all exons we are tracking in the database human_exons = gene2exon_list(cursor, gene_id) canonical_human_exons = [] for human_exon in human_exons: if not human_exon.is_canonical or not human_exon.is_coding: continue canonical_human_exons.append(human_exon) # the exons are not guaranteed to be in order canonical_human_exons.sort(key=lambda exon: exon.start_in_gene) print "exons:" for exon in canonical_human_exons: if selected_exons and not str(exon.exon_id) in selected_exons: continue switch_to_db(cursor, ensembl_db_name[species]) exon_seqs = get_exon_seqs(cursor, exon.exon_id, 1) [ exon_pep_seq, trsl_from, trsl_to, exon_left_flank, exon_right_flank, exon_dna_seq ] = exon_seqs[1:] print "exon:", exon.exon_id, "covering exon:", exon.covering_exon, "pepseq:", exon_pep_seq if not exon.covering_exon == -1: [ exon_pep_seq_2, trsl_from, trsl_to, exon_left_flank, exon_right_flank, exon_dna_seq ] = get_exon_seqs(cursor, exon.covering_exon, 1)[1:] print "\t", exon.covering_exon, " seq:", exon_pep_seq_2 if 1: print print 'exon_alignments:' maps = get_maps(cursor, ensembl_db_name, exon.exon_id, exon.is_known) if not maps: print "no maps for exon", exon.exon_id else: for map in maps: species_2 = map.species_2 exon_2 = map2exon(cursor, ensembl_db_name, map) unaligned_sequence = get_exon_pepseq( cursor, exon_2, ensembl_db_name[species_2]) if (map.similarity): print "\t", species_2, map.source, map.exon_id_2, map.exon_known_2 print "\tmaps to ", map.exon_id_1, 
map.exon_known_1 print "\tsim", map.similarity, print "\tsource", map.source print "\t", unaligned_sequence if not map.bitmap: print "\t bitmap not assigned" else: bs = Bits(bytes=map.bitmap) reconst_pepseq = '' if (not bs.count(1) == len(unaligned_sequence)): print "\talnd seq mismatch" else: usi = iter(unaligned_sequence) for c in bs.bin: if c == '0': reconst_pepseq += '-' else: reconst_pepseq += next(usi) print "\tbinary : ", bs.bin print "\talnd seq: ", reconst_pepseq print cursor.close() db.close()
def main(): no_threads = 1 special = None if len(sys.argv) > 1 and len(sys.argv) < 3: print "usage: %s <set name> <number of threads> " % sys.argv[0] exit(1) elif len(sys.argv) == 3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species(cursor) species = 'homo_sapiens' switch_to_db(cursor, ensembl_db_name[species]) if special: print "using", special, "set" gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db(cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1) incomplete = 0 genes_checked = 0 #for gene_id in gene_list: #for gene_id in [743609]: for sampling_count in range(1000): gene_id = choice(gene_list) genes_checked += 1 with_map = 0 tot = 0 switch_to_db(cursor, ensembl_db_name['homo_sapiens']) print gene2stable(cursor, gene_id), get_description(cursor, gene_id) # find all exons we are tracking in the database human_exons = gene2exon_list(cursor, gene_id) human_exons.sort(key=lambda exon: exon.start_in_gene) has_a_map = False for human_exon in human_exons: if (not human_exon.is_canonical or not human_exon.is_coding): continue if verbose: print print "\t human", human_exon.exon_id, human_exon.is_known print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens']) print "\t checking maps ..." 
maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) tot += 1 if maps: has_a_map = True with_map += 1 #print "ok" else: print "no maps for exon", human_exon.exon_id continue if verbose: for map in maps: species = map.species_2 exon = map2exon(cursor, ensembl_db_name, map) unaligned_sequence = get_exon_pepseq( cursor, exon, ensembl_db_name[species]) if (map.similarity): print "\t", species, map.source, map.exon_id_2, map.exon_known_2 print "\tmaps to ", map.exon_id_1, map.exon_known_1 print "\tsim", map.similarity, print "\tsource", map.source print "\t", unaligned_sequence if not map.bitmap: print "\t bitmap not assigned" else: bs = Bits(bytes=map.bitmap) reconst_pepseq = '' if (not bs.count(1) == len(unaligned_sequence)): print "\talnd seq mismatch" else: usi = iter(unaligned_sequence) for c in bs.bin: if c == '0': reconst_pepseq += '-' else: reconst_pepseq += next(usi) print "\tbinary : ", bs.bin print "\talnd seq: ", reconst_pepseq print if not tot == with_map: print "#### gene id: %d total exons: %d with map: %d ( = %d%%) " % \ (gene_id, tot, with_map, int(float(with_map)/tot*100) ) incomplete += 1 print "genes checked: %d, incomplete: %d" % (genes_checked, incomplete) cursor.close() db.close() print tot, with_map
def multiple_exon_alnmt(species_list, db_info): [local_db, ensembl_db_name] = db_info verbose = False db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() for species in species_list: print print "############################" print species switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding') #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway') if not gene_ids: print "no gene_ids" continue gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_paralogues = 0 for gene_id in gene_ids: if verbose: start = time() gene_ct += 1 if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids) if verbose: print print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) # get the paralogues - only the representative for the family will have this paralogues = get_paras (cursor, gene_id) if not paralogues: if verbose: print "\t not a template or no paralogues" continue if verbose: print "paralogues: ", paralogues # get _all_ exons template_exons = gene2exon_list(cursor, gene_id) if (not template_exons): if verbose: print 'no exons for ', gene_id continue # find all template exons we are tracking in the database for template_exon in template_exons: if verbose: print template_exon.exon_id maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id, template_exon.is_known, species=species, table='para_exon_map') if not maps: no_maps += 1 continue # output to fasta: seqname = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known) exon_seqs_info = get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known) if not exon_seqs_info: continue [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs_info if (not pepseq): if ( template_exon.is_coding and template_exon.covering_exon <0): # this should be a master exon print "no pep seq for", template_exon.exon_id, 
"coding ", template_exon.is_coding, print "canonical: ", template_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue tot += 1 sequences = {seqname:pepseq} headers = [seqname] for map in maps: exon = map2exon(cursor, ensembl_db_name, map, paralogue=True) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2) headers.append(seqname) sequences[seqname] = pepseq fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known) output_fasta (fasta_fnm, headers, sequences) if (len(headers) <=1 ): print "single species in the alignment (?)" no_paralogues += 1 continue # align afa_fnm = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) # read in the alignment inf = erropen(afa_fnm, "r") if not inf: print gene_id continue template_seq_seen = False for record in SeqIO.parse(inf, "fasta"): ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0'))) msa_bitmap = bs.tobytes() # Retrieve information on the cognate label, cognate_exon_id, cognate_exon_known = record.id.split(':') if (label == 'template'): template_seq_seen = True # Write the bitmap to the database #print "updating: ", template_exon.exon_id store_or_update(cursor, "para_exon_map", {"cognate_exon_id" :cognate_exon_id, "cognate_exon_known" :cognate_exon_known, "exon_id" :template_exon.exon_id, "exon_known" :template_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) inf.close() ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); outstr = species + " done \n" outstr += "tot: %d ok: %d \n" % (tot, ok) outstr += "no maps %d \n" % no_pepseq outstr += "no 
pepseq %d \n" % no_pepseq outstr += "no paralogues %d \n" % no_paralogues outstr += "\n" print outstr
def main(): db = connect_to_mysql() acg = AlignmentCommandGenerator() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) if len(sys.argv) > 1: species_list = sys.argv[1:] else: species_list = all_species ############################ for species in species_list: print print "############################" print species switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') ct = 0 tot = 0 for tot in range(1000): #for gene_id in gene_ids: #tot += 1 gene_id = choice(gene_ids) # find all canonical coding exons associated with the gene id exons = get_canonical_coding_exons(cursor, gene_id) if (not exons): ct += 1 print gene_id, gene2stable( cursor, gene_id=gene_id), " no exons found ", ct, tot if not tot % 100: print species, tot, ct # add up the coding length of the canonical exons exons.sort(key=lambda exon: exon.start_in_gene) inside_the_coding_range = False start_properly_marked = False length = 0 for exon in exons: if not exon.canon_transl_start is None: start_properly_marked = True # if it is not propermy marked, we'll never start reading inside_the_coding_range = True length -= exon.canon_transl_start - 1 if not exon.canon_transl_end is None: inside_the_coding_range = False length += exon.canon_transl_end if inside_the_coding_range: length += exon.end_in_gene - exon.start_in_gene + 1 # take that all exons are coding full length if there is no start and end annotation # (this I believe is the case for predicted transcripts) if not start_properly_marked: length = 0 for exon in exons: length += exon.end_in_gene - exon.start_in_gene + 1 if (not length): print gene2stable( cursor, gene_id=gene_id), " no exons marked as canonical" continue # what is the length of the canonical transcript according to Ensembl canonical_translation = get_canonical_transl(acg, cursor, gene_id, species, 
strip_X=False) if (not canonical_translation): print "no canonical transl found for ", gene_id continue if (abs(length / 3 - len(canonical_translation)) > 3): ct += 1 print gene_id, gene2stable(cursor, gene_id), get_description( cursor, gene_id) print "(length of all exons)/3 ", length / 3, print " does not match reported canonical transl len ", len( canonical_translation) if False: # print out all exons print "exons:" inspect(exons) print print 'canonical sequence' print re.sub( "(.{50})", "\\1\n", canonical_translation ) # print canonical sequence with \n stuck in every 50 positions print # print out exons more carefully filtered to belong to the canonical version of the translation print get_translated_region_talkative(cursor, gene_id, species) all_exons = gene2exon_list(cursor, gene_id) print "all exons:" inspect(all_exons) print compare_seqs(canonical_translation, translated_seq, verbose=False) exit(1) print species, "checked a sample of ", tot + 1, "genes; problematic:", ct cursor.close() db.close() # # print 'Note: some problems could not have be resolved up to this point,' # print 'becasue we have not really looged at the exons seqs yet.' # print 'For example, for MP furo the, start fo the cannonical translation' # print 'is sometimes given in the middle of NNNNN region.' # return True
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: if not species=='homo_sapiens': continue print print species switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') tot_exons = 0 no_exon_seq = 0 short_dna = 0 pepseq_ok = 0 mismatch = 0 stored_incorrect = 0 translation_fail = 0 ##################################### #for gene_id in [10092907]: for gene_id in gene_ids: #for tot in range(1000): #gene_id = choice(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for gene', gene_id sys.exit(1) for exon in exons: ##################################### if not exon.is_coding: print exon.exon_id, " not coding " continue if exon.covering_exon >0: print exon.exon_id, " is covered by ", exon.covering_exon continue tot_exons += 1 # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): no_exon_seq += 1 print "no exon seqs for ", gene_id, exon.exon_id #exit(1) continue [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs if len(dna_seq)<3: short_dna += 1 print "short_dna:", dna_seq continue if (pepseq_transl_start == -10): # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None? 
translation_fail += 1 print "pepseq_transl_start:", pepseq_transl_start continue mitochondrial = is_mitochondrial(cursor, gene_id) dnaseq = Seq (dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna) if (mitochondrial): pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring() else: pepseq2 = dnaseq.translate().tostring() if True: print exon.exon_id print "pep stored:", pepseq print "dna transl:", pepseq2 print "dna begin:", dna_seq[:12] print "start:" , pepseq_transl_start, print "end:", pepseq_transl_end print if (not pepseq == pepseq2): stored_incorrect += 1 else: pepseq_ok += 1 print "total coding exons ", tot_exons print "no exon seq info ", no_exon_seq print "short dna ", short_dna print "transl failure ", translation_fail print "stored pepseq does not correspond to the translation of stored dna: ", stored_incorrect print "pepseq ok ", pepseq_ok cursor.close() db .close()
def main(): special = None no_threads = 1 db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) if special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # loop over all genes sw_count = 0 tot_count = 0 for human_gene_id in gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable = gene2stable (cursor, human_gene_id) human_description = get_description(cursor, human_gene_id) tot_count += 1 #print human_gene_id, human_stable, human_description human_exons = [e for e in gene2exon_list(cursor, human_gene_id, verbose=True) if e.covering_exon < 0 and e.is_canonical and e.is_known] if not human_exons: #print "\t\t", human_stable, "no exons found" continue human_exons.sort(key=lambda exon: exon.start_in_gene) # loop over all exons in this gene maps_for_exon = {} for he in human_exons: he.stable_id = exon2stable (cursor, he.exon_id, ensembl_db_name['homo_sapiens']) he.pepseq = get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens']) # maps cleanup: get rid of maps that have "none" as similarity maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known) # exon data if not maps_for_exon[he]: continue #maps_for_exon[he] = filter (lambda m: m.source == 'sw_sharp' or m.source == 'usearch', # maps_for_exon[he]) maps_for_exon[he] = filter (lambda m: m.source == 'usearch', maps_for_exon[he]) if not maps_for_exon[he]: #print "\t\t", human_stable, "no maps found" continue sw_count += len(maps_for_exon[he]) #break print "tot count: ", tot_count print "sw count: ", sw_count #print "tot count: ", tot_count #print "sw count: ", sw_count cursor.close() db.close()
def multiple_exon_alnmt(gene_list, db_info): print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list)) [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # for each human gene gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_orthologues = 0 min_similarity = cfg.get_value('min_accptbl_exon_sim') #gene_list.reverse() for gene_id in gene_list: start = time() gene_ct += 1 if not gene_ct%10: print gene_ct, "genes out of", len(gene_list) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) print gene_ct, len(gene_ids), gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id)) human_exons.sort(key=lambda exon: exon.start_in_gene) ################################################################## for human_exon in human_exons: tot += 1 # find all orthologous exons the human exon maps to maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) if verbose: print "\texon no.", tot, " id", human_exon.exon_id, if not maps: print " no maps" print human_exon print if not maps: no_maps += 1 continue # human sequence to fasta: seqname = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known) if (not pepseq): if verbose and human_exon.is_coding and human_exon.covering_exon <0: # this should be a master exon print "no 
pep seq for", human_exon.exon_id, "coding ", human_exon.is_coding, print "canonical: ", human_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue # collect seq from all maps, and output them in fasta format hassw = False headers = [] sequences = {} exons_per_species = {} for map in maps: switch_to_db (cursor, ensembl_db_name[map.species_2]) if map.similarity < min_similarity: continue exon = map2exon(cursor, ensembl_db_name, map) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue if map.source == 'sw_sharp': exon_known_code = 2 hassw = True elif map.source == 'usearch': exon_known_code = 3 hassw = True else: exon_known_code = map.exon_known_2 seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code) headers.append(seqname) sequences[seqname] = pepseq # for split exon concatenation (see below) if not map.species_2 in exons_per_species.keys(): exons_per_species[map.species_2] = [] exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]); if (len(headers) <=1 ): if verbose: print "single species in the alignment" no_orthologues += 1 continue # concatenate exons from the same gene - the alignment program might go wrong otherwise concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species) fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id) output_fasta (fasta_fnm, sequences.keys(), sequences) # align afa_fnm = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) if (verbose): print 'almt to', afa_fnm # read in the alignment inf = erropen(afa_fnm, "r") aligned_seqs = {} for record in SeqIO.parse(inf, "fasta"): aligned_seqs[record.id] = str(record.seq) inf.close() # split back the concatenated exons if concatenated: split_concatenated_exons (aligned_seqs, concatenated) human_seq_seen = False for seq_name, sequence in 
aligned_seqs.iteritems(): # if this is one of the concatenated seqs, split them back to two ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0'))) # The returned value of tobytes() will be padded at the end # with between zero and seven 0 bits to make it byte aligned. # I will end up with something that looks like extra alignment gaps, that I'll have to return msa_bitmap = bs.tobytes() # Retrieve information on the cognate cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':') if cognate_exon_known == '2': source = 'sw_sharp' elif cognate_exon_known == '3': source = 'usearch' else: source = 'ensembl' if (cognate_species == 'homo_sapiens'): human_seq_seen = True cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens # Write the bitmap to the database #if (cognate_species == 'homo_sapiens'): if verbose: # and (source=='sw_sharp' or source=='usearch'): print "storing" print human_exon.exon_id, human_exon.is_known print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source print sequence if not msa_bitmap: print "no msa_bitmap" continue store_or_update(cursor, "exon_map", {"cognate_genome_db_id":cognate_genome_db_id, "cognate_exon_id":cognate_exon_id ,"cognate_exon_known" :cognate_exon_known, "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); print "tot: ", tot, "ok: ", ok print "no maps ", no_pepseq print "no pepseq ", no_pepseq print "no orthologues ", no_orthologues print
def main(): parameter = {} # in case I ever have to handle multiple versions of ensembl # (but for now I don't have enough space) # note though that there are functions in el_utils/mysql.py that assume # that whatever ensembl stuff is available to the mysql server corresponds to the same release release_number = '76' parameter['ensembl_release_number'] = release_number parameter['blastp_e_value'] = "1.e-10" # it will be used as a string when fmting the blastp cmd parameter['min_accptbl_exon_sim'] = 0.33333 #minimum acceptable exon similarity dir_path = {} dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta' # local juggling of data from one database base to the other dir_path['afs_dumps'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['afs_dumps'] += 'ExoLocator/results/dumpster' dir_path['resources'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['resources'] += 'pypeworks/exolocator/resources' dir_path['scratch'] = '/tmp' dir_path['maxentscan'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['maxentscan'] += 'pypeworks/exolocator/pl_utils/maxentscan' util_path = {} util_path['mafft'] = '/usr/bin/mafft' util_path['blastall'] = '/usr/bin/blastall' util_path['fastacmd'] = '/usr/bin/fastacmd' util_path['sw#'] = '/usr/bin/swsharp' util_path['usearch'] = '/usr/bin/usearch' util_path['score3'] = dir_path['maxentscan'] + '/score3.pl' util_path['score5'] = dir_path['maxentscan'] + '/score5.pl' if 1: # check if the paths are functioning (at this point at least) for util in util_path.values(): if (not os.path.exists(util)): print util, " not found " sys.exit (1) for dir in dir_path.values(): if (not os.path.exists(dir)): print dir, " not found " sys.exit (1) if (not os.path.isdir (dir)): print dir, " is not a directory " sys.exit (1) db = connect_to_mysql() cursor = db.cursor() ####################################################### # check if the config db exists -- if not, 
make it db_name = "exolocator_config" qry = "show databases like'%s'" % db_name rows = search_db (cursor, qry) if (not rows): print db_name, "database not found" qry = "create database %s " % db_name rows = search_db (cursor, qry) if (rows): print "some problem creating the database ..." rows = search_db (cursor, qry, verbose = True) else: print db_name, "database found" qry = "use %s " % db_name search_db (cursor, qry) # make tables for table in ['util_path', 'dir_path', 'parameter']: if ( check_table_exists (cursor, db_name, table)): print table, " found in ", db_name else: print table, " not found in ", db_name make_table (cursor, table) # fill util, dir and path tables fixed_fields = {} update_fields = {} for [name, path] in util_path.iteritems(): fixed_fields['name'] = name update_fields['path'] = path store_or_update (cursor, 'util_path', fixed_fields, update_fields) fixed_fields = {} update_fields = {} for [name, path] in dir_path.iteritems(): fixed_fields['name'] = name update_fields['path'] = path store_or_update (cursor, 'dir_path', fixed_fields, update_fields) fixed_fields = {} update_fields = {} for [name, value] in parameter.iteritems(): fixed_fields['name'] = name update_fields['value'] = value store_or_update (cursor, 'parameter', fixed_fields, update_fields) ####################################################### # add trivial names to ncbi_taxonomy.names [all_species, ensembl_db_name] = get_species (cursor) feed_trivial_names (cursor, all_species) ####################################################### # add species shorthands (used in ENS* names formation) # though we will not needed unit the paralogue alignment reconstruction point) feed_name_shorthands (cursor, all_species) cursor.close() db.close()
def dump_exons(species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit(1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen(outfile, "w") if not of: continue switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct % 1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known == 2: analysis = 'sw_sharp' elif exon.is_known == 3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor, gene_id) if (exon.is_known == 1): exon_stable_id = exon2stable(cursor, exon.exon_id) elif (exon.is_known == 2): exon_stable_id = 'sw_sharp_' + str(exon.exon_id) elif (exon.is_known == 3): exon_stable_id = 'usearch_' + str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring(exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db.close()
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) for species in all_species: if not species == 'homo_sapiens': continue print print species switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') tot_exons = 0 no_exon_seq = 0 short_dna = 0 pepseq_ok = 0 mismatch = 0 stored_incorrect = 0 translation_fail = 0 ##################################### #for gene_id in [10092907]: for gene_id in gene_ids: #for tot in range(1000): #gene_id = choice(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for gene', gene_id sys.exit(1) for exon in exons: ##################################### if not exon.is_coding: print exon.exon_id, " not coding " continue if exon.covering_exon > 0: print exon.exon_id, " is covered by ", exon.covering_exon continue tot_exons += 1 # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): no_exon_seq += 1 print "no exon seqs for ", gene_id, exon.exon_id #exit(1) continue [ exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq ] = exon_seqs if len(dna_seq) < 3: short_dna += 1 print "short_dna:", dna_seq continue if ( pepseq_transl_start == -10 ): # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None? 
translation_fail += 1 print "pepseq_transl_start:", pepseq_transl_start continue mitochondrial = is_mitochondrial(cursor, gene_id) dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna) if (mitochondrial): pepseq2 = dnaseq.translate( table="Vertebrate Mitochondrial").tostring() else: pepseq2 = dnaseq.translate().tostring() if True: print exon.exon_id print "pep stored:", pepseq print "dna transl:", pepseq2 print "dna begin:", dna_seq[:12] print "start:", pepseq_transl_start, print "end:", pepseq_transl_end print if (not pepseq == pepseq2): stored_incorrect += 1 else: pepseq_ok += 1 print "total coding exons ", tot_exons print "no exon seq info ", no_exon_seq print "short dna ", short_dna print "transl failure ", translation_fail print "stored pepseq does not correspond to the translation of stored dna: ", stored_incorrect print "pepseq ok ", pepseq_ok cursor.close() db.close()