def main(): special = None no_threads = 1 db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) total = 0 for species in all_species: print species switch_to_db(cursor, ensembl_db_name[species]) qry = "select count(1) from usearch_exon" rows = search_db(cursor, qry) count = int(rows[0][0]) print "\t usearch exons: ", count total += count qry = "select count(1) from sw_exon" rows = search_db(cursor, qry) count = int(rows[0][0]) print "\t sw exons: ", count total += count print print 'total: ', total cursor.close() db.close()
def main(): special = None no_threads = 1 db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) total = 0 for species in all_species: print species switch_to_db (cursor, ensembl_db_name[species]) qry = "select count(1) from usearch_exon" rows = search_db (cursor, qry) count = int(rows[0][0]) print "\t usearch exons: ", count total += count qry = "select count(1) from sw_exon" rows = search_db (cursor, qry) count = int(rows[0][0]) print "\t sw exons: ", count total += count print print 'total: ', total cursor.close() db.close()
def make_parameter_table (cursor): """ Creates parameter table in the config database. @param [cursor] db cursor, assumed top be pointing to the config database @retval True on success @retval False on failure; in that case the seach_db() call is repeated in verbose mode. """ table = 'parameter' print "making ", table qry = "create table " + table + " (id int(10) primary key auto_increment)" rows = search_db (cursor, qry, verbose=True) if (rows): return False # make the columns for column in ['name', 'value']: qry = "alter table %s add %s varchar (50)" % (table, column) rows = search_db (cursor, qry, verbose=True) if (rows): return False return False
def main(): db = connect_to_mysql() cr = ConfigurationReader() cursor = db.cursor() fasta_path = cr.get_path('ensembl_fasta') [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: #for species in ['danio_rerio']: print species dna_path = "{0}/{1}/dna".format(fasta_path, species) if (not os.path.exists(dna_path)): print "problem:", dna_path, "not found" exit(1) fasta_files = [] for r,d,files in os.walk(dna_path): for file in files: if (not file[-3:] == ".fa"): continue fasta_files.append(file) name2file = {} for file in fasta_files: print dna_path, file cmd = "grep '>' {0}/{1}".format(dna_path, file) ret = commands.getoutput(cmd) headers = ret.split("\n") print "number of headers: ", len(headers) for hdr in headers: fields = hdr.split(" ") name = fields[0].replace (">", "") #print name if (not name2file.has_key(name)): name2file[name] = [] name2file[name].append(file) qry = "use "+ensembl_db_name[species] search_db (cursor, qry) for name in name2file.keys(): file_names = "" for file in name2file[name]: if file_names: file_names += " " file_names += file store_seq_filenames (cursor, name, file_names) cursor.close() db .close()
def check_table_sizes(cursor, all_species, ensembl_db_name): for species in all_species: print print "##########################" print species qry = "use " + ensembl_db_name[species] search_db(cursor, qry) qry = "show tables" rows = search_db(cursor, qry) for row in rows: table = row[0] qry = " select count(1) from " + table rows = search_db(cursor, qry) table_size = rows[0][0] print "\t ", table, table_size
def check_table_sizes (cursor, all_species, ensembl_db_name): for species in all_species: print print "##########################" print species qry = "use "+ensembl_db_name[species] search_db(cursor, qry) qry = "show tables" rows = search_db(cursor, qry) for row in rows: table = row[0] qry = " select count(1) from "+table rows = search_db(cursor, qry) table_size = rows[0][0] print "\t ", table, table_size
def main(): db = connect_to_mysql() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: print species switch_to_db (cursor, ensembl_db_name[species]) qry = "select seq_region.name, seq_region.file_name from seq_region, gene " qry += " where gene.biotype='protein_coding' and gene.seq_region_id = seq_region.seq_region_id " rows = search_db (cursor, qry) if (not rows): print "\t no seq region info found " continue tot = 0 no_file = 0 for row in rows: [name, file_name] = row #print name, file_name tot += 1 if (not file_name): no_file += 1 print name, file_name #exit (1) print "\t tot seq_regions: ", tot, " no file: ", no_file cursor.close() db .close()
def main (): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def get_seq_region_info(cursor, name): qry = "select * from seq_region where name = '%s'" % name rows = search_db (cursor, qry) if(len(rows) > 1): print "more than one entry associated with ", name exit (1) return rows[0]
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def search_description(cursor, gene_name):
    """
    Look for a gene whose description contains <gene_name>.

    Returns the first matching row (gene_id, description), or ["", ""]
    if nothing matches.
    """
    qry = "".join(["select gene_id, description from gene ",
                   "where description like '%", gene_name, "%'"])
    rows = search_db(cursor, qry)
    return rows[0] if rows else ["", ""]
def search_description (cursor, gene_name):
    """
    Return the first (gene_id, description) row of the gene table whose
    description contains <gene_name> as a substring; ["", ""] if there
    is no such row.
    """
    pattern = "%" + gene_name + "%"
    qry  = "select gene_id, description from gene "
    qry += "where description like '" + pattern + "'"
    rows = search_db (cursor, qry)
    if rows:
        return rows[0]
    return ["", ""]
def get_exon_end(cursor, exon_id): qry = "select seq_region_end from exon " qry += "where exon_id = %d " % exon_id rows = search_db (cursor, qry) if (not rows or 'Error' in rows[0]): print "start not found for ", exon_id return None return rows[0][0]
def get_phase(cursor, exon_id):
    """
    Return [is_coding, phase, gene_id] for the exon, as stored in the
    gene2exon table; [0, 0, 0] if the exon is not found there.
    """
    rows = search_db(cursor,
                     "select is_coding, phase, gene_id from gene2exon where exon_id = %d" % exon_id)
    if not rows:
        return [0, 0, 0]
    is_coding, phase, gene_id = rows[0]
    return [is_coding, phase, gene_id]
def get_exon_end(cursor, exon_id): qry = "select seq_region_end from exon " qry += "where exon_id = %d " % exon_id rows = search_db(cursor, qry) if (not rows or 'Error' in rows[0]): print "start not found for ", exon_id return None return rows[0][0]
def cleanup_endphase (cursor, exon):
    """
    Overwrite exon.phase and exon.end_phase with the values stored in
    the exon table; both are set to 0 if the exon is not found there.
    """
    rows = search_db (cursor,
                      "select phase, end_phase from exon where exon_id = %d " % exon.exon_id)
    if rows:
        first = rows[0]
        exon.phase     = first[0]
        exon.end_phase = first[1]
    else:
        exon.phase     = 0
        exon.end_phase = 0
def make_path_table (cursor, table): print "making ", table qry = "create table " + table + " (id int(10) primary key auto_increment)" rows = search_db (cursor, qry, verbose=True) if (rows): return False # make the columns column = 'name' qry = "alter table %s add %s varchar (20)" % (table, column) rows = search_db (cursor, qry, verbose=True) if (rows): return False column = 'path' qry = "alter table %s add %s blob" % (table, column) rows = search_db (cursor, qry, verbose=True) if (rows): return False
def transcript_id2exon_ids (cursor, transcript_id):
    """
    Return the list of exon ids that the exon_transcript table assigns
    to <transcript_id> (an integer); empty list if there are none.
    """
    qry = "".join(["select exon_id from exon_transcript ",
                   " where transcript_id = %d " % transcript_id])
    rows = search_db (cursor, qry)
    if not rows:
        return []
    return [row[0] for row in rows]
def gene_name2gene_id(cursor, gene_name):
    """
    Translate a gene name, taken to be an external synonym, into the
    ensembl_id of the corresponding 'Gene' entry in object_xref.

    Returns "" if the synonym is not found.
    """
    qry_parts = [
        "select ensembl_id from object_xref, external_synonym ",
        "where object_xref.ensembl_object_type = 'Gene' ",
        "and object_xref.xref_id= external_synonym.xref_id ",
        "and external_synonym.synonym = '%s' " % gene_name,
        "group by synonym",
    ]
    rows = search_db(cursor, "".join(qry_parts))
    if rows:
        return rows[0][0]
    return ""
def map_cleanup (cursor, ensembl_db_name, human_exons):
    """
    In the human database, delete for each of the given exons those
    exon_map rows that have cognate_exon_known > 1 and no similarity.

    Always returns True.
    """
    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    for exon in human_exons:
        conditions = ["delete from exon_map where exon_id = %d " % exon.exon_id,
                      " and exon_known = %d " % exon.is_known,
                      " and cognate_exon_known > 1 ",
                      " and similarity is NULL"]
        search_db (cursor, "".join(conditions), verbose=False)
    return True
def gene_name2gene_id(cursor, gene_name):
    """
    Find the ensembl_id of the gene that has <gene_name> among its
    external synonyms; return "" if there is no such gene.
    """
    qry = ("select ensembl_id from object_xref, external_synonym "
           + "where object_xref.ensembl_object_type = 'Gene' "
           + "and object_xref.xref_id= external_synonym.xref_id "
           + "and external_synonym.synonym = '%s' " % gene_name
           + "group by synonym")
    rows = search_db (cursor, qry)
    return rows[0][0] if rows else ""
def transcript_id2exon_ids(cursor, transcript_id):
    """
    Collect the ids of all exons belonging to the transcript with the
    given integer id, according to the exon_transcript table.

    Returns an empty list if the transcript has no exons listed.
    """
    qry = "select exon_id from exon_transcript "
    qry += " where transcript_id = %d " % transcript_id
    rows = search_db(cursor, qry)

    exon_ids = []
    if rows:
        exon_ids.extend(row[0] for row in rows)
    return exon_ids
def make_seqregion2file_table (cursor):
    """
    Create the seqregion2file table: integer primary key seqregion_id,
    a varchar(100) 'seq_name' column and a blob 'file_name' column.

    Returns False as soon as one of the search_db() calls returns
    anything, True if all statements went through. The original
    returned None on success, which is falsy just like the failure value.
    """
    table = 'seqregion2file'

    qry  = "create table " + table + "  (seqregion_id int(10) primary key)"
    rows = search_db (cursor, qry)
    if (rows):
        return False

    # make the columns
    for [column, column_type] in [['seq_name', 'varchar (100)'],
                                  ['file_name', 'blob']]:
        qry = "alter table  %s  add  %s  %s" % (table, column, column_type)
        rows = search_db (cursor, qry)
        if (rows):
            return False

    return True
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db (cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db (cursor, qry) create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db(cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db(cursor, qry) create_index(cursor, db_name, 'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db.close()
def check_ccds(cursor, transcript_stable_id):
    """
    Return the CCDS identifier supporting the transcript, or "" if
    there is none.

    The identifier is looked for among the hit names of the
    dna_align_features listed as supporting features of the transcript;
    if several hit names contain 'CCDS', the last one returned by the
    query wins.
    """
    qry = "".join([
        "select dna_align_feature.hit_name ",
        "from dna_align_feature, transcript, transcript_supporting_feature ",
        "   where dna_align_feature.dna_align_feature_id =  transcript_supporting_feature.feature_id ",
        "   and transcript_supporting_feature.feature_type ='dna_align_feature' ",
        "   and transcript_supporting_feature.transcript_id =transcript.transcript_id ",
        "   and transcript.stable_id = '%s' " % transcript_stable_id,
    ])
    hits = search_db(cursor, qry)
    if not hits:
        return ""

    ccds = ""
    for hit in hits:
        hit_name = hit[0]
        if 'CCDS' in hit_name:
            ccds = hit_name
    return ccds
def dump_orthos (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) # in the afa headers use 'trivial' names for the species: cow, dog, pig, ... trivial_name = translate_to_trivial(cursor, all_species) out_path = cfg.get_path('afs_dumps') outfile = "{0}/orthologue_dump.txt".format(out_path) print outfile of = erropen (outfile,"w") species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) qry = "select * from orthologue" rows = search_db (cursor, qry) for row in rows: [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] = row species = genome_db_id2species (cursor, genome_db_id) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable_id = gene2stable(cursor, human_gene_id) switch_to_db (cursor, ensembl_db_name[species]) cognate_stable_id = gene2stable(cursor, cognate_gene_id) print >>of, orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]]) of.close() cursor.close() db .close()
def check_ccds (cursor, transcript_stable_id):
    """
    Check whether a transcript is supported by a CCDS entry.

    Returns the hit name containing 'CCDS' (the last such name, if
    there are several) among the dna_align_features supporting the
    transcript; returns "" if there is none.
    """
    qry = ("select dna_align_feature.hit_name "
           + "from dna_align_feature, transcript, transcript_supporting_feature "
           + "   where dna_align_feature.dna_align_feature_id =  transcript_supporting_feature.feature_id "
           + "   and transcript_supporting_feature.feature_type ='dna_align_feature' "
           + "   and transcript_supporting_feature.transcript_id =transcript.transcript_id "
           + "   and transcript.stable_id = '%s' " % transcript_stable_id)

    ccds = ""
    rows = search_db(cursor, qry)
    if rows:
        for row in rows:
            if not 'CCDS' in row[0]:
                continue
            ccds = row[0]
    return ccds
def get_theme_ids(cursor, cfg, theme_name): resources = cfg.dir_path['resources'] fnm = resources + '/' + theme_name+'.txt' if not os.path.exists(fnm): print fnm, "not found" exit(1) if not os.path.getsize(fnm) > 0: print fnm, "empty" exit(1) inf = erropen(fnm, "r") gene_ids = [] for line in inf: line.rstrip() [stable_id, name] = line.split("\t") qry = "select gene_id, description from gene where stable_id='%s'"% stable_id rows = search_db (cursor, qry) if not rows: continue gene_ids.append(rows[0][0]) inf.close() return gene_ids
def alt_splice_almt (cursor, cfg, acg, species, ensembl_db_name):
    """
    For each known protein coding gene of <species> that has transcripts
    with CCDS support, write an .afa file to <afs_dumps>/alt/<HSA|MMU>/
    named after the stable gene id. The file holds, for each such
    transcript, its exons (with lower-cased flanks) and their
    translation, placed at their positions along the gene sequence.

    Relies on a module-level 'verbose' flag. Always returns True.

    NOTE(review): 'gene_sequence' and 'mitochondrial' are used below but
    never assigned in this function (the gene sequence is unpacked into
    'gene_seq') - confirm that they are defined at module level.
    """

    flank_length = 10

    print "############################"
    print 'checking alt splicing in ', species

    qry = "use " + ensembl_db_name[species]
    search_db(cursor, qry)

    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # any species other than human gets the mouse label
    if species == 'homo_sapiens':
        spec_short = 'HSA'
    else:
        spec_short = 'MMU'

    outdir = "{0}/alt/{1}".format(cfg.dir_path['afs_dumps'], spec_short)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ########################################
    ########################################
    ########################################
    #gene_ids.reverse()
    for gene_id in gene_ids:
    #for gene_id in [429349]:
    #for count in range(1000):
        #gene_id = choice (gene_ids)
        stable_gene_id = gene2stable(cursor, gene_id)
        if verbose: print gene_id, stable_gene_id, get_description (cursor, gene_id)

        # keep only the transcripts which have a CCDS entry
        transcript_ids = get_transcript_ids(cursor, gene_id)
        tr_w_ccds = []
        for [tr_id, tr_stable] in transcript_ids:
            ccds = check_ccds (cursor, tr_stable)
            if not ccds: continue
            tr_w_ccds.append([tr_id, tr_stable])

        if not tr_w_ccds: continue

        # get all exons for this gene
        all_exons = gene2exon_list (cursor, gene_id)
        exons_w_ccds = set([]) # get the unique_ids

        # find exons which are on the ccds list
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids = transcript_id2exon_ids (cursor, tr_id)
            exons_w_ccds.update( set(exon_ids))

        # for these exons check sequence
        is_known = 1
        bad_exon = set([])
        for exon_id in exons_w_ccds:
            exon = get_exon (cursor, exon_id, is_known)
            seq = get_exon_seqs (cursor, exon_id, is_known)
            if not seq:
                bad_exon.add(exon_id)
                continue
            [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = seq
            if exon.covering_exon < 0:
                # negative covering_exon is treated as "no covering exon"
                if not dna_seq:
                    bad_exon.add(exon_id)
            else:
                if exon.covering_exon_known and exon.covering_exon in exons_w_ccds:
                    pass
                else:
                    # bad if the covering exon is not among this gene's exons
                    all_exon_ids = map(lambda exon: exon.exon_id, all_exons)
                    if not exon.covering_exon in all_exon_ids:
                        bad_exon.add(exon_id)

        # which transcripts seem to be completely ok?
        if verbose: print "reconstructing alt splice almts for "
        if verbose: print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)
        if verbose: print "there are ", len(tr_w_ccds), " transscripts with ccds"

        # get the gene_sequence
        # NOTE(review): the sequence lands in 'gene_seq', while everything
        # below reads 'gene_sequence' - confirm
        ret = get_gene_seq(acg, cursor, gene_id, species)
        [gene_seq, canonical_exon_pepseq, file_name, seq_name, seq_region_start, seq_region_end] = ret

        output_seq = {}          # sequence name -> gapped sequence, as long as the gene
        global_boundaries = []   # flank boundaries collected over all transcripts
        local_boundaries = {}    # transcript stable id -> its own flank boundaries

        # sort exons by the order in which they appear in the gene
        all_exons.sort(key=lambda exon: exon.start_in_gene)

        # a bit of a cleanup
        for exon in all_exons:
            cleanup_endphase (cursor, exon)

        # check if any of the translations is complete:
        no_ok_transcripts = True
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids = transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue
            if verbose: print tr_stable, " ok "
            no_ok_transcripts = False

        if no_ok_transcripts:
            if verbose: print " no ok transcripts found"
            continue

        # main loop
        cary = "" # for patching up codons split by intron
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids = transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue

            # translation is from where to where?
            ret = get_translation_coords (cursor, tr_id)
            if not ret: continue
            [seq_start, start_exon_id, seq_end, end_exon_id] = ret
            # NOTE(review): start_exon / end_exon stay unset (or keep the value
            # from the previous transcript) if the ids are not in all_exons
            for exon in all_exons:
                if exon.exon_id == start_exon_id: start_exon=exon
                if exon.exon_id == end_exon_id: end_exon=exon

            transl_start_in_gene = start_exon.start_in_gene + seq_start
            transl_end_in_gene = end_exon.start_in_gene + seq_end

            # start from all-gap sequences for the dna and for the peptide
            local_boundaries[tr_stable] = []
            output_seq[tr_stable] = "-"*len(gene_sequence)
            output_seq[tr_stable+"_pep"] = "-"*len(gene_sequence)

            transl_end = ""
            for exon in all_exons:
                if not exon.exon_id in tr_exon_ids: continue

                # left flank, clipped at the start of the gene
                start = exon.start_in_gene
                start_flank = exon.start_in_gene - flank_length
                if start_flank < 0:
                    start_flank = 0
                else:
                    if not start_flank-1 in global_boundaries:
                        global_boundaries.append(start_flank-1)
                    local_boundaries[tr_stable].append(start_flank)

                # right flank, clipped at the end of the gene
                end = exon.end_in_gene
                end_flank = exon.end_in_gene + flank_length
                if end_flank > len(gene_sequence):
                    end_flank = len(gene_sequence)
                else:
                    if not end_flank in global_boundaries:
                        global_boundaries.append(end_flank)
                    local_boundaries[tr_stable].append(end_flank)

                # paste the exon (upper case as in the gene) and its flanks
                # (lower case) over the gaps
                tmp_dna = output_seq[tr_stable][:start_flank] + gene_sequence[start_flank:start].lower()
                tmp_dna += gene_sequence[start:end]
                tmp_dna += gene_sequence[end:end_flank].lower() + output_seq[tr_stable][end_flank:]
                output_seq[tr_stable] = tmp_dna

                #################################################
                # now try and handle translation to protein
                prev_transl_end = transl_end

                # where does translation start:
                if exon.end_in_gene < transl_start_in_gene:
                    transl_start = -1
                elif exon.exon_id == start_exon_id:
                    # if this is the first exon, the transl start given above
                    transl_start = exon.start_in_gene+seq_start-1
                else:
                    # otherwise it is the exon start - except that if this is not the
                    # first exon and the codon is split, we want to start with the
                    # translation of the stitched up exon
                    transl_start = exon.start_in_gene
                    start_flank = exon.phase

                # where does translation end:
                if exon.start_in_gene > transl_end_in_gene:
                    transl_end = -1
                elif exon.exon_id == end_exon_id:
                    # if this is the last exon, the transl end given above
                    transl_end = exon.start_in_gene+seq_end
                else:
                    # otherwise it is the exon end, shortened by the end phase
                    transl_end = exon.end_in_gene - exon.end_phase+1
                    end_flank = exon.end_phase

                # -1 marks an exon outside of the translated region
                if transl_start < 0 or transl_end < 0 : continue

                # nucleotides carried over from the previous translated exon
                if exon.phase > 0 and prev_transl_end:
                    cary = gene_sequence[prev_transl_end:prev_transl_end+exon.phase]
                else:
                    cary = ""

                # NOTE(review): 'mitochondrial' is not assigned in this function
                [phase, pepseq] = translate (cary + gene_sequence[transl_start:transl_end], 0, mitochondrial, strip_stop = False)
                prev_transl_end = transl_end

                # one residue per three nucleotide positions
                pepseq_padded = ""
                for aa in pepseq:
                    pepseq_padded += "-"+aa+"-"

                pepseq_name = tr_stable+"_pep"
                tmp_pep = output_seq[pepseq_name][:transl_start-len(cary)]
                tmp_pep += pepseq_padded
                tmp_pep += output_seq[pepseq_name][transl_end:]
                output_seq[pepseq_name] = tmp_pep

        # insert a three character marker at each boundary: "-Z-" if the
        # boundary belongs to this transcript, "---" otherwise
        # NOTE(review): transcripts skipped in the main loop above have no
        # entry in output_seq / local_boundaries - the lookups below look
        # like they would raise KeyError for them; confirm
        global_boundaries.sort()
        for [tr_id, tr_stable] in tr_w_ccds:

            seq = output_seq[tr_stable]
            tmp_seq = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry]
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]:
                    marker = "-Z-"
                else:
                    marker = "---"
                tmp_seq += marker
                prev_bdry = bdry
            output_seq[tr_stable] = tmp_seq

            # the same for the peptide sequence
            pepseq_name = tr_stable+"_pep"
            seq = output_seq[pepseq_name]
            tmp_seq = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry]
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]: # note here
                    marker = "-Z-"
                else:
                    marker = "---"
                tmp_seq += marker
                prev_bdry = bdry
            output_seq[pepseq_name] = tmp_seq

        output_seq = strip_gaps(output_seq)

        # define the order in which we want the sequences output
        name_order = []
        for [tr_id, tr_stable] in tr_w_ccds:
            pepseq_name = tr_stable+"_pep"
            name_order.append (pepseq_name)
            name_order.append (tr_stable)

        afa_fnm = "{0}/{1}.afa".format(outdir, stable_gene_id)
        ret = output_fasta (afa_fnm, name_order, output_seq)
        print afa_fnm

    return True
def get_translated_region_talkative(cursor, gene_id, species): # get the region on the gene is_known = (species == 'homo_sapiens') ret = get_gene_region(cursor, gene_id, is_known) if ret: [gene_seq_id, gene_region_start, gene_region_end, gene_region_strand] = ret else: print "region not retrived for ", species, gene_id, species return [] canonical_transcript_id = get_canonical_transcript_id(cursor, gene_id) transcript_ids = get_transcript_ids(cursor, gene_id) print transcript_ids print "canonical: ", canonical_transcript_id transl_region_start = gene_region_end transl_region_end = gene_region_start print "transl region start:", transl_region_start print "transl region end:", transl_region_end for [transcript_id, transcript_stable] in transcript_ids: qry = "SELECT seq_start, start_exon_id, seq_end, end_exon_id " qry += " FROM translation WHERE transcript_id=%d" % transcript_id rows = search_db(cursor, qry) if (not rows): continue exon_seq_start = rows[0][0] start_exon_id = rows[0][1] exon_seq_end = rows[0][2] end_exon_id = rows[0][3] print if transcript_id == canonical_transcript_id: print "canonical: " print "transcript id: ", transcript_id print "start exon id:", start_exon_id, "transl start (in the exon) ", exon_seq_start print "end exon id:", end_exon_id, "transl end (in the exon)", exon_seq_end if (gene_region_strand > 0): start = {} start[start_exon_id] = get_exon_start(cursor, start_exon_id) start[end_exon_id] = get_exon_start(cursor, end_exon_id) this_translation_region_start = start[ start_exon_id] + exon_seq_start - 1 this_translation_region_end = start[end_exon_id] + exon_seq_end - 1 else: end = {} end[start_exon_id] = get_exon_end(cursor, start_exon_id) end[end_exon_id] = get_exon_end(cursor, end_exon_id) this_translation_region_start = end[end_exon_id] - exon_seq_end + 1 this_translation_region_end = end[ start_exon_id] - exon_seq_start + 1 if (this_translation_region_start <= transl_region_start): transl_region_start = this_translation_region_start if 
(this_translation_region_end >= transl_region_end): transl_region_end = this_translation_region_end return
def pep_seqs (cursor, gene_id, exons):
    """
    For each of the given exons that is coding, has no covering exon and
    has at least 4 nucleotides of sequence stored: translate the exon
    and store the peptide, together with the translated range, in the
    exon_seq table (protein_seq, pepseq_transl_start, pepseq_transl_end).

    The range is stored as -10, -10 if the peptide from translate()
    differs from the direct translation of dna_seq[start:end].

    Relies on a module-level 'verbose' flag. Returns nothing.
    """

    for exon in exons:

        #####################################
        # skip exons which we cannot or need not translate
        if (not exon.is_coding):
            if verbose: print exon.exon_id, "is not coding "
            continue
        if (exon.covering_exon > 0):
            if verbose: print exon.exon_id, "has covering exon"
            continue
        exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
        if (not exon_seqs):
            if verbose: print exon.exon_id, "no exon_seqs"
            continue
        [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs
        if len(dna_seq)<4:
            if verbose: print exon.exon_id, "short dna"
            continue

        #####################################
        mitochondrial = is_mitochondrial(cursor, gene_id)
        [seq_start, seq_end] = translation_bounds (cursor, exon.exon_id, verbose)
        if verbose: print " ** ", seq_start, seq_end

        dna_cropped = crop_dna (seq_start, seq_end, dna_seq)
        if verbose: print " ** ", dna_cropped

        # from here on pepseq is the fresh translation, not the stored one
        [offset, length_translated, pepseq, phase_corrected] = translate (dna_cropped, exon.phase, mitochondrial, verbose)

        if ( offset < 0):
            # translation failure; usually some short pieces (end in pos 4 and such)
            if verbose:
                print exon.exon_id, "translation failure"
                print "mitochondrial:", mitochondrial
                print seq_start, seq_end
            continue

        # a missing or zero translation start is treated as position 1
        if seq_start is None: seq_start = 1
        if seq_start == 0: seq_start = 1
        start = seq_start+offset-1
        end = start + length_translated

        # cross-check: translate the same stretch of the uncropped dna
        dnaseq = Seq (dna_seq[start:end], generic_dna)
        if (mitochondrial):
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()

        if (not pepseq == pepseq2):
            # the two translations disagree: flag the range with -10
            start = -10
            end = -10
            if verbose:
                print exon.exon_id
                print "pep from translate:", pepseq
                print "dna transl:", pepseq2
                print "start:" , start
                print "end:", end
                print

        if True:
            qry = "update exon_seq "
            qry += " set protein_seq = '%s', " % pepseq
            qry += " pepseq_transl_start = %d, " % start
            qry += " pepseq_transl_end = %d " % end
            qry += " where exon_seq_id = %d " % exon_seq_id
            rows = search_db (cursor, qry)
            if (rows):
                # the update returned something: run it again, verbosely
                rows = search_db (cursor, qry, verbose = True)
                continue
def get_translated_region_talkative(cursor, gene_id, species): # get the region on the gene is_known = (species == 'homo_sapiens') ret = get_gene_region (cursor, gene_id, is_known) if ret: [gene_seq_id,gene_region_start, gene_region_end, gene_region_strand] = ret else: print "region not retrived for ", species, gene_id, species return [] canonical_transcript_id = get_canonical_transcript_id (cursor, gene_id) transcript_ids = get_transcript_ids(cursor, gene_id) print transcript_ids print "canonical: ", canonical_transcript_id transl_region_start = gene_region_end transl_region_end = gene_region_start print "transl region start:", transl_region_start print "transl region end:", transl_region_end for[ transcript_id, transcript_stable] in transcript_ids: qry = "SELECT seq_start, start_exon_id, seq_end, end_exon_id " qry += " FROM translation WHERE transcript_id=%d" % transcript_id rows = search_db (cursor, qry) if (not rows): continue exon_seq_start = rows[0][0] start_exon_id = rows[0][1] exon_seq_end = rows[0][2] end_exon_id = rows[0][3] print if transcript_id == canonical_transcript_id: print "canonical: " print "transcript id: ", transcript_id print "start exon id:", start_exon_id, "transl start (in the exon) ", exon_seq_start print "end exon id:", end_exon_id, "transl end (in the exon)", exon_seq_end if (gene_region_strand > 0): start = {} start[start_exon_id] = get_exon_start(cursor, start_exon_id) start[end_exon_id] = get_exon_start(cursor, end_exon_id) this_translation_region_start = start[start_exon_id] + exon_seq_start - 1 this_translation_region_end = start[end_exon_id] + exon_seq_end - 1 else: end = {} end[start_exon_id] = get_exon_end (cursor, start_exon_id) end[end_exon_id] = get_exon_end (cursor, end_exon_id) this_translation_region_start = end[end_exon_id] - exon_seq_end + 1 this_translation_region_end = end[start_exon_id] - exon_seq_start + 1 if (this_translation_region_start <= transl_region_start): transl_region_start = this_translation_region_start 
if (this_translation_region_end >= transl_region_end): transl_region_end = this_translation_region_end return
def check_alt_splices (cursor, species, ensembl_db_name): print "############################" print 'checking alt splicing in ', species qry = "use " + ensembl_db_name[species] search_db(cursor, qry) gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) no_cover_and_no_seq = 0 no_cover_and_no_dna_seq = 0 no_seq_info_in_database = 0 all_ok = 0 cover_already_present = 0 cover_not_in_exon_set = 0 cov_exon_not_in_ccds = 0 genes_w_ccds = 0 tot_exons = 0 #for gene_id in gene_ids[:100]: #for gene_id in [413198]: for count in range(1000): gene_id = choice (gene_ids) #print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) transcript_ids = get_transcript_ids(cursor, gene_id) tr_w_ccds = [] for [tr_id, tr_stable] in transcript_ids: ccds = check_ccds (cursor, tr_stable) if not ccds: continue tr_w_ccds.append([tr_id, tr_stable]) if not tr_w_ccds: continue genes_w_ccds += 1 # get all exons for this gene all_exons = gene2exon_list (cursor, gene_id) exons_w_ccds = set([]) # get the unique_ids # find exons which are on the ccds list for [tr_id, tr_stable] in tr_w_ccds: exon_ids = transcript_id2exon_ids (cursor, tr_id) exons_w_ccds.update( set(exon_ids)) # for these exons check sequence is_known=1 for exon_id in exons_w_ccds: tot_exons += 1 exon = get_exon (cursor, exon_id, is_known) seq = get_exon_seqs (cursor, exon_id, is_known) if not seq: #print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) #print exon_id, " no seq", "covered: ", exon.covering_exon #exit (1) no_seq_info_in_database += 1 continue [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = seq #print exon_id, exon_seq_id, #print " %7d %7d %7d " % (exon.start_in_gene, exon.end_in_gene, exon.covering_exon), if exon.covering_exon < 0: #print protein_seq if not protein_seq: no_cover_and_no_seq += 1 elif not dna_seq: no_cover_and_no_dna_seq += 1 else: all_ok += 1 else: if exon.covering_exon_known and 
exon.covering_exon in exons_w_ccds: #print " <<<< " cover_already_present += 1 else: all_exon_ids = map(lambda exon: exon.exon_id, all_exons) if not exon.covering_exon in all_exon_ids: cover_not_in_exon_set += 1 print "cover_not_in_exon_set: " print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) print exon_id, exon_seq_id, print " %7d %7d %7d " % (exon.start_in_gene, exon.end_in_gene, exon.covering_exon) #print covering_exon print " ************" for e in all_exons: print e exit (1) elif not exon.covering_exon in exons_w_ccds: cov_exon_not_in_ccds += 1 #print "covering exon is not in ccds set " print "genes_w_ccds", genes_w_ccds print "tot_exons", tot_exons print "no_seq_info_in_database ", no_seq_info_in_database print "all_ok", all_ok print "cover_already_present ", cover_already_present print "no_cover_and_no_seq ", no_cover_and_no_seq print "no_cover_and_no_dna_seq ", no_cover_and_no_dna_seq print "cov_exon_not_in_ccds", cov_exon_not_in_ccds print "cover_not_in_exon_set ", cover_not_in_exon_set return True
def check_alt_splices(cursor, species, ensembl_db_name): print "############################" print 'checking alt splicing in ', species qry = "use " + ensembl_db_name[species] search_db(cursor, qry) gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1) no_cover_and_no_seq = 0 no_cover_and_no_dna_seq = 0 no_seq_info_in_database = 0 all_ok = 0 cover_already_present = 0 cover_not_in_exon_set = 0 cov_exon_not_in_ccds = 0 genes_w_ccds = 0 tot_exons = 0 #for gene_id in gene_ids[:100]: #for gene_id in [413198]: for count in range(1000): gene_id = choice(gene_ids) #print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) transcript_ids = get_transcript_ids(cursor, gene_id) tr_w_ccds = [] for [tr_id, tr_stable] in transcript_ids: ccds = check_ccds(cursor, tr_stable) if not ccds: continue tr_w_ccds.append([tr_id, tr_stable]) if not tr_w_ccds: continue genes_w_ccds += 1 # get all exons for this gene all_exons = gene2exon_list(cursor, gene_id) exons_w_ccds = set([]) # get the unique_ids # find exons which are on the ccds list for [tr_id, tr_stable] in tr_w_ccds: exon_ids = transcript_id2exon_ids(cursor, tr_id) exons_w_ccds.update(set(exon_ids)) # for these exons check sequence is_known = 1 for exon_id in exons_w_ccds: tot_exons += 1 exon = get_exon(cursor, exon_id, is_known) seq = get_exon_seqs(cursor, exon_id, is_known) if not seq: #print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) #print exon_id, " no seq", "covered: ", exon.covering_exon #exit (1) no_seq_info_in_database += 1 continue [ exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq ] = seq #print exon_id, exon_seq_id, #print " %7d %7d %7d " % (exon.start_in_gene, exon.end_in_gene, exon.covering_exon), if exon.covering_exon < 0: #print protein_seq if not protein_seq: no_cover_and_no_seq += 1 elif not dna_seq: no_cover_and_no_dna_seq += 1 else: all_ok += 1 else: if exon.covering_exon_known and 
exon.covering_exon in exons_w_ccds: #print " <<<< " cover_already_present += 1 else: all_exon_ids = map(lambda exon: exon.exon_id, all_exons) if not exon.covering_exon in all_exon_ids: cover_not_in_exon_set += 1 print "cover_not_in_exon_set: " print gene_id, gene2stable(cursor, gene_id), get_description( cursor, gene_id) print exon_id, exon_seq_id, print " %7d %7d %7d " % (exon.start_in_gene, exon.end_in_gene, exon.covering_exon) #print covering_exon print " ************" for e in all_exons: print e exit(1) elif not exon.covering_exon in exons_w_ccds: cov_exon_not_in_ccds += 1 #print "covering exon is not in ccds set " print "genes_w_ccds", genes_w_ccds print "tot_exons", tot_exons print "no_seq_info_in_database ", no_seq_info_in_database print "all_ok", all_ok print "cover_already_present ", cover_already_present print "no_cover_and_no_seq ", no_cover_and_no_seq print "no_cover_and_no_dna_seq ", no_cover_and_no_dna_seq print "cov_exon_not_in_ccds", cov_exon_not_in_ccds print "cover_not_in_exon_set ", cover_not_in_exon_set return True
def feed_trivial_names (cursor, all_species): tax_id = {} trivial = {} trivial['ailuropoda_melanoleuca'] = 'panda' trivial['anas_platyrhynchos'] = 'duck' trivial['anolis_carolinensis'] = 'anole_lizard' trivial['astyanax_mexicanus'] = 'blind_cavefish' trivial['bos_taurus'] = 'cow' trivial['callithrix_jacchus'] = 'marmoset' trivial['canis_familiaris'] = 'dog' trivial['cavia_porcellus'] = 'guinea_pig' trivial['choloepus_hoffmanni'] = 'sloth' trivial['danio_rerio'] = 'zebrafish' trivial['dasypus_novemcinctus'] = 'armadillo' trivial['dipodomys_ordii'] = 'kangaroo_rat' trivial['echinops_telfairi'] = 'madagascar_hedgehog' trivial['equus_caballus'] = 'horse' trivial['erinaceus_europaeus'] = 'european_hedgehog' trivial['felis_catus'] = 'cat' trivial['ficedula_albicollis'] = 'flycatcher' trivial['gadus_morhua'] = 'cod' trivial['gallus_gallus'] = 'chicken' trivial['gasterosteus_aculeatus'] = 'stickleback' trivial['gorilla_gorilla'] = 'gorilla' trivial['homo_sapiens'] = 'human' trivial['ictidomys_tridecemlineatus'] = 'squirrel' trivial['latimeria_chalumnae'] = 'coelacanth' trivial['lepisosteus_oculatus'] = 'spotted_gar' trivial['loxodonta_africana'] = 'elephant' trivial['macaca_mulatta'] = 'macaque' trivial['macropus_eugenii'] = 'wallaby' trivial['meleagris_gallopavo'] = 'turkey' trivial['microcebus_murinus'] = 'lemur' trivial['monodelphis_domestica'] = 'opossum' trivial['mus_musculus'] = 'mouse' trivial['mustela_putorius_furo'] = 'ferret' trivial['myotis_lucifugus'] = 'bat' trivial['nomascus_leucogenys'] = 'gibbon' trivial['ochotona_princeps'] = 'pika' trivial['oreochromis_niloticus'] = 'tilapia' trivial['ornithorhynchus_anatinus'] = 'platypus' trivial['oryctolagus_cuniculus'] = 'rabbit' trivial['oryzias_latipes'] = 'medaka' trivial['otolemur_garnettii'] = 'galago_lemur' trivial['ovis_aries'] = 'sheep' trivial['pan_troglodytes'] = 'chimpanzee' trivial['papio_anubis'] = 'baboon' trivial['pelodiscus_sinensis'] = 'turtle' trivial['petromyzon_marinus'] = 'lamprey' 
trivial['poecilia_formosa'] = 'amazon_molly' trivial['pongo_abelii'] = 'orangutan' trivial['procavia_capensis'] = 'hyrax' trivial['pteropus_vampyrus'] = 'flying_fox' trivial['rattus_norvegicus'] = 'rat' trivial['sarcophilus_harrisii'] = 'tasmanian_devil' trivial['sorex_araneus'] = 'european_shrew' trivial['sus_scrofa'] = 'pig' trivial['taeniopygia_guttata'] = 'zebra_finch' trivial['takifugu_rubripes'] = 'fugu' trivial['tarsius_syrichta'] = 'tarsier' trivial['tetraodon_nigroviridis'] = 'pufferfish' trivial['tupaia_belangeri'] = 'tree_shrew' trivial['tursiops_truncatus'] = 'dolphin' trivial['vicugna_pacos'] = 'alpaca' trivial['xenopus_tropicalis'] = 'xenopus' trivial['xiphophorus_maculatus'] = 'platyfish' db_name = get_compara_name (cursor) if (not db_name): print "compara db not found" exit(1) qry = "use %s " % db_name search_db (cursor, qry) for species in all_species: tax_id[species] = species2taxid (cursor, species) # switch to ncbi taxonomy database db_name = get_ncbi_tax_name (cursor) if (not db_name): print "ncbi taxonomy db not found" exit(1) qry = "use %s " % db_name search_db (cursor, qry) for species in all_species: if trivial.has_key(species): fixed_fields = {} update_fields = {} fixed_fields ['tax_id'] = tax_id[species] fixed_fields ['name_class'] = 'trivial' update_fields['name_txt'] = trivial[species] store_or_update (cursor, 'names', fixed_fields, update_fields) else: print "trivial for ", species, " not found " trivial[species] = "" return True
def feed_name_shorthands (cursor, all_species): short = {} short['ailuropoda_melanoleuca'] = 'AME' short['anas_platyrhynchos'] = 'APL' short['anolis_carolinensis'] = 'ACA' short['astyanax_mexicanus'] = 'AMX' short['bos_taurus'] = 'BTA' short['callithrix_jacchus'] = 'CJA' short['canis_familiaris'] = 'CAF' short['cavia_porcellus'] = 'CPO' short['choloepus_hoffmanni'] = 'CHO' short['danio_rerio'] = 'DAR' short['dasypus_novemcinctus'] = 'DNO' short['dipodomys_ordii'] = 'DOR' short['echinops_telfairi'] = 'ETE' short['equus_caballus'] = 'ECA' short['erinaceus_europaeus'] = 'EEU' short['felis_catus'] = 'FCA' short['ficedula_albicollis'] = 'FAL' short['gadus_morhua'] = 'GMO' short['gallus_gallus'] = 'GAL' short['gasterosteus_aculeatus'] = 'GAC' short['gorilla_gorilla'] = 'GGO' short['homo_sapiens'] = '' short['ictidomys_tridecemlineatus'] = 'STO' short['latimeria_chalumnae'] = 'LAC' short['lepisosteus_oculatus'] = 'LOC' short['loxodonta_africana'] = 'LAF' short['macaca_mulatta'] = 'MMU' short['macropus_eugenii'] = 'MEU' short['meleagris_gallopavo'] = 'MGA' short['microcebus_murinus'] = 'MIC' short['monodelphis_domestica'] = 'MOD' short['mus_musculus'] = 'MUS' short['mustela_putorius_furo'] = 'MPU' short['myotis_lucifugus'] = 'MLU' short['nomascus_leucogenys'] = 'NLE' short['ochotona_princeps'] = 'OPR' short['oreochromis_niloticus'] = 'ONI' short['ornithorhynchus_anatinus'] = 'OAN' short['oryctolagus_cuniculus'] = 'OCU' short['oryzias_latipes'] = 'ORL' short['ovis_aries'] = 'OAR' short['otolemur_garnettii'] = 'OGA' short['pan_troglodytes'] = 'PTR' short['papio_anubis'] = 'PAN' short['poecilia_formosa'] = 'PFO' short['pelodiscus_sinensis'] = 'PSI' short['petromyzon_marinus'] = 'PMA' short['pongo_abelii'] = 'PPY' short['procavia_capensis'] = 'PCA' short['pteropus_vampyrus'] = 'PVA' short['rattus_norvegicus'] = 'RNO' short['sarcophilus_harrisii'] = 'SHA' short['sorex_araneus'] = 'SAR' short['sus_scrofa'] = 'SSC' short['taeniopygia_guttata'] = 'TGU' short['takifugu_rubripes'] = 
'TRU' short['tarsius_syrichta'] = 'TSY' short['tetraodon_nigroviridis'] = 'TNI' short['tupaia_belangeri'] = 'TBE' short['tursiops_truncatus'] = 'TTR' short['vicugna_pacos'] = 'VPA' short['xenopus_tropicalis'] = 'XET' short['xiphophorus_maculatus'] = 'XMA' db_name = get_compara_name (cursor) qry = "use %s " % db_name search_db (cursor, qry) table = 'species_name_shorthands' # if the table does not exist, make it if not check_table_exists (cursor, db_name, table): qry = "CREATE TABLE " + table + " (id INT(10) PRIMARY KEY AUTO_INCREMENT)" rows = search_db (cursor, qry) if (rows): return False qry = "ALTER TABLE %s ADD %s VARCHAR(100)" % (table, 'species') rows = search_db (cursor, qry) if (rows): return False qry = "ALTER TABLE %s ADD %s VARCHAR(10)" % (table, 'shorthand') rows = search_db (cursor, qry) if (rows): return False for species in all_species: if short.has_key(species): fixed_fields = {} update_fields = {} fixed_fields ['species'] = species update_fields ['shorthand'] = short[species] store_or_update (cursor, table, fixed_fields, update_fields) else: print "short for ", species, " not found " short[species] = ""
def main(): parameter = {} # in case I ever have to handle multiple versions of ensembl # (but for now I don't have enough space) # note though that there are functions in el_utils/mysql.py that assume # that whatever ensembl stuff is available to the mysql server corresponds to the same release release_number = '76' parameter['ensembl_release_number'] = release_number parameter['blastp_e_value'] = "1.e-10" # it will be used as a string when fmting the blastp cmd parameter['min_accptbl_exon_sim'] = 0.33333 #minimum acceptable exon similarity dir_path = {} dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta' # local juggling of data from one database base to the other dir_path['afs_dumps'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['afs_dumps'] += 'ExoLocator/results/dumpster' dir_path['resources'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['resources'] += 'pypeworks/exolocator/resources' dir_path['scratch'] = '/tmp' dir_path['maxentscan'] = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/' dir_path['maxentscan'] += 'pypeworks/exolocator/pl_utils/maxentscan' util_path = {} util_path['mafft'] = '/usr/bin/mafft' util_path['blastall'] = '/usr/bin/blastall' util_path['fastacmd'] = '/usr/bin/fastacmd' util_path['sw#'] = '/usr/bin/swsharp' util_path['usearch'] = '/usr/bin/usearch' util_path['score3'] = dir_path['maxentscan'] + '/score3.pl' util_path['score5'] = dir_path['maxentscan'] + '/score5.pl' if 1: # check if the paths are functioning (at this point at least) for util in util_path.values(): if (not os.path.exists(util)): print util, " not found " sys.exit (1) for dir in dir_path.values(): if (not os.path.exists(dir)): print dir, " not found " sys.exit (1) if (not os.path.isdir (dir)): print dir, " is not a directory " sys.exit (1) db = connect_to_mysql() cursor = db.cursor() ####################################################### # check if the config db exists -- if not, 
make it db_name = "exolocator_config" qry = "show databases like'%s'" % db_name rows = search_db (cursor, qry) if (not rows): print db_name, "database not found" qry = "create database %s " % db_name rows = search_db (cursor, qry) if (rows): print "some problem creating the database ..." rows = search_db (cursor, qry, verbose = True) else: print db_name, "database found" qry = "use %s " % db_name search_db (cursor, qry) # make tables for table in ['util_path', 'dir_path', 'parameter']: if ( check_table_exists (cursor, db_name, table)): print table, " found in ", db_name else: print table, " not found in ", db_name make_table (cursor, table) # fill util, dir and path tables fixed_fields = {} update_fields = {} for [name, path] in util_path.iteritems(): fixed_fields['name'] = name update_fields['path'] = path store_or_update (cursor, 'util_path', fixed_fields, update_fields) fixed_fields = {} update_fields = {} for [name, path] in dir_path.iteritems(): fixed_fields['name'] = name update_fields['path'] = path store_or_update (cursor, 'dir_path', fixed_fields, update_fields) fixed_fields = {} update_fields = {} for [name, value] in parameter.iteritems(): fixed_fields['name'] = name update_fields['value'] = value store_or_update (cursor, 'parameter', fixed_fields, update_fields) ####################################################### # add trivial names to ncbi_taxonomy.names [all_species, ensembl_db_name] = get_species (cursor) feed_trivial_names (cursor, all_species) ####################################################### # add species shorthands (used in ENS* names formation) # though we will not needed unit the paralogue alignment reconstruction point) feed_name_shorthands (cursor, all_species) cursor.close() db.close()