def get_exon (cursor, exon_id, is_known=None, db_name=None): exon = Exon () if (db_name): if not switch_to_db(cursor, db_name): return exon if is_known==2: # sw# exon qry = "select * from sw_exon where exon_id = %d" % exon_id rows = search_db(cursor, qry, verbose=False) if (not rows): return exon exon.load_from_novel_exon (rows[0], "sw_exon") elif is_known==3: # sw# exon qry = "select * from usearch_exon where exon_id = %d" % exon_id rows = search_db(cursor, qry, verbose=False) if (not rows): return exon exon.load_from_novel_exon (rows[0], "usearch_exon") else: qry = "select * from gene2exon where exon_id = %d" % exon_id if is_known: qry += " and is_known = %s " % is_known rows = search_db(cursor, qry, verbose=False) if (not rows): return exon exon.load_from_gene2exon (rows[0]) return exon
def taxid2parentid (cursor, tax_id): qry = "select parent_tax_id from nodes where tax_id= %d " % tax_id rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def get_species (cursor): ensembl_db_name = {} all_species = [] # find the release number qry = "select value from exolocator_config.parameter where name = 'ensembl_release_number'" rows = search_db(cursor, qry) if not rows or 'error' in rows[0][0].lower(): print 'ensembl_release_number not set in exolocator_config' exit(1) release_number = rows[0][0] qry = "show databases like '%core_{0}%'".format(release_number) rows = search_db(cursor, qry) if (not rows): print "No databases with 'core_{0}' in the name found".format(release_number) exit(1) for row in rows: db_name = row[0] name_token = db_name.split ('_') species = name_token[0] i = 1 while not name_token[i] == 'core': species += "_"+ name_token[i] i += 1 ensembl_db_name[species] = db_name all_species.append(species) return all_species, ensembl_db_name
def get_alt_seq_info (cursor, gene_id, species): # seq identifier from gene table qry = "select seq_region_id, seq_region_start, seq_region_end, seq_region_strand from gene where gene_id = %d" % gene_id rows = search_db (cursor, qry) if not rows: return [] [seq_region_id, orig_seq_region_start, orig_seq_region_end, seq_region_strand] = rows[0] # check whether we have "assembly exception" # we do not want 'PAR' regions, though: ''' The pseudo-autosomal regions are homologous DNA sequences on the (human) X and Y chromosomes. They allow the pairing and crossing-over of these sex chromosomes the same way the autosomal chromosomes do during meiosis. As these genomic regions are identical between X and Y, they are oftentimes only stored once. The exception types we are interested in are PATCH_FIX and PATCH_NOVEL ''' qry = "select seq_region_start, seq_region_end, exc_seq_region_id, exc_seq_region_start, exc_seq_region_end " qry += "from assembly_exception where seq_region_id = %d " % seq_region_id qry += "and assembly_exception.exc_type like 'PATCH_%'" rows = search_db (cursor, qry) if not rows: return [] [seq_region_start, seq_region_end, exc_seq_region_id, exc_seq_region_start, exc_seq_region_end] = rows[0] qry = "select name, file_name from seq_region where seq_region_id= %d" % exc_seq_region_id rows = search_db (cursor, qry) if not rows: return [] [seq_name, file_names] = rows[0] mitochondrial = is_mitochondrial (cursor, gene_id) return [seq_name, file_names, seq_region_start, seq_region_end, seq_region_strand, mitochondrial]
def get_exon_pepseq(cursor, exon, db_name=None, verbose=False): if db_name: if not switch_to_db(cursor, db_name): return False if exon.exon_seq_id: exon_seq_id = exon.exon_seq_id qry = "select protein_seq " qry += " from exon_seq where exon_seq_id = %d" % exon_seq_id else: exon_id = exon.exon_id is_known = exon.is_known qry = "select protein_seq " qry += " from exon_seq where exon_id = %d and is_known = %d" % (exon_id, is_known) rows = search_db(cursor, qry) if not rows: if verbose: rows = search_db(cursor, qry, verbose=True) print rows return "" protein_seq = rows[0][0] if protein_seq is None: protein_seq = "" return protein_seq
def get_exon_pepseq (cursor, exon, db_name=None, verbose=False): if (db_name): if not switch_to_db(cursor, db_name): return False if exon.exon_seq_id: exon_seq_id = exon.exon_seq_id qry = "select protein_seq " qry += " from exon_seq where exon_seq_id = %d" % exon_seq_id else: exon_id = exon.exon_id is_known = exon.is_known qry = "select protein_seq " qry += " from exon_seq where exon_id = %d and is_known = %d" % (exon_id, is_known) rows = search_db(cursor, qry) if (not rows): if verbose: rows = search_db(cursor, qry, verbose = True) print rows return "" protein_seq = rows[0][0] if (protein_seq is None): protein_seq = "" return protein_seq
def is_reference(cursor, gene_id, non_ref_id, db_name=None): if db_name: qry = "use %s " % db_name rows = search_db(cursor, qry) if rows: rows = search_db(cursor, qry, verbose=True) print rows exit(1) qry = "select seq_region_id from gene where gene_id=%d" % int(gene_id) rows = search_db(cursor, qry) if not rows: return True seq_region_id = rows[0] qry = "select attrib_type_id from seq_region_attrib where seq_region_id=%s" % seq_region_id rows = search_db(cursor, qry) if not rows: return True for row in rows: attrib_type_id = int(row[0]) if attrib_type_id == non_ref_id: return False return True
def get_species(cursor): ensembl_db_name = {} all_species = [] # find the release number qry = "select value from exolocator_config.parameter where name = 'ensembl_release_number'" rows = search_db(cursor, qry) if not rows or "error" in rows[0][0].lower(): print "ensembl_release_number not set in exolocator_config" exit(1) release_number = rows[0][0] qry = "show databases like '%core_{0}%'".format(release_number) rows = search_db(cursor, qry) if not rows: print "No databases with 'core_{0}' in the name found".format(release_number) exit(1) for row in rows: db_name = row[0] name_token = db_name.split("_") species = name_token[0] i = 1 while not name_token[i] == "core": species += "_" + name_token[i] i += 1 ensembl_db_name[species] = db_name all_species.append(species) return all_species, ensembl_db_name
def gene2exon_list(cursor, gene_id, db_name=None, verbose=False): exons = [] if db_name: if not switch_to_db(cursor, db_name): return False qry = "select * from gene2exon where gene_id = %d " % gene_id rows = search_db(cursor, qry) if not rows: rows = search_db(cursor, "select database()") if verbose: print "database ", rows[0][0] rows = search_db(cursor, qry, verbose=True) print rows return [] for row in rows: exon = Exon() if not exon.load_from_gene2exon(row): continue exons.append(exon) return exons
def get_exon(cursor, exon_id, is_known=None, db_name=None): exon = Exon() if db_name: if not switch_to_db(cursor, db_name): return exon if is_known == 2: # sw# exon qry = "select * from sw_exon where exon_id = %d" % exon_id rows = search_db(cursor, qry, verbose=False) if not rows: return exon exon.load_from_novel_exon(rows[0], "sw_exon") elif is_known == 3: # sw# exon qry = "select * from usearch_exon where exon_id = %d" % exon_id rows = search_db(cursor, qry, verbose=False) if not rows: return exon exon.load_from_novel_exon(rows[0], "usearch_exon") else: qry = "select * from gene2exon where exon_id = %d" % exon_id if is_known: qry += " and is_known = %s " % is_known rows = search_db(cursor, qry, verbose=False) if not rows: return exon exon.load_from_gene2exon(rows[0]) return exon
def get_ncbi_tax_name(cursor): qry = "show databases like 'ncbi%tax%'" rows = search_db(cursor, qry) if (not rows): rows = search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def get_ncbi_tax_name (cursor): qry = "show databases like 'ncbi%tax%'" rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def gene2exon_list (cursor, gene_id, db_name=None, verbose=False): exons = [] if (db_name): if not switch_to_db(cursor, db_name): return False qry = "select * from gene2exon where gene_id = %d " % gene_id rows = search_db(cursor, qry) if (not rows): rows = search_db(cursor, 'select database()') if verbose: print "database ", rows[0][0] rows = search_db(cursor, qry, verbose = True) print rows return [] for row in rows: exon = Exon() if (not exon.load_from_gene2exon(row)): continue exons.append(exon) return exons
def get_canonical_coordinates(cursor, canonical_transcript_id): qry = "select seq_start, start_exon_id, seq_end, end_exon_id " qry += " from translation where transcript_id = %d " % canonical_transcript_id rows = search_db(cursor, qry) if not rows: search_db(cursor, qry, verbose=True) return [] return rows[0]
def get_canonical_coordinates (cursor, canonical_transcript_id): qry = "select seq_start, start_exon_id, seq_end, end_exon_id " qry += " from translation where transcript_id = %d " % canonical_transcript_id rows = search_db (cursor, qry) if ( not rows): search_db (cursor, qry, verbose = True) return [] return rows[0]
def get_orthologues_from_species(cursor, ensembl_db_name, ortho_type, gene_member_id, species): # the ortho_type is one of the following: 'ortholog_one2one', # 'ortholog_one2many', 'ortholog_many2many', 'possible_ortholog', 'apparent_ortholog_one2one' orthos = [] # find genome db_id genome_db_id = species2genome_db_id (cursor, species) # make the cursor point to compara database - should be the responsibility of each function switch_to_db (cursor, get_compara_name (cursor)) qry = "select homology.homology_id from homology_member, homology " qry += " where homology_member.gene_member_id =%d " % gene_member_id qry += " and homology.homology_id = homology_member.homology_id " qry += " and homology.description = '%s' " % ortho_type rows = search_db (cursor, qry) if (not rows): return [] # no orthologs here # for each homology id find the other member id #print qry #print member_id, ortho_type, species, genome_db_id #print rows for row in rows: homology_id = row[0] #print "\t homology id:", homology_id switch_to_db (cursor, get_compara_name (cursor)) qry = "select gene_member_id from homology_member " qry += " where homology_id = %d" % int(homology_id) qry += " and not gene_member_id = %d" % gene_member_id rows2 = search_db (cursor, qry, verbose = False) if (not rows2): #print "\t ", #rows2 = search_db (cursor, qry, verbose = True) continue for row2 in rows2: ortho_id = row2[0] #print "\t\t ortho id:", ortho_id qry = "select stable_id from gene_member " qry += " where gene_member_id = %d " % ortho_id qry += " and genome_db_id = %d " % genome_db_id rows3 = search_db (cursor, qry, verbose = False) if (not rows3): #print "\t\t ", #rows3 = search_db (cursor, qry, verbose = True) continue ortho_stable = rows3[0][0] #print "\t\t ortho stable:", ortho_stable orthos.append(ortho_stable) if orthos: switch_to_db (cursor, ensembl_db_name [species]) orthos = map (lambda gene_id: stable2gene(cursor, gene_id), orthos) #print 'orthos:', orthos return orthos
def taxid2name(cursor, tax_id): switch_to_db(cursor, get_ncbi_tax_name(cursor)) qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'scientific name'" rows = search_db(cursor, qry) if (not rows): rows = search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def taxid2trivial(cursor, tax_id): switch_to_db(cursor, get_ncbi_tax_name(cursor)) qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'trivial'" rows = search_db(cursor, qry) if (not rows or 'ERROR' in rows[0]): rows = search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def get_orthologues_from_species(cursor, ensembl_db_name, ortho_type, gene_member_id, species): # the ortho_type is one of the following: 'ortholog_one2one', # 'ortholog_one2many', 'ortholog_many2many', 'possible_ortholog', 'apparent_ortholog_one2one' orthos = [] # find genome db_id genome_db_id = species2genome_db_id(cursor, species) # make the cursor point to compara database - should be the responsibility of each function switch_to_db(cursor, get_compara_name(cursor)) qry = "select homology.homology_id from homology_member, homology " qry += " where homology_member.gene_member_id =%d " % gene_member_id qry += " and homology.homology_id = homology_member.homology_id " qry += " and homology.description = '%s' " % ortho_type rows = search_db(cursor, qry) if not rows: return [] # no orthologs here # for each homology id find the other member id # print qry # print member_id, ortho_type, species, genome_db_id # print rows for row in rows: homology_id = row[0] # print "\t homology id:", homology_id switch_to_db(cursor, get_compara_name(cursor)) qry = "select gene_member_id from homology_member " qry += " where homology_id = %d" % int(homology_id) qry += " and not gene_member_id = %d" % gene_member_id rows2 = search_db(cursor, qry, verbose=False) if not rows2: # print "\t ", # rows2 = search_db (cursor, qry, verbose = True) continue for row2 in rows2: ortho_id = row2[0] # print "\t\t ortho id:", ortho_id qry = "select stable_id from gene_member " qry += " where gene_member_id = %d " % ortho_id qry += " and genome_db_id = %d " % genome_db_id rows3 = search_db(cursor, qry, verbose=False) if not rows3: # print "\t\t ", # rows3 = search_db (cursor, qry, verbose = True) continue ortho_stable = rows3[0][0] # print "\t\t ortho stable:", ortho_stable orthos.append(ortho_stable) if orthos: switch_to_db(cursor, ensembl_db_name[species]) orthos = map(lambda gene_id: stable2gene(cursor, gene_id), orthos) # print 'orthos:', orthos return orthos
def trivial2taxid (cursor, trivial_name): switch_to_db (cursor, get_ncbi_tax_name (cursor)) qry = "select tax_id from names where name_txt= '%s' " % trivial_name qry += " and name_class = 'trivial'"; rows = search_db (cursor, qry) if (not rows or 'ERROR' in rows[0]): rows = search_db (cursor, qry, verbose = True) return "" return int(rows[0][0])
def trivial2taxid(cursor, trivial_name): switch_to_db(cursor, get_ncbi_tax_name(cursor)) qry = "select tax_id from names where name_txt= '%s' " % trivial_name qry += " and name_class = 'trivial'" rows = search_db(cursor, qry) if (not rows or 'ERROR' in rows[0]): rows = search_db(cursor, qry, verbose=True) return "" return int(rows[0][0])
def taxid2trivial (cursor, tax_id): switch_to_db (cursor, get_ncbi_tax_name (cursor)) qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'trivial'"; rows = search_db (cursor, qry) if (not rows or 'ERROR' in rows[0]): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def taxid2name (cursor, tax_id): switch_to_db (cursor, get_ncbi_tax_name (cursor)) qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'scientific name'"; rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def get_gene_ids(cursor, db_name=None, biotype=None, is_known=None, ref_only=False): gene_ids = [] if db_name: qry = "use %s " % db_name rows = search_db(cursor, qry) if rows: rows = search_db(cursor, qry, verbose=True) print rows exit(1) qry = "select gene_id from gene" if biotype or not is_known is None: qry += " where " if biotype: qry += "biotype='%s'" % biotype if biotype and not is_known is None: qry += " and " if not is_known is None: if is_known: qry += "status='known'" else: qry += "status!='known'" rows = search_db(cursor, qry, verbose=False) if not rows: rows = search_db(cursor, qry, verbose=True) return [] else: if "Error" in rows[0]: print rows[0] return [] # I don't want to hard code the id for the annotation "non_ref" # and I do no know where else to do it, so qe do it here if ref_only: qry = "select attrib_type_id from attrib_type where code='non_ref'" rows2 = search_db(cursor, qry) if not rows2 or not type(rows2[0][0]) is int: ref_only = False else: non_ref_id = int(rows2[0][0]) for row in rows: if not type(row[0]) is long: print row exit(1) gene_id = int(row[0]) if ref_only and not is_reference(cursor, gene_id, non_ref_id): continue gene_ids.append(gene_id) return gene_ids
def get_gene_ids (cursor, db_name=None, biotype = None, is_known = None, ref_only = False): gene_ids = [] if (db_name): qry = "use %s " % db_name rows = search_db (cursor, qry) if (rows): rows = search_db (cursor, qry, verbose = True) print rows exit (1) qry = "select gene_id from gene" if ( biotype or not is_known is None): qry += " where " if ( biotype): qry += "biotype='%s'" % biotype if (biotype and not is_known is None): qry += " and " if (not is_known is None): if is_known: qry += "status='known'" else: qry += "status!='known'" rows = search_db (cursor, qry, verbose = False) if (not rows): rows = search_db (cursor, qry, verbose = True) return [] else: if ('Error' in rows[0]): print rows[0] return [] # I don't want to hard code the id for the annotation "non_ref" # and I do no know where else to do it, so qe do it here if ref_only: qry = "select attrib_type_id from attrib_type where code='non_ref'" rows2 = search_db (cursor, qry) if not rows2 or not type(rows2[0][0]) is int: ref_only = False else: non_ref_id = int (rows2[0][0]) for row in rows: if ( not type(row[0]) is long ): print row exit(1) gene_id = int(row[0]) if ref_only and not is_reference(cursor, gene_id, non_ref_id): continue gene_ids.append(gene_id) return gene_ids
def get_species_shorthand(cursor, species): db_name = get_compara_name (cursor) qry = "use %s " % db_name search_db (cursor, qry) qry = "select shorthand from species_name_shorthands where species='%s'" % species rows = search_db (cursor, qry) if not rows: return "" return rows[0][0]
def member2stable(cursor, member_id): # member_id refers to compara db # of which we need to have one qry = "select stable_id from gene_member where gene_member_id = %d" % member_id rows = search_db(cursor, qry) if not rows: rows = search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def member2stable (cursor, member_id): # member_id refers to compara db # of which we need to have one qry = "select stable_id from gene_member where gene_member_id = %d" % member_id rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def species2genome_db_id(cursor, species): switch_to_db(cursor, get_compara_name(cursor)) qry = "select genome_db_id from genome_db where name = '%s'" % species rows = search_db(cursor, qry) if not rows: search_db(cursor, qry, verbose=True) return 0 return int(rows[0][0])
def genome_db_id2species(cursor, genome_db_id): switch_to_db(cursor, get_compara_name(cursor)) qry = "select name from genome_db where genome_db_id = %d" % int(genome_db_id) rows = search_db(cursor, qry) if not rows: search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def taxid2parentid (cursor, tax_id): switch_to_db (cursor, get_ncbi_tax_name (cursor)) qry = "select parent_tax_id from nodes where tax_id= %d " % int(tax_id) rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" try: retval = int(rows[0][0]) except: retval = "" return retval
def taxid2parentid(cursor, tax_id): switch_to_db(cursor, get_ncbi_tax_name(cursor)) qry = "select parent_tax_id from nodes where tax_id= %d " % int(tax_id) rows = search_db(cursor, qry) if (not rows): rows = search_db(cursor, qry, verbose=True) return "" try: retval = int(rows[0][0]) except: retval = "" return retval
def species2genome_db_id (cursor, species): switch_to_db (cursor, get_compara_name (cursor)) qry = "select genome_db_id from genome_db where name = '%s'" % species rows = search_db (cursor, qry) if (not rows): search_db (cursor, qry, verbose = True) return 0 return int(rows[0][0])
def genome_db_id2species (cursor, genome_db_id): switch_to_db (cursor, get_compara_name (cursor)) qry = "select name from genome_db where genome_db_id = %d" % int(genome_db_id) rows = search_db (cursor, qry) if (not rows): search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def stable2member (cursor, stable_id): # member_id refers to compara db # of which we need to have one # qry = "select member_id from member where stable_id = '%s'" % stable_id # since version 76 the table is called gene_member, and the main id is gene_member_id qry = "select gene_member_id from gene_member where stable_id = '%s'" % stable_id rows = search_db (cursor, qry) if (not rows or 'ERROR' in rows[0]): rows = search_db (cursor, qry, verbose = True) exit(1) return "" return int(rows[0][0])
def species2taxid (cursor, species): switch_to_db (cursor, get_compara_name (cursor)) qry = "select taxon_id from genome_db where name = '%s'" % species rows = search_db (cursor, qry) if (not rows): search_db (cursor, qry, verbose = True) return "" try: retval = int(rows[0][0]) except: retval = "" return retval
def species2taxid(cursor, species): switch_to_db(cursor, get_compara_name(cursor)) qry = "select taxon_id from genome_db where name = '%s'" % species rows = search_db(cursor, qry) if not rows: search_db(cursor, qry, verbose=True) return "" try: retval = int(rows[0][0]) except: retval = "" return retval
def stable2member(cursor, stable_id): # member_id refers to compara db # of which we need to have one # qry = "select member_id from member where stable_id = '%s'" % stable_id # since version 76 the table is called gene_member, and the main id is gene_member_id qry = "select gene_member_id from gene_member where stable_id = '%s'" % stable_id rows = search_db(cursor, qry) if not rows or "ERROR" in rows[0]: rows = search_db(cursor, qry, verbose=True) exit(1) return "" return int(rows[0][0])
def gene2canon_transl(cursor, gene_id, db_name=None,): if (db_name and not switch_to_db(cursor, db_name)): return False qry = "select translation.translation_id from translation, gene " qry += " where gene.canonical_transcript_id = translation.transcript_id " qry += " and gene.gene_id = %d " % gene_id rows = search_db (cursor, qry, verbose = False) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def gene2stable_canon_transl(cursor, gene_id, db_name=None): if db_name and not switch_to_db(cursor, db_name): return False qry = "select translation.stable_id from translation, gene " qry += " where gene.canonical_transcript_id = translation.transcript_id " qry += " and gene.gene_id = %d " % gene_id rows = search_db(cursor, qry, verbose=False) if not rows: rows = search_db(cursor, qry, verbose=True) return "" return rows[0][0]
def get_exon_seq_by_db_id(cursor, exon_seq_id, db_name=None): if db_name: if not switch_to_db(cursor, db_name): return False qry = "select exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, " qry += " left_flank, right_flank, dna_seq " qry += " from exon_seq where exon_seq_id = %d" % exon_seq_id rows = search_db(cursor, qry) if not rows: # rows = search_db(cursor, qry, verbose = True) return [] [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = rows[0] if protein_seq is None: protein_seq = "" if left_flank is None: left_flank = "" if right_flank is None: right_flank = "" if dna_seq is None: dna_seq = "" return [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq]
def stable2gene (cursor, stable_id=None, db_name=None): if (not stable_id): return 0 if (db_name and not switch_to_db(cursor, db_name)): return False qry = "select gene_id from gene where stable_id='%s'" % stable_id rows = search_db (cursor, qry, verbose = False) if (not rows): rows = search_db (cursor, qry, verbose = True) return 0 return int(rows[0][0])
def check_ccds(cursor, transcript_stable_id="", transcript_id=""): ccds = "" qry = "select dna_align_feature.hit_name " qry += "from dna_align_feature, transcript, transcript_supporting_feature " qry += " where dna_align_feature.dna_align_feature_id = transcript_supporting_feature.feature_id " qry += " and transcript_supporting_feature.feature_type ='dna_align_feature' " qry += " and transcript_supporting_feature.transcript_id =transcript.transcript_id " if transcript_stable_id: qry += " and transcript.stable_id = '%s' " % transcript_stable_id elif transcript_id: qry += " and transcript.transcript_id = '%s' " % transcript_id else: return ccds rows = search_db(cursor, qry) if not rows: return ccds for row in rows: if "CCDS" in row[0]: ccds = row[0] return ccds
def get_predicted_exons(cursor, gene_id, species): exons = [] # get the region on the gene ret = get_gene_region(cursor, gene_id) if ret: [gene_seq_id, gene_region_start, gene_region_end, gene_region_strand] = ret else: print "region not retrived for ", species, gene_id return [] qry = "SELECT * FROM prediction_exon WHERE seq_region_id = %d " % gene_seq_id qry += " AND seq_region_start >= %d AND seq_region_start <= %d " % (gene_region_start, gene_region_end) qry += " AND seq_region_end >= %d AND seq_region_end <= %d " % (gene_region_start, gene_region_end) rows = search_db(cursor, qry) if not rows: return [] for row in rows: exon = Exon() exon.gene_id = gene_id exon.load_from_ensembl_prediction(gene_region_start, gene_region_end, row) exons.append(exon) return exons
def get_exon_seq_by_db_id (cursor, exon_seq_id, db_name=None): if (db_name): if not switch_to_db(cursor, db_name): return False qry = "select exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, " qry += " left_flank, right_flank, dna_seq " qry += " from exon_seq where exon_seq_id = %d" % exon_seq_id rows = search_db(cursor, qry) if (not rows): #rows = search_db(cursor, qry, verbose = True) return [] [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = rows[0] if (protein_seq is None): protein_seq = "" if (left_flank is None): left_flank = "" if (right_flank is None): right_flank = "" if (dna_seq is None): dna_seq = "" return [exon_seq_id, protein_seq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq]
def find_mammals(cursor, trivial_name_list): mammals = [] for trivial_name in trivial_name_list: switch_to_db(cursor, get_compara_name(cursor)) tax_id = trivial2taxid(cursor, trivial_name) parent_id = taxid2parentid(cursor, tax_id) tax_id = parent_id is_mammal = False while tax_id: qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'scientific name'" rows = search_db(cursor, qry) if rows and rows[0][0]: if 'mammal' in rows[0][0].lower(): is_mammal = True break elif 'vertebrat' in rows[0][0].lower(): # if the thing wasa mammal, we would have found it by now is_mammal = False break parent_id = taxid2parentid(cursor, tax_id) if parent_id and parent_id > 1: tax_id = parent_id else: tax_id = None if is_mammal: mammals.append(trivial_name) return mammals
def check_ccds (cursor, transcript_stable_id = "", transcript_id = ""): ccds = "" qry = "select dna_align_feature.hit_name " qry += "from dna_align_feature, transcript, transcript_supporting_feature " qry += " where dna_align_feature.dna_align_feature_id = transcript_supporting_feature.feature_id " qry += " and transcript_supporting_feature.feature_type ='dna_align_feature' " qry += " and transcript_supporting_feature.transcript_id =transcript.transcript_id " if (transcript_stable_id): qry += " and transcript.stable_id = '%s' " % transcript_stable_id elif (transcript_id): qry += " and transcript.transcript_id = '%s' " % transcript_id else: return ccds rows = search_db(cursor, qry) if not rows: return ccds for row in rows: if 'CCDS' in row[0]: ccds = row[0] return ccds
def stable2gene(cursor, stable_id=None, db_name=None): if not stable_id: return 0 if db_name and not switch_to_db(cursor, db_name): return False qry = "select gene_id from gene where stable_id='%s'" % stable_id rows = search_db(cursor, qry, verbose=False) if not rows: rows = search_db(cursor, qry, verbose=True) return 0 return int(rows[0][0])
def get_common_name(cursor, species): switch_to_db(cursor, get_compara_name(cursor)) tax_id = species2taxid(cursor, species) switch_to_db(cursor, get_ncbi_tax_name(cursor)) qry = "select name_txt from names where " qry += "tax_id = %d and " % tax_id qry += "name_class = 'genbank common name'" rows = search_db(cursor, qry) if rows: if ('ERROR' in rows[0]): search_db(cursor, qry, verbose=True) return "" else: return rows[0][0] return ""
def get_common_name (cursor, species): switch_to_db(cursor, get_compara_name(cursor)) tax_id = species2taxid (cursor, species) switch_to_db(cursor,get_ncbi_tax_name (cursor)) qry = "select name_txt from names where " qry += "tax_id = %d and " % tax_id qry += "name_class = 'genbank common name'" rows = search_db (cursor, qry) if rows: if ('ERROR' in rows[0]): search_db (cursor, qry, verbose = True) return "" else: return rows[0][0] return ""
def get_gene_coordinates(cursor, gene_id, db_name=None): if db_name: if not switch_to_db(cursor, db_name): return None qry = "select seq_region_id, seq_region_start, seq_region_end, seq_region_strand " qry += " from gene " qry += " where gene_id = %d" % gene_id rows = search_db(cursor, qry) if not rows or isinstance(rows[0], str) and "error" in rows[0].lower(): search_db(cursor, qry, verbose=True) return None return rows[0]
def get_predicted_exons (cursor, gene_id, species): exons = [] # get the region on the gene ret = get_gene_region (cursor, gene_id) if ret: [gene_seq_id, gene_region_start, gene_region_end, gene_region_strand] = ret else: print "region not retrived for ", species, gene_id return [] qry = "SELECT * FROM prediction_exon WHERE seq_region_id = %d " % gene_seq_id qry += " AND seq_region_start >= %d AND seq_region_start <= %d " % \ (gene_region_start, gene_region_end) qry += " AND seq_region_end >= %d AND seq_region_end <= %d " % \ (gene_region_start, gene_region_end) rows = search_db (cursor, qry) if (not rows): return [] for row in rows: exon = Exon() exon.gene_id = gene_id exon.load_from_ensembl_prediction (gene_region_start, gene_region_end, row) exons.append(exon) return exons
def find_mammals(cursor, trivial_name_list): mammals = [] for trivial_name in trivial_name_list: switch_to_db(cursor, get_compara_name(cursor)) tax_id = trivial2taxid (cursor, trivial_name) parent_id = taxid2parentid (cursor, tax_id) tax_id = parent_id is_mammal = False while tax_id: qry = "select name_txt from names where tax_id= %d " % int(tax_id) qry += " and name_class = 'scientific name'"; rows = search_db (cursor, qry) if rows and rows[0][0]: if 'mammal' in rows[0][0].lower(): is_mammal = True break elif 'vertebrat' in rows[0][0].lower(): # if the thing wasa mammal, we would have found it by now is_mammal = False break parent_id = taxid2parentid (cursor, tax_id) if parent_id and parent_id>1: tax_id = parent_id else: tax_id = None if is_mammal: mammals.append(trivial_name) return mammals
def get_gene_coordinates (cursor, gene_id, db_name=None): if (db_name): if not switch_to_db(cursor, db_name): return None qry = "select seq_region_id, seq_region_start, seq_region_end, seq_region_strand " qry += " from gene " qry += " where gene_id = %d" % gene_id rows = search_db (cursor, qry) if ( not rows or isinstance(rows[0], str) and 'error' in rows[0].lower()): search_db (cursor, qry, verbose = True) return None return rows[0]
def get_canonical_transcript_id (cursor, gene_id, db_name=None): if db_name and not switch_to_db(cursor, db_name): return [] qry = "select canonical_transcript_id" qry += " from gene where gene_id=%d" % gene_id rows = search_db (cursor, qry, verbose=False) if (not rows): rows = search_db (cursor, qry, verbose=True) return "" elif ( 'Error' in rows[0]): print rows[0] return "" return rows[0][0]
def get_gene_region (cursor, gene_id, is_known=None): qry = "select seq_region_id, seq_region_start, seq_region_end, " qry += " seq_region_strand " qry += " from gene where gene_id=%d" % gene_id if (not is_known is None and is_known): qry += " and status='known' " rows = search_db (cursor, qry, verbose=False) if (not rows): rows = search_db (cursor, qry, verbose=True) return [] elif ( 'Error' in rows[0]): print rows[0] return [] return rows[0]
def get_compara_name (cursor): # find the release number qry = "select value from exolocator_config.parameter where name = 'ensembl_release_number'" rows = search_db(cursor, qry) if not rows or 'error' in rows[0][0].lower(): print 'ensembl_release_number not set in exolocator_config' exit(1) release_number = rows[0][0] qry = "show databases like '%compara_{0}%'".format(release_number) rows = search_db (cursor, qry) if (not rows): rows = search_db (cursor, qry, verbose = True) return "" return rows[0][0]
def get_gene_region(cursor, gene_id, is_known=None): qry = "select seq_region_id, seq_region_start, seq_region_end, " qry += " seq_region_strand " qry += " from gene where gene_id=%d" % gene_id if not is_known is None and is_known: qry += " and status='known' " rows = search_db(cursor, qry, verbose=False) if not rows: rows = search_db(cursor, qry, verbose=True) return [] elif "Error" in rows[0]: print rows[0] return [] return rows[0]