def create_SQ_header(database_v,species):
    '''
    Build the "@SQ" header lines for the sequence regions of a species.

    :param database_v: database version; used as the ENSEMBL release for every
        species except arabidopsis_thaliana, which is pinned to release 30
    :param species: species name with underscores (e.g. "homo_sapiens")
    :return: list of "@SQ" strings (name, length, coord system version, species),
        one per seq_region row whose coord system has rank 1 and whose name
        contains no underscore
    '''
    common_name = Species.getCommonName(species.replace('_', ' '))

    # create connection to ensembl database
    if species == "arabidopsis_thaliana":
        # arabidopsis is queried on a dedicated host/port with a fixed release
        plants_account = host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",
                                          user="******", passwd="", port=4157)
        genome = Genome(Species=common_name, account=plants_account, Release=30)
    else:
        genome = Genome(Species=common_name, Release=database_v, account=None)

    coord_system = genome.CoreDb.getTable('coord_system')
    seq_region = genome.CoreDb.getTable('seq_region')

    # name / length come from seq_region, the assembly version from coord_system
    wanted_columns = [
        seq_region.c.name,
        seq_region.c.length,
        coord_system.c.version,
    ]
    joined_tables = seq_region.join(
        coord_system,
        coord_system.c.coord_system_id == seq_region.c.coord_system_id)
    query = sql.select(wanted_columns,
                       from_obj=[joined_tables],
                       whereclause=coord_system.c.rank == 1)

    # regions with an underscore in their name are left out of the header
    return [
        "".join(("@SQ\tSN:chr", str(row[0]),
                 "\tLN:", str(row[1]),
                 "\tAS:", str(row[2]),
                 "\tSP:", str(species)))
        for row in query.execute()
        if '_' not in row[0]
    ]
def prepareAnnotationENSEMBL(psm_protein_id,mode,database_v,species,three_frame_translation):
    '''
    Look up the given stable IDs in ENSEMBL and hand the matches to
    ensembl_construct_sequences.

    :param psm_protein_id: list of protein IDs (untagged); matched against
        translation stable IDs in 'protein' mode and against transcript stable
        IDs in 'transcript' mode
    :param mode: 'protein' or 'transcript'
    :param database_v: database version (not used for arabidopsis_thaliana,
        which is pinned to release 30)
    :param species: species name with underscores
    :param three_frame_translation: forwarded unchanged to
        ensembl_construct_sequences
    :return: whatever ensembl_construct_sequences returns for the dictionary
        mapping each matched ID to its ENSEMBL annotation
    :raises ValueError: if mode is neither 'protein' nor 'transcript'
    '''
    # Validate the mode before touching the database: previously an unknown
    # mode only failed later with an UnboundLocalError on `query`.
    # key_column is the index, within each result row, of the stable ID used
    # as the dictionary key (0 = transcript stable ID, 1 = translation stable ID).
    if mode == 'protein':
        key_column = 1
    elif mode == 'transcript':
        key_column = 0
    else:
        raise ValueError("mode must be 'protein' or 'transcript', got %r" % (mode,))

    print('Commencing ENSEMBL data retrieval')

    # create connection to ensembl database
    Genome_species = Species.getCommonName(species.replace('_', ' '))
    if species == "arabidopsis_thaliana":
        account = host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",
                                   user="******", passwd="", port=4157)
        ensembl = Genome(Species=Genome_species, account=account, Release=30)
    else:
        ensembl = Genome(Species=Genome_species, Release=database_v, account=None)

    # convert IDs
    translation_table = ensembl.CoreDb.getTable('translation')
    transcript_table = ensembl.CoreDb.getTable('transcript')
    select_obj = [
        transcript_table.c.stable_id,
        translation_table.c.stable_id,
        transcript_table.c.transcript_id,
        translation_table.c.seq_start,
        translation_table.c.start_exon_id,
    ]
    from_obj = translation_table.join(
        transcript_table,
        transcript_table.c.transcript_id == translation_table.c.transcript_id)

    # filter on the stable ID column that corresponds to the requested mode
    if mode == 'protein':
        id_column = translation_table.c.stable_id
    else:
        id_column = transcript_table.c.stable_id
    query = sql.select(select_obj, from_obj=[from_obj],
                       whereclause=id_column.in_(psm_protein_id))

    # Collect the results in fresh locals instead of overwriting the
    # psm_protein_id argument.
    annotation = {}
    transcript_ids = []
    for row in query.execute():
        transcript_ids.append(row[2])
        annotation[row[key_column]] = {
            'transcript_id': row[0],
            'translation_id': row[1],
            'transcript_seq': '',
            'protein_seq': '',
            'chr': '',
            'strand': '',
            '5UTR_offset': row[3],
            'start_exon_rank': row[4],
        }

    return ensembl_construct_sequences(annotation, ensembl, transcript_ids,
                                       database_v, species,
                                       three_frame_translation, mode)
def setupGenome( species, db_host=None, db_user=None, db_pass=None, db_release=None ):
    """
    Notes:
        Set up the ensembl genome object using pycogent; this object has the
        methods described here:
        http://pycogent.org/examples/query_ensembl.html

    Args:
        species    = 'Anopheles gambiae'  #string: underscores are replaced by
                                          #        spaces; the result is
                                          #        registered with pycogent as
                                          #        both latin and common name
        db_host    = 'localhost'          #string: mysql host; when empty/None
                                          #        no explicit account is built
        db_user    = '******'             #string: mysql user for db_host
        db_pass    = '******'             #string: mysql password for db_host
        db_release = 73                   #integer: release version, defaults
                                          #         to 73 when empty/None

    Returns:
        the cogent.db.ensembl Genome object for the species
    """
    from cogent.db.ensembl import HostAccount, Genome, Species

    if not db_release:
        db_release = 73

    # Only build an explicit account when a host was supplied. Previously the
    # account=None default was immediately overwritten by
    # HostAccount(None, None, None); account=None is handed to Genome as-is.
    if db_host:
        account = HostAccount(db_host, db_user, db_pass)
    else:
        account = None

    species_latin = species.replace('_',' ')
    Species.amendSpecies(species_latin, species_latin)
    genome = Genome(Species=species_latin, Release=db_release, account=account)
    return genome
from __future__ import absolute_import, print_function
import numpy as np
import pandas as pd
from . import base

# Optional pycogent setup: any failure in this section (for example cogent not
# being importable) is printed and otherwise ignored, so importing this module
# still succeeds.
try:
    import cogent
    from cogent.db.ensembl import HostAccount, Genome, Compara, Species
    # Module-level settings; none of them is read by the code visible in this
    # chunk. NOTE(review): presumably consumed further down the file - confirm.
    account = None
    release = '87'
    # add recent ensembl species
    species = [ 'cow', 'human', 'mouse', 'rat', 'chimp', 'gorilla', 'orangutan', 'macaque', 'dog', 'pig', 'cat', 'olive baboon', 'sheep' ]
    # Register latin name -> common name pairs with pycogent. 'hedgehog' and
    # 'ferret' are registered here but do not appear in the species list above.
    Species.amendSpecies('Papio anubis', 'olive baboon')
    Species.amendSpecies('Ovis aries', 'sheep')
    Species.amendSpecies('Erinaceus europaeus', 'hedgehog')
    Species.amendSpecies('Mustela putorius furo', 'ferret')
except Exception as e:
    print(e)

def get_orthologs(refgenome, ensid=None, symbol=None):
    '''Query `comp.getRelatedGenes` for 'ortholog_one2one' relations of `mygene`.

    NOTE(review): in the code visible here neither `comp` nor `mygene` is
    defined, the parameters `refgenome`, `ensid` and `symbol` are not used,
    and nothing is returned. The gene lookup that would set `mygene` is
    disabled (it was kept as a string and is reproduced below):

        if ensid!=None:
            mygene = cow.getGeneByStableId(StableId=ensid)
        else:
            mygene = cow.getGenesMatching(symbol=symbol)
    '''
    #print mygene
    orthologs = comp.getRelatedGenes(gene_region=mygene, Relationship='ortholog_one2one')
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """
    Download the protein-coding genes of a species through pycogent, grouped
    into families of within-species paralogs.

    species       -- species name handed to pycogent and to get_missing_info
    release       -- requested release; replaced by the value that
                     get_missing_info returns
    database_name -- database name handed to get_missing_info
    nucleotide    -- if true collect CDS sequences, otherwise protein sequences

    Returns a list of families; each family is a list of
    (stable id, sequence string) tuples. A gene for which compara reports no
    paralogs forms a family of its own.

    Progress is written to stderr. If a paralog has no usable canonical
    transcript (AttributeError) a fatal message is logged and exit(1) is called.
    """
    log = get_log()

    # pycogent is imported here; an ImportError is not caught and propagates
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(db_details['hostname'],
                          db_details['username'],
                          db_details['password'],
                          port=db_details['port'])

    # getSpeciesName returns the *string* "None" for unknown species,
    # so comparing against 'None' is intentional
    if Species.getSpeciesName(species) == 'None' :
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # HACK: pycogent finds several compara databases and simply connects to the
    # first one, which is always compara_bacteria. For any other division the
    # Compara object's internals are re-pointed at the matching database.
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    sequence_kind = "CDS" if nucleotide else "protein"
    seen_ids = set()
    families = []

    def show_progress(template="\r[downloading %s] got %d sequences ") :
        # rewrite the single progress line on stderr
        stderr.write(template % (sequence_kind, len(seen_ids)))

    def sequence_of(member) :
        # CDS or protein sequence of the member's canonical transcript, as text
        transcript = member.CanonicalTranscript
        return str(transcript.Cds) if nucleotide else str(transcript.ProteinSeq)

    show_progress()

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        gene_id = gene.StableId

        # ignore genes that have already been seen as members of other gene families
        if gene_id in seen_ids :
            continue
        seen_ids.add(gene_id)

        paralogs = compara.getRelatedGenes(StableId=gene_id, Relationship='within_species_paralog')

        family = []
        if paralogs is not None :
            for member in paralogs.Members :
                member_id = member.StableId
                seen_ids.add(member_id)
                show_progress()
                try :
                    family.append((member_id, sequence_of(member)))
                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % member_id)
                    exit(1)
        else :
            # no paralogs reported: the gene is a family of one
            show_progress()
            family.append((gene_id, sequence_of(gene)))

        families.append(family)

    show_progress("\r[downloading %s] got %d sequences\n")

    return families