def create_SQ_header(database_v,species):
    '''
    Build one "@SQ" header line per sequence region of the species' core database.

    :param database_v: database version, used as ENSEMBL release for every
                       species except arabidopsis_thaliana (pinned to release 30)
    :param species: species name with underscores instead of spaces
    :return: list of "@SQ" strings carrying name (prefixed with "chr"), length,
             coord system version and species; regions whose name contains
             an underscore are left out
    '''
    # the common name is needed whichever server is used
    common_name=Species.getCommonName(species.replace('_',' '))
    if species!="arabidopsis_thaliana":
        genome=Genome(Species=common_name,Release=database_v,account=None)
    else:
        # arabidopsis is fetched through an explicit host account
        plant_account=host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",user="******",passwd="",port=4157)
        genome=Genome(Species=common_name,account=plant_account,Release=30)

    # seq_region joined to its coord_system, restricted to rank 1
    coord_system=genome.CoreDb.getTable('coord_system')
    seq_region=genome.CoreDb.getTable('seq_region')
    columns=[seq_region.c.name,
             seq_region.c.length,
             coord_system.c.version,
             ]
    joined=seq_region.join(coord_system,coord_system.c.coord_system_id==seq_region.c.coord_system_id)
    rank_one_regions=sql.select(columns,from_obj=[joined],
                                whereclause = coord_system.c.rank==1)

    return ["@SQ\tSN:chr"+str(name)+"\tLN:"+str(length)+"\tAS:"+str(version)+"\tSP:"+str(species)
            for name,length,version in rank_one_regions.execute()
            if '_' not in name]
def prepareAnnotationENSEMBL(psm_protein_id,mode,database_v,species,three_frame_translation):
    '''
    Look up the given identifiers in the ENSEMBL core database and pass the
    resulting per-identifier records on to ensembl_construct_sequences.

    :param psm_protein_id: list of protein IDs (untagged); matched against
                           translation stable IDs in 'protein' mode and against
                           transcript stable IDs in 'transcript' mode
    :param mode: 'protein' or 'transcript'
    :param database_v: database version, used as ENSEMBL release for every
                       species except arabidopsis_thaliana (pinned to release 30)
    :param species: species name with underscores instead of spaces
    :param three_frame_translation: handed unchanged to ensembl_construct_sequences
    :return: the result of ensembl_construct_sequences (described by the original
             author as a dictionary mapping proteins into ENSEMBL)
    :raises ValueError: if mode is neither 'protein' nor 'transcript'
    '''
    # Validate the mode before touching the database: an unknown mode used to
    # open the connection and then fail on an unbound `query` variable.
    if mode=='protein':
        key_column=1    # records are keyed by translation stable ID
    elif mode=='transcript':
        key_column=0    # records are keyed by transcript stable ID
    else:
        raise ValueError("mode must be 'protein' or 'transcript', got %r" % (mode,))

    print('Commencing ENSEMBL data retrieval')
    # create connection to ensembl database
    Genome_species=Species.getCommonName(species.replace('_',' '))
    if species=="arabidopsis_thaliana":
        # arabidopsis is fetched through an explicit host account
        account=host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",user="******",passwd="",port=4157)
        ensembl=Genome(Species=Genome_species,account=account,Release=30)
    else:
        ensembl=Genome(Species=Genome_species,Release=database_v,account=None)

    # convert IDs
    translation_table=ensembl.CoreDb.getTable('translation')
    transcript_table=ensembl.CoreDb.getTable('transcript')
    select_obj=[transcript_table.c.stable_id,
                translation_table.c.stable_id,
                transcript_table.c.transcript_id,
                translation_table.c.seq_start,
                translation_table.c.start_exon_id,
                ]
    from_obj=translation_table.join(transcript_table,transcript_table.c.transcript_id==translation_table.c.transcript_id)

    # filter on the stable ID column that corresponds to the requested mode
    if mode=='protein':
        id_column=translation_table.c.stable_id
    else:
        id_column=transcript_table.c.stable_id
    query = sql.select(select_obj,from_obj=[from_obj],
                       whereclause = id_column.in_(psm_protein_id))

    annotation={}
    transcript_ids=[]
    for row in query.execute():
        transcript_ids.append(row[2])
        # NOTE(review): the value stored under 'start_exon_rank' is the selected
        # start_exon_id column; presumably it is resolved downstream - confirm.
        annotation[row[key_column]]={'transcript_id':row[0],'translation_id':row[1],
                                     'transcript_seq':'','protein_seq':'',
                                     'chr':'','strand':'','5UTR_offset':row[3],'start_exon_rank':row[4]}
    return ensembl_construct_sequences(annotation,ensembl,transcript_ids,database_v,species,
                                       three_frame_translation,mode)
Beispiel #3
0
def setupGenome( species, db_host=None, db_user=None, db_pass=None, db_release=None ):
    """ 
    Notes:
        Set up the ensembl genome object using pycogent; this object has the
        methods described here: http://pycogent.org/examples/query_ensembl.html

        When db_host is not given, no HostAccount is built and account=None is
        passed to Genome. (Previously the None was overwritten by a HostAccount
        built from the empty host.)
    
    Args:
        species     = 'Anopheles gambiae'   #string: underscores are replaced by spaces; this needs to match the mysql databases on db_host
        db_host     = 'localhost'           #string: mysql host; falsy means "no explicit account"
        db_user     = '******'              #string: mysql user, only used together with db_host
        db_pass     = '******'              #string: mysql password, only used together with db_host
        db_release  = 73                    #integer: ensembl release; falsy values fall back to 73

    Returns:
        the Genome object created for the species

    """
    if not db_release:
        db_release=73
    from cogent.db.ensembl import HostAccount, Genome, Species
    # only build an explicit account when a host was actually supplied
    if db_host:
        account = HostAccount(db_host,db_user,db_pass)
    else:
        account = None
    species_latin = species.replace('_',' ')
    # register the name under itself so Genome accepts it as a species
    Species.amendSpecies(species_latin,species_latin)
    genome = Genome(Species=species_latin,Release=db_release,account=account)
    return genome
Beispiel #4
0
from __future__ import absolute_import, print_function
import numpy as np
import pandas as pd
from . import base

# Optional PyCogent setup: import the Ensembl bindings and bind the
# module-level defaults used with them. Any failure (including a missing
# cogent package) is printed instead of raised, so the module still imports;
# in that case the names bound inside this block may be missing.
try:
    import cogent
    from cogent.db.ensembl import HostAccount, Genome, Compara, Species
    # no explicit HostAccount; presumably lets PyCogent pick its default
    # server - confirm
    account = None
    # Ensembl release, kept as a string
    release = '87'
    # common names of the species of interest; 'olive baboon' and 'sheep' are
    # registered below, as are hedgehog and ferret which are not listed here
    species = [
        'cow', 'human', 'mouse', 'rat', 'chimp', 'gorilla', 'orangutan',
        'macaque', 'dog', 'pig', 'cat', 'olive baboon', 'sheep'
    ]
    # register (latin name, common name) pairs with PyCogent's Species object
    Species.amendSpecies('Papio anubis', 'olive baboon')
    Species.amendSpecies('Ovis aries', 'sheep')
    Species.amendSpecies('Erinaceus europaeus', 'hedgehog')
    Species.amendSpecies('Mustela putorius furo', 'ferret')
except Exception as e:
    # best effort: report the problem and carry on without PyCogent
    print(e)


def get_orthologs(refgenome, ensid=None, symbol=None):
    '''
    Request the 'ortholog_one2one' related genes of a gene through
    comp.getRelatedGenes.

    NOTE(review): none of the parameters is used by the visible body, and
    `mygene` and `comp` are not bound inside this function - unless both
    exist at module level the call raises NameError. The lookup that would
    bind `mygene` is disabled (kept below as a comment). The visible body
    ends right after the assignment, so nothing is returned here; the
    function looks truncated - confirm against the full source.

    :param refgenome: not used in the visible body
    :param ensid: gene stable ID; only referenced by the disabled lookup
    :param symbol: gene symbol; only referenced by the disabled lookup
    '''
    # disabled gene lookup (previously held in the docstring position):
    # if ensid!=None:
    #     mygene = cow.getGeneByStableId(StableId=ensid)
    # else:
    #     mygene = cow.getGenesMatching(symbol=symbol)
    #print mygene
    orthologs = comp.getRelatedGenes(gene_region=mygene,
                                     Relationship='ortholog_one2one')
Beispiel #5
0
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """Download protein-coding gene sequences grouped into paralog families.

    Walks every protein-coding gene of the genome, asks compara for its
    within-species paralogs and collects the canonical transcript sequence
    of each family member. Genes already collected as a member of an earlier
    family are skipped. Progress is written to stderr.

    species       -- species name; added to pycogent's species list when unknown
    release       -- release, completed together with the connection details
                     by get_missing_info
    database_name -- name of the database family (default 'ensembl')
    nucleotide    -- True collects the CDS, False the protein sequence

    Returns a list of families, each a list of (stable id, sequence) tuples.
    The process exits with status 1 when a paralog lacks a canonical transcript.
    """
    log = get_log()

    # the pycogent imports are deliberately unguarded: an ImportError propagates
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    tested_version = (1,5,3)
    if cogent.version_info != tested_version :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    hostname = db_details['hostname']
    username = db_details['username']
    password = db_details['password']
    account = HostAccount(hostname, username, password, port=db_details['port'])

    # getSpeciesName answers with the *string* "None" for unknown species
    species_known = Species.getSpeciesName(species) != 'None'
    if not species_known :
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # HACK: pycogent finds several compara databases and simply connects to
    # the first one, which is always compara_bacteria. For any other database
    # family the compara object's internals are re-pointed at the database
    # whose name has 'bacteria' swapped for db_name.
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = EnsemblDbName(compara.ComparaDb.db_name.Name.replace('bacteria', db_name))
        compara.ComparaDb._db = DbConnection(account=account, db_name=patched_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    genes = set()
    families = []

    seq_kind = "CDS" if nucleotide else "protein"
    progress_line = "\r[downloading %s] got %d sequences "
    final_line = "\r[downloading %s] got %d sequences\n"

    def report(template) :
        # rewrite the status line with the current number of collected genes
        stderr.write(template % (seq_kind, len(genes)))

    def sequence_of(member) :
        # canonical transcript sequence as a string, CDS or protein as requested
        transcript = member.CanonicalTranscript
        if nucleotide :
            return str(transcript.Cds)
        return str(transcript.ProteinSeq)

    report(progress_line)

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        gene_id = gene.StableId

        # already collected as a member of an earlier family
        if gene_id in genes :
            continue

        genes.add(gene_id)

        paralogs = compara.getRelatedGenes(StableId=gene_id, Relationship='within_species_paralog')

        family = []

        if paralogs is not None :
            for paralog in paralogs.Members :
                paralog_id = paralog.StableId
                genes.add(paralog_id)

                report(progress_line)

                try :
                    family.append((paralog_id, sequence_of(paralog)))

                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralog_id)
                    exit(1)

        else :
            # no paralogs: the gene forms a family of its own
            report(progress_line)
            family.append((gene_id, sequence_of(gene)))

        families.append(family)

    report(final_line)

    return families