Esempio n. 1
0
def create_SQ_header(database_v,species):
    '''
    Build SAM-style "@SQ" header lines from the ENSEMBL core database.

    :param database_v: ENSEMBL release to query; not used for
                       "arabidopsis_thaliana", which is read from release 30
                       on the mysql-eg-publicsql.ebi.ac.uk host
    :param species: species name with underscores (e.g. "homo_sapiens");
                    also written into the SP field of every line
    :return: list of "@SQ" strings (SN, LN, AS, SP fields, tab separated),
             one per sequence region whose coordinate system has rank 1 and
             whose name contains no underscore
    '''
    # the common name is looked up the same way whichever server is used
    common_name=Species.getCommonName(species.replace('_',' '))

    # create connection to ensembl database
    if species=="arabidopsis_thaliana":
        eg_account=host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",user="******",passwd="",port=4157)
        genome=Genome(Species=common_name,account=eg_account,Release=30)
    else:
        genome=Genome(Species=common_name,Release=database_v,account=None)

    # sequence regions joined to the coordinate system they belong to
    coord_systems=genome.CoreDb.getTable('coord_system')
    seq_regions=genome.CoreDb.getTable('seq_region')
    columns=[seq_regions.c.name,seq_regions.c.length,coord_systems.c.version]
    joined=seq_regions.join(coord_systems,
                            coord_systems.c.coord_system_id==seq_regions.c.coord_system_id)
    statement=sql.select(columns,from_obj=[joined],
                         whereclause=coord_systems.c.rank==1)

    # names containing '_' are skipped (presumably patches/unplaced
    # scaffolds -- TODO confirm)
    return ["\t".join(("@SQ",
                       "SN:chr"+str(name),
                       "LN:"+str(length),
                       "AS:"+str(version),
                       "SP:"+str(species)))
            for name,length,version in statement.execute()
            if '_' not in name]
Esempio n. 2
0
def prepareAnnotationENSEMBL(psm_protein_id,mode,database_v,species,three_frame_translation):
    '''
    Look up the given identifiers in the ENSEMBL core database and hand the
    resulting records to ensembl_construct_sequences.

    :param psm_protein_id: list of protein IDs (untagged); matched against
                           translation stable IDs in 'protein' mode and
                           against transcript stable IDs in 'transcript' mode
    :param mode: 'protein' or 'transcript'
    :param database_v: database version; not used for the connection when
                       species is "arabidopsis_thaliana" (release 30 on the
                       mysql-eg-publicsql.ebi.ac.uk host is used instead)
    :param species: species name with underscores (e.g. "homo_sapiens")
    :param three_frame_translation: passed unchanged to
                                    ensembl_construct_sequences
    :return: whatever ensembl_construct_sequences returns (per the original
             documentation, a dictionary mapping proteins into ENSEMBL)
    :raises ValueError: if mode is neither 'protein' nor 'transcript'
    '''
    # Fail fast: an unknown mode used to surface only after the database
    # connection had been opened, as an unbound local variable error.
    if mode not in ('protein','transcript'):
        raise ValueError("mode must be 'protein' or 'transcript', got %r" % (mode,))

    print('Commencing ENSEMBL data retrieval')
    # create connection to ensembl database
    Genome_species=Species.getCommonName(species.replace('_',' '))
    if species=="arabidopsis_thaliana":
        account=host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",user="******",passwd="",port=4157)
        ensembl=Genome(Species=Genome_species,account=account,Release=30)
    else:
        ensembl=Genome(Species=Genome_species,Release=database_v,account=None)

    # convert IDs
    translation_table=ensembl.CoreDb.getTable('translation')
    transcript_table=ensembl.CoreDb.getTable('transcript')
    select_obj=[transcript_table.c.stable_id,
                translation_table.c.stable_id,
                transcript_table.c.transcript_id,
                translation_table.c.seq_start,
                translation_table.c.start_exon_id,
                ]
    from_obj=translation_table.join(transcript_table,transcript_table.c.transcript_id==translation_table.c.transcript_id)

    # key_index: position in each result row of the stable ID used as the
    # dictionary key; id_column: the column the input IDs are matched against
    if mode=='protein':
        key_index=1
        id_column=translation_table.c.stable_id
    else:
        key_index=0
        id_column=transcript_table.c.stable_id
    query = sql.select(select_obj,from_obj=[from_obj],
                       whereclause = id_column.in_(psm_protein_id))

    annotation={}
    transcript_ids=[]
    for row in query.execute():
        transcript_ids.append(row[2])
        # sequence, chromosome and strand fields start empty here
        # NOTE(review): 'start_exon_rank' is filled with start_exon_id;
        # presumably converted to a rank downstream -- confirm
        annotation[row[key_index]]={'transcript_id':row[0],'translation_id':row[1],
                                    'transcript_seq':'','protein_seq':'',
                                    'chr':'','strand':'','5UTR_offset':row[3],'start_exon_rank':row[4]}
    return ensembl_construct_sequences(annotation,ensembl,transcript_ids,database_v,species,
                                       three_frame_translation,mode)