Exemple #1
0
def compute_frequency(seq_id, codon_table, showtable):
    """Print codon-usage statistics for one Ensembl sequence.

    The sequence is fetched through ``ensembl.EnsemblRestClient``, every
    ``T`` is replaced by ``U`` and the result is read as consecutive,
    non-overlapping triplets starting at offset 0.  For each codon that
    belongs to a group seen at least once, one row is printed with the
    share of the group among all counted codons, the share of the codon
    inside its group, and the three raw counts.

    Args:
        seq_id: Identifier to look up.  Trailing newlines are stripped; the
            remainder must start with ``"ENS"`` and be exactly 15
            characters long, otherwise a message is written to stderr and
            nothing else happens.
        codon_table: Re-iterable collection of ``(codon_group, amino_acid)``
            pairs (it is walked twice).  ``codon_group`` is a hashable
            sequence of codon strings (it is used as a dictionary key) and
            ``amino_acid`` is the string label printed for that group.
        showtable: When equal to the string ``'true'`` a header is printed
            above the rows.

    Returns:
        None in every case; the report is written to stdout.
    """
    seq_id = seq_id.rstrip('\n')
    if not (seq_id.startswith("ENS") and len(seq_id) == 15):
        sys.stderr.write(seq_id + ' is not an Ensembl id' + '\n')
        return None
    client = ensembl.EnsemblRestClient()
    ref = client.get_sequence(seq_id)
    sequence = ref['seq'].replace('T', 'U')

    # for each codon, siblings points to the group of codons for the same aa
    siblings = dict(
        (cod, codgroup) for codgroup, aa in codon_table for cod in codgroup)

    cod_count, grp_count = defaultdict(int), defaultdict(int)
    unknown = 0

    # counting each codon and each aa; stopping two characters early means
    # an incomplete trailing triplet is never produced
    for start in range(0, len(sequence) - 2, 3):
        cod = sequence[start:start + 3]
        if cod not in siblings:
            # a triplet missing from the table would raise KeyError below
            unknown += 1
            continue
        cod_count[cod] += 1
        grp_count[siblings[cod]] += 1
    if unknown:
        sys.stderr.write(
            '%d codon(s) of %s are not in the codon table and were skipped\n'
            % (unknown, seq_id))

    # number of counted codons; constant from here on, so computed once
    total = sum(grp_count.values())

    freq = {}
    for cod in siblings:  # the keys of siblings are the codons of the table
        group = siblings[cod]
        if group in grp_count:  # grp_count has a value only if aa occurred
            freq[cod] = float(cod_count[cod]) / grp_count[group]
            freq[group] = float(grp_count[group]) / total
        else:
            freq[cod] = '-* Missing *-'
            freq[group] = '-* Missing *-'

    if showtable == 'true':
        # single parenthesised argument: same output as the print statement
        print(
            "|-------------|-------------------|-----|------------------|-----------|-----------|---------|\n"
            "|  Amino-acid |   AA prob         |Codon|   Codon prob     | All AAs # | This AA # | Codon # |\n"
            "|-------------|-------------------|-----|------------------|-----------|-----------|---------|\n")

    # one chunk of rows per table entry; a group that never occurred
    # contributes an empty chunk, which shows up as an empty line
    chunks = []
    for codgrp, aa in codon_table:
        chunks.append('\n'.join(
            '%s   %-17s  %s   %-17s      %-9s   %-9s  %s' %
            (aa.rjust(13), freq[codgrp], cod.rjust(4), freq[cod],
             total, grp_count[siblings[cod]], cod_count[cod])
            for cod in codgrp if grp_count.get(siblings[cod], 0) > 0))
    print('\n'.join(chunks))
def cod_vr_mapping(pdb_chain_id, vr_filename, transcriptId, alignment):
    """Print, for each aligned residue, its codon and the matching VR rows.

    The protein and CDS sequences of ``transcriptId`` are fetched through
    ``ensembl.EnsemblRestClient``; the CDS has ``T`` replaced by ``U`` and
    is cut into triplets.  ``alignment`` is then walked in parallel with
    the protein sequence: every position where both characters are equal
    increments a running residue number, and every row of ``vr_filename``
    whose first column equals that number is printed as

        <pdb_chain_id> <3-letter aa> <codon> <col0> <col1> <col2> <col3>

    Args:
        pdb_chain_id: String printed at the start of every output line.
        vr_filename: Path of a whitespace-separated text file.  Rows need at
            least four columns and an integer first column; any other line
            (blank, short, header) is ignored.  If the file does not exist
            nothing is printed.
        transcriptId: Identifier passed to ``client.get_sequence``.
        alignment: Iterable of one-letter residue characters, compared
            position by position with the Ensembl protein sequence.

    Returns:
        An empty list, always; the output of this function is what it
        prints.
    """
    # one-letter -> three-letter amino acid codes
    aaCodes = {
        'G': 'Gly',  # glycine
        'P': 'Pro',  # proline
        'A': 'Ala',  # alanine
        'V': 'Val',  # valine
        'L': 'Leu',  # leucine
        'I': 'Ile',  # isoleucine
        'M': 'Met',  # methionine
        'C': 'Cys',  # cysteine
        'F': 'Phe',  # phenylalanine
        'Y': 'Tyr',  # tyrosine
        'W': 'Trp',  # tryptophan
        'H': 'His',  # histidine
        'K': 'Lys',  # lysine
        'R': 'Arg',  # arginine
        'Q': 'Gln',  # glutamine
        'N': 'Asn',  # asparagine
        'E': 'Glu',  # glutamic acid
        'D': 'Asp',  # aspartic acid
        'S': 'Ser',  # serine
        'T': 'Thr',  # threonine
    }

    client = ensembl.EnsemblRestClient()

    ref = client.get_sequence(transcriptId, type="protein")
    ensembl_protein_seq = ref['seq']
    ref = client.get_sequence(transcriptId, type="cds")
    cds = ref['seq'].replace('T', 'U')
    cod_seq = [cds[i:i + 3] for i in range(0, len(cds), 3)]

    if os.path.exists(vr_filename):
        # Index the rows by their residue number once, keeping file order
        # inside each bucket, instead of rescanning every row per residue.
        rows_by_residue = {}
        with open(vr_filename) as vr_file:
            for line in vr_file:
                fields = line.split()
                if len(fields) < 4:
                    continue  # blank or short line: fields[0..3] not usable
                try:
                    residue_no = int(fields[0])
                except ValueError:
                    continue  # non-numeric first column, e.g. a header
                rows_by_residue.setdefault(residue_no, []).append(fields)

        matched = 0  # 1-based count of positions where both sequences agree
        # zip stops at the shorter input, so positions past the end of the
        # protein sequence are never looked at
        pairs = zip(alignment, ensembl_protein_seq)
        for pos, (residue, expected) in enumerate(pairs):
            if residue != expected:
                continue
            matched += 1
            for row in rows_by_residue.get(matched, ()):
                # 'Xaa' (unspecified amino acid) stands in for any letter
                # outside the 20 standard ones instead of raising KeyError
                print(' '.join([pdb_chain_id, aaCodes.get(residue, 'Xaa'),
                                cod_seq[pos], row[0], row[1], row[2],
                                row[3]]))

    return []
Exemple #3
0
import ensembl
import sys

if __name__ == '__main__':
    # species, region and at least one feature type must be given
    if len(sys.argv) <= 3:
        print("""This is a client for Ensembl REST API resource 'GET overlap/region/:species/:region'

                    Usage: python get_region_features.py [species] [region] [features]
                    For example:
                    python get_region_features.py human 7:140424943-140624564 gene exon

                    Available features : gene, transcript, cds, exon, repeat, simple, misc,
                                         variation, somatic_variation, structural_variation,
                                         somatic_structural_variation, constrained, regulatory,
                                         segmentation, motif, chipseq, array_probe
              """)
    else:
        species, region = sys.argv[1:3]
        features = sys.argv[3:]  # every remaining argument is a feature name

        client = ensembl.EnsemblRestClient()
        print(client.get_region_feature(species, region, features))
Exemple #4
0
def pdb_seq_mapping( pdb_id ):
        # amino acids codes dictionary definition
        aaCodes = dict()
        aaCodes['GLY'] = 'G' #GLYCINE
        aaCodes['PRO'] = 'P' #PROLINE
        aaCodes['ALA'] = 'A' #ALANINE
        aaCodes['VAL'] = 'V' #VALINE
        aaCodes['LEU'] = 'L' #LEUCINE
        aaCodes['ILE'] = 'I' #ISOLEUCINE
        aaCodes['MET'] = 'M' #METHIONINE
        aaCodes['CYS'] = 'C' #CYSTEINE
        aaCodes['PHE'] = 'F' #PHENYLALANINE
        aaCodes['TYR'] = 'Y' #TYROSINE
        aaCodes['TRP'] = 'W' #TRYPTOPHAN
        aaCodes['HIS'] = 'H' #HISTIDINE
        aaCodes['LYS'] = 'K' #LYSINE
        aaCodes['ARG'] = 'R' #ARGININE
        aaCodes['GLN'] = 'Q' #GLUTAMINE
        aaCodes['ASN'] = 'N' #ASPARAGINE
        aaCodes['GLU'] = 'E' #GLUTAMIC ACID
        aaCodes['ASP'] = 'D' #ASPARTIC ACID
        aaCodes['SER'] = 'S' #SERINE
        aaCodes['THR'] = 'T' #THREONINE

        pdbId = pdb_id

        parser = PDBParser()  #new parser
        pdb = parser.get_structure(pdbId, pdbId + '.pdb')

        result = [] #empty list for storing results
        processed_chains = []

        for chain in pdb.get_chains():                 #iterating per each chain
            chainId = chain.get_id()                   #id
            if chainId not in processed_chains:
                processed_chains.append(chainId)
                residuesObj =  chain.get_unpacked_list()   #residues

                chainSeq =''                               #getting AA sequence for this chain
                                  #getting AA sequence for this chain
                for res in residuesObj:
                    if res.get_resname() in aaCodes.keys():
                        chainSeq = chainSeq + aaCodes[res.get_resname()]

                #getting uniprotid for this chain
                rcsb_url = 'http://www.rcsb.org/pdb/rest/customReport?pdbids=' + pdbId + '.' + chainId + '&customReportColumns=chainId,db_id,db_name&service=wsdisplay&format=text'
                headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
                r = requests.get(rcsb_url, headers=headers)
                e = ET.ElementTree(ET.fromstring(r.content))
                root = e.getroot()

                uniprotId = ''
                for elem in root.getiterator('dimEntity.db_id'):
                    uniprotId = elem.text   #UniprotId found
                if uniprotId == '':         #this should never happen
                    print 'UniprotId not found !'
                    sys.exit()

                #translating uniprotId to Ensembl transcripts
                url = 'http://www.uniprot.org/mapping/'
                paramsStep2 = {
                'from':'ACC',
                'to':'ENSEMBL_TRS_ID',
                'format':'tab',
                'query':uniprotId
                }

                data = urllib.urlencode(paramsStep2)
                request = urllib2.Request(url, data)
                user = '******'
                request.add_header('User-Agent', user )
                try:
                    response = urllib2.urlopen(request)
                except urllib2.HTTPError, err:
                    if err.code == 503:
                        response = urllib2.urlopen(request)
                page = response.read()
                lines = page.split('\n')

                transcriptIds = []
                for line in lines:          #building list of transcripts for uniprotId
                    if len(line.split('\t')[0]) > 0 and (line.split('\t')[0]) <> 'From':
                        transcriptIds.append(line.split('\t')[1])

                #checking Ensembl reanscriptIds to find one with sequence that is the best match for processed chain
                client = ensembl.EnsemblRestClient()
                best_score = 0
                best_match_tid = ''
                best_match_seq = ''
                best_alignment = ''
                for transcriptId in transcriptIds:
                    ref = client.get_sequence( transcriptId , type="protein")
                    sequence = ref['seq']
                    alns = pairwise2.align.globalxs(chainSeq, sequence, -5, 0)
                    if alns[0][2] > best_score:
                        best_score = alns[0][2]
                        best_match_tid = transcriptId
                        best_match_seq = sequence
                        best_alignment = alns[0][0]
                result.append([pdbId + '.' + chainId, best_match_tid, best_alignment])