def compute_frequency(seq_id, codon_table, showtable):
    """Compute codon and amino-acid usage for one Ensembl sequence.

    The sequence is fetched through ``ensembl.EnsemblRestClient().get_sequence``,
    every ``T`` is replaced by ``U``, and the result is read in
    non-overlapping triplets starting at offset 0.  A trailing fragment
    shorter than three characters is ignored.

    Args:
        seq_id: Identifier to look up.  A trailing newline is stripped.  It
            must start with ``"ENS"`` and be exactly 15 characters long.
        codon_table: Re-iterable collection of ``(codon_group, aa_name)``
            pairs.  ``codon_group`` is a hashable sequence of codon strings
            (it is used as a dictionary key) and ``aa_name`` is a string.
            Codons must be spelled with ``U`` because the fetched sequence
            is converted before counting.
        showtable: The usage table is printed to stdout only when this is
            the string ``'true'``.

    Returns:
        None in every case.  When ``seq_id`` fails the identifier check a
        message is written to stderr and nothing is fetched.

    Triplets that are not present in ``codon_table`` are left out of all
    counts and a single summary line about them is written to stderr.
    """
    seq_id = seq_id.rstrip('\n')
    if not (seq_id.startswith("ENS") and len(seq_id) == 15):
        sys.stderr.write(seq_id + ' is not an Ensembl id' + '\n')
        return None

    client = ensembl.EnsemblRestClient()
    sequence = client.get_sequence(seq_id)['seq'].replace('T', 'U')

    # For each codon, siblings points at the group of codons coding for the
    # same amino acid.
    siblings = dict(
        (cod, codgroup) for codgroup, aa in codon_table for cod in codgroup)
    cod_count, grp_count, freq = defaultdict(int), defaultdict(int), {}

    # Count each codon and each amino acid (codon group).
    skipped = 0
    for start in range(0, len(sequence) - 2, 3):
        cod = sequence[start:start + 3]
        codgroup = siblings.get(cod)
        if codgroup is None:
            # Triplet unknown to the table: skip it instead of failing with
            # a KeyError half-way through the sequence.
            skipped += 1
            continue
        cod_count[cod] += 1
        grp_count[codgroup] += 1
    if skipped:
        sys.stderr.write(
            '%d codon(s) of %s are not in the codon table and were skipped\n'
            % (skipped, seq_id))

    # Total number of counted codons; invariant from here on, so compute once.
    total = sum(grp_count.values())

    for cod, codgroup in siblings.items():
        if codgroup in grp_count:  # only amino acids that actually occurred
            freq[cod] = float(cod_count[cod]) / grp_count[codgroup]
            freq[codgroup] = float(grp_count[codgroup]) / total
        else:
            freq[cod] = '-* Missing *-'
            freq[codgroup] = '-* Missing *-'

    if showtable == 'true':
        # Header cells are padded to the column widths of the rule lines.
        print(
            "|-------------|-------------------|-----|------------------|-----------|-----------|---------|\n"
            "| Amino-acid  |      AA prob      |Codon|    Codon prob    | All AAs # | This AA # | Codon # |\n"
            "|-------------|-------------------|-----|------------------|-----------|-----------|---------|\n")
        # One text row per codon whose amino acid occurred; groups that never
        # occurred contribute an empty string (i.e. a blank line).
        display = '\n'.join(
            '\n'.join(
                '%s %-17s %s %-17s %-9s %-9s %s' % (
                    aa.rjust(13), freq[codgrp], cod.rjust(4), freq[cod],
                    total, grp_count[siblings[cod]], cod_count[cod])
                for cod in codgrp
                if grp_count[siblings[cod]] > 0)
            for codgrp, aa in codon_table)
        print(display)
def cod_vr_mapping(pdb_chain_id, vr_filename, transcriptId, alignment):
    """Print codon information for aligned residues listed in a VR file.

    The VR file is read as whitespace-separated rows.  The first column must
    parse as an integer and the first four columns are printed.  Rows are
    matched against a running counter that increases by one each time a
    character of ``alignment`` equals the character at the same position of
    the Ensembl protein sequence of ``transcriptId``.  For every matching
    row one line is printed::

        <pdb_chain_id> <three-letter aa> <codon> <col0> <col1> <col2> <col3>

    where the codon is taken from the transcript's CDS (``T`` replaced by
    ``U``, split in triplets) at the same position.

    Args:
        pdb_chain_id: String printed at the start of every output line.
        vr_filename: Path of the VR file.  A missing file is treated like an
            empty one.
        transcriptId: Identifier passed to
            ``ensembl.EnsemblRestClient().get_sequence``.
        alignment: Iterable of single characters.
            NOTE(review): presumably the gapped chain sequence produced by
            the pairwise alignment step - confirm against the caller.

    Returns:
        An empty list (nothing is ever added to it).

    Raises:
        ValueError: a non-blank row whose first column is not an integer
            (raised while loading the file).
        KeyError: a matching residue letter that is not one of the twenty
            standard one-letter codes.
    """
    # One-letter -> three-letter amino-acid codes.
    aaCodes = {
        'G': 'Gly',  # GLYCINE
        'P': 'Pro',  # PROLINE
        'A': 'Ala',  # ALANINE
        'V': 'Val',  # VALINE
        'L': 'Leu',  # LEUCINE
        'I': 'Ile',  # ISOLEUCINE
        'M': 'Met',  # METHIONINE
        'C': 'Cys',  # CYSTEINE
        'F': 'Phe',  # PHENYLALANINE
        'Y': 'Tyr',  # TYROSINE
        'W': 'Trp',  # TRYPTOPHAN
        'H': 'His',  # HISTIDINE
        'K': 'Lys',  # LYSINE
        'R': 'Arg',  # ARGININE
        'Q': 'Gln',  # GLUTAMINE
        'N': 'Asn',  # ASPARAGINE
        'E': 'Glu',  # GLUTAMIC ACID
        'D': 'Asp',  # ASPARTIC ACID
        'S': 'Ser',  # SERINE
        'T': 'Thr',  # THREONINE
    }

    result = []

    # Load the VR rows first, grouped by their integer first column so each
    # alignment match is a dictionary lookup instead of a scan of all rows.
    # Rows sharing an index keep their file order.
    vr_by_index = {}
    if os.path.exists(vr_filename):
        with open(vr_filename) as vr_file:
            for line in vr_file:
                row = line.split()
                if row:  # ignore blank lines
                    vr_by_index.setdefault(int(row[0]), []).append(row)

    if not vr_by_index:
        # Nothing can be printed, so do not query Ensembl at all.
        return result

    client = ensembl.EnsemblRestClient()
    ensembl_protein_seq = client.get_sequence(transcriptId, type="protein")['seq']
    cds = client.get_sequence(transcriptId, type="cds")['seq'].replace('T', 'U')
    cod_seq = [cds[pos:pos + 3] for pos in range(0, len(cds), 3)]

    # NOTE(review): the position in the protein sequence is taken to advance
    # once per alignment column (matching or not); zip() also stops at the
    # end of the shorter of the two sequences.
    matched = 0
    for j, (k, ref_aa) in enumerate(zip(alignment, ensembl_protein_seq)):
        if k != ref_aa:
            continue
        matched += 1
        for row in vr_by_index.get(matched, ()):
            print(' '.join([pdb_chain_id, aaCodes[k], cod_seq[j],
                            row[0], row[1], row[2], row[3]]))

    return result
import sys

import ensembl

if __name__ == '__main__':
    # Command-line client for the Ensembl REST region-overlap resource.
    # Expected arguments: species, region, then one or more feature types.
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        # Not enough arguments: show the help text instead of querying.
        print("""This is a client for Ensembl REST API resource 'GET overlap/region/:species/:region' Usage: python get_region_features.py [species] [region] [features] For example: python get_region_features.py human 7:140424943-140624564 gene exon Available features : gene, transcript, cds, exon, repeat, simple, misc, variation, somatic_variation, structural_variation, somatic_structural_variation, constrained, regulatory, segmentation, motif, chipseq, array_probe """)
    else:
        species, region = cli_args[0], cli_args[1]
        features = cli_args[2:]
        client = ensembl.EnsemblRestClient()
        print(client.get_region_feature(species, region, features))
def _chain_sequence(chain, three_to_one):
    """Return the one-letter sequence of *chain*; residues whose name is not in *three_to_one* are skipped."""
    return ''.join(
        three_to_one[res.get_resname()]
        for res in chain.get_unpacked_list()
        if res.get_resname() in three_to_one)


def _uniprot_id_for_chain(pdb_id, chain_id):
    """Ask the RCSB custom-report service for the db_id of one chain; return '' when the reply has none."""
    rcsb_url = ('http://www.rcsb.org/pdb/rest/customReport?pdbids=' + pdb_id
                + '.' + chain_id
                + '&customReportColumns=chainId,db_id,db_name&service=wsdisplay&format=text')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'
    }
    reply = requests.get(rcsb_url, headers=headers)
    root = ET.fromstring(reply.content)
    uniprot_id = ''
    # If several db_id elements are present the last one wins.
    for elem in root.iter('dimEntity.db_id'):
        uniprot_id = elem.text
    return uniprot_id


def _ensembl_transcript_ids(uniprot_id):
    """Translate a UniProt accession into a list of Ensembl transcript ids via the UniProt mapping service."""
    url = 'http://www.uniprot.org/mapping/'
    params = {
        'from': 'ACC',
        'to': 'ENSEMBL_TRS_ID',
        'format': 'tab',
        'query': uniprot_id
    }
    request = urllib2.Request(url, urllib.urlencode(params))
    request.add_header('User-Agent', '******')
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as err:
        if err.code != 503:
            # Any other HTTP error leaves no response to read: propagate it
            # instead of continuing with an unbound or stale response.
            raise
        # Service unavailable: retry once; a second failure propagates.
        response = urllib2.urlopen(request)

    transcript_ids = []
    for line in response.read().split('\n'):
        fields = line.split('\t')
        # Skip blank lines, the 'From' header row and lines without a
        # second tab-separated column.
        if len(fields) > 1 and fields[0] and fields[0] != 'From':
            transcript_ids.append(fields[1])
    return transcript_ids


def _best_transcript_match(chain_seq, transcript_ids):
    """Return (transcript_id, aligned_chain_seq) of the highest-scoring alignment, or ('', '') when no score exceeds 0."""
    client = ensembl.EnsemblRestClient()
    best_score = 0
    best_match_tid = ''
    best_alignment = ''
    for transcript_id in transcript_ids:
        sequence = client.get_sequence(transcript_id, type="protein")['seq']
        alns = pairwise2.align.globalxs(chain_seq, sequence, -5, 0)
        # Only the first reported alignment is considered; element 2 is
        # compared as its score and element 0 is kept as the aligned chain.
        if alns and alns[0][2] > best_score:
            best_score = alns[0][2]
            best_match_tid = transcript_id
            best_alignment = alns[0][0]
    return best_match_tid, best_alignment


def pdb_seq_mapping(pdb_id):
    """Match every chain of a PDB structure to an Ensembl transcript.

    The structure is parsed from ``<pdb_id>.pdb`` with ``PDBParser``.  For
    each distinct chain id (repeated ids are processed only once):

    1. the one-letter sequence is built from the twenty standard residues;
    2. the chain's UniProt id is requested from the RCSB custom-report
       service;
    3. the UniProt id is translated to Ensembl transcript ids through the
       UniProt mapping service;
    4. each transcript's protein sequence is globally aligned to the chain
       sequence and the best-scoring transcript is kept.

    One ``[pdb_id + '.' + chain_id, transcript_id, aligned_chain_sequence]``
    entry per chain is appended to the local ``result`` list.

    NOTE(review): no ``return`` statement is visible in the reviewed
    excerpt; callers presumably receive ``result`` - confirm.

    Args:
        pdb_id: PDB identifier; also the stem of the ``.pdb`` file to parse.

    Raises:
        SystemExit: after printing a message, when no UniProt id is found
            for a chain.
        urllib2.HTTPError: when the mapping request fails with a status
            other than 503, or fails again after the single 503 retry.
    """
    # Three-letter -> one-letter amino-acid codes.
    aaCodes = {
        'GLY': 'G',  # GLYCINE
        'PRO': 'P',  # PROLINE
        'ALA': 'A',  # ALANINE
        'VAL': 'V',  # VALINE
        'LEU': 'L',  # LEUCINE
        'ILE': 'I',  # ISOLEUCINE
        'MET': 'M',  # METHIONINE
        'CYS': 'C',  # CYSTEINE
        'PHE': 'F',  # PHENYLALANINE
        'TYR': 'Y',  # TYROSINE
        'TRP': 'W',  # TRYPTOPHAN
        'HIS': 'H',  # HISTIDINE
        'LYS': 'K',  # LYSINE
        'ARG': 'R',  # ARGININE
        'GLN': 'Q',  # GLUTAMINE
        'ASN': 'N',  # ASPARAGINE
        'GLU': 'E',  # GLUTAMIC ACID
        'ASP': 'D',  # ASPARTIC ACID
        'SER': 'S',  # SERINE
        'THR': 'T',  # THREONINE
    }

    parser = PDBParser()
    pdb = parser.get_structure(pdb_id, pdb_id + '.pdb')

    result = []  # one entry per processed chain
    processed_chains = set()
    for chain in pdb.get_chains():
        chainId = chain.get_id()
        if chainId in processed_chains:
            continue
        processed_chains.add(chainId)

        chainSeq = _chain_sequence(chain, aaCodes)

        uniprotId = _uniprot_id_for_chain(pdb_id, chainId)
        if not uniprotId:  # this should never happen
            print('UniprotId not found !')
            sys.exit()

        transcriptIds = _ensembl_transcript_ids(uniprotId)
        best_match_tid, best_alignment = _best_transcript_match(
            chainSeq, transcriptIds)
        result.append([pdb_id + '.' + chainId, best_match_tid, best_alignment])