def _parse_variant_feature(feature):
    """Convert a variant feature record into a dictionary of annotations.

    ``feature`` is read positionally: items 1 to 4 are taken as the start
    position, the stop position, the free-text description and the variant
    identifier.

    The result always holds ``pos``, ``pos_stop``, ``desc``, ``variant_id``,
    ``variant_url`` and ``pmids``.  ``old_aa``/``new_aa``/``prot_change``,
    ``rsid`` and ``review`` are only present when the description contains
    the corresponding pattern.
    """
    variant = {
        'pos': feature[1],
        'pos_stop': feature[2],
        'desc': feature[3],
        'variant_id': feature[4],
    }
    description = variant['desc']

    url_template = 'http://web.expasy.org/variant_pages/{}.html'
    variant['variant_url'] = url_template.format(variant['variant_id'])

    # Amino-acid change written as "<old> -> <new>" in upper-case letters.
    aa_change = re.search(r'(?P<old_aa>[A-Z]+) -> (?P<new_aa>[A-Z]+)',
                          description)
    if aa_change:
        old_aa, new_aa = aa_change.groups()
        variant['old_aa'] = old_aa
        variant['new_aa'] = new_aa
        variant['prot_change'] = 'p.{}{}{}'.format(
            seq3(old_aa), variant['pos'], seq3(new_aa))

    variant['pmids'] = re.findall(r'PubMed:(\d+)', description)

    rsid_hit = re.search(r'dbSNP:(rs\d+)', description)
    if rsid_hit:
        assert len(rsid_hit.groups()) == 1
        variant['rsid'] = rsid_hit.group(1)

    # Text between the first "(" and the last ")" with the dbSNP
    # cross-reference stripped out.
    bracketed = re.search(r'\((.+)\)', description)
    if bracketed:
        variant['review'] = re.sub(r'; dbSNP:rs\d+', '', bracketed.group(1))

    return variant
def test_seq1_seq3(self):
    """Check that seq1 and seq3 round-trip a mixed-case sample sequence."""
    three_letter = (
        "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr")
    one_letter = "MAYWCTKLIGPQNAPYKSHWKT"
    three_letter_upper = three_letter.upper()

    # Direct conversions in both directions.
    self.assertEqual(seq1(three_letter), one_letter)
    self.assertEqual(seq3(one_letter).upper(), three_letter_upper)

    # Round trips in both directions.
    self.assertEqual(seq1(seq3(one_letter)), one_letter)
    self.assertEqual(seq3(seq1(three_letter)).upper(), three_letter_upper)
def residues(self):
    """Return the residues of ``self.name`` as ``(resn, resi, structured)`` tuples.

    The structured residues are collected from the atoms of the PyMOL model
    named ``self.name`` and sorted by residue number; each carries ``True``
    as third item.  The FASTA string of chain ``self.chain`` (hetero atoms
    excluded) is then compared with the first structured residue:

    * case 1: numbering starts at 1 and agrees with the sequence;
    * case 3: numbering starts above 1 and the sequence starts at that
      same residue;
    * case 4: numbering starts above 1 while the sequence starts at
      residue 1; the leading residues missing from the structure are
      prepended with ``False`` as third item.

    Raises:
        NotImplementedError: for any other numbering (including a first
            residue number below 1).
    """
    atoms = self.pymol.cmd.get_model(self.name).atom
    rset = {(at.resn, int(at.resi), True) for at in atoms}
    r = sorted(rset, key=lambda x: x[1])

    # Drop the FASTA header line and join the wrapped sequence lines.
    fasta = self.pymol.cmd.get_fastastr(
        f'{self.name} and chain {self.chain} and not hetatm')
    s = ''.join(fasta.split('\n')[1:])

    first_stated_resn = seq3(s[0]).upper()
    first_structured_resn, first_structured_resi = r[0][0], r[0][1]
    unsupported = f'First residues are {r[0:3]} but the sequence is {s}'

    if first_structured_resi == 1:
        if first_stated_resn != first_structured_resn:
            # case 2. negative numbers??!
            raise NotImplementedError(unsupported)
        # case 1. all is good.
    elif first_structured_resi > 1:
        if first_stated_resn == first_structured_resn:
            # case 3. The first residue that exists is not 1.
            pass
        elif seq3(s[first_structured_resi - 1]).upper() == first_structured_resn:
            # case 4. the first stated residue is 1, but does not exist.
            # s[i] is residue i + 1, so the unstructured residues
            # 1 .. first_structured_resi - 1 are s[0] .. s[resi - 2].
            # (The previous range(1, resi) skipped residue 1 and added the
            # first structured residue a second time.)
            missing = [(seq3(s[i]).upper(), i + 1, False)
                       for i in range(first_structured_resi - 1)]
            r = missing + r
        else:
            raise NotImplementedError(unsupported)
    else:
        raise NotImplementedError(unsupported)
    return r
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two proteins.

    Also give the position at which the proteins start to differ and the
    end positions (to be compatible with the in_frame_description function).

    >>> out_of_frame_description('MTAPQQMT*', 'MTAQQMT*')
    ('p.(Pro4Glnfs*5)', 3, 9, 8)
    >>> out_of_frame_description('MTAPQQMT*', 'MTAQMT*')
    ('p.(Pro4Glnfs*4)', 3, 9, 7)
    >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT*')
    ('p.(Pro4Glnfs*5)', 3, 8, 8)
    >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT')
    ('p.(Pro4Glnfs*?)', 3, 8, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    # Compare the sequences without their trailing stop symbols; the returned
    # end positions are still based on the unstripped inputs.
    original = s1.rstrip("*")
    mutated = s2.rstrip("*")
    original_len = len(original)
    mutated_len = len(mutated)
    shared = len(longest_common_prefix(original, mutated))
    mutated_has_stop = "*" in s2

    if shared == mutated_len:
        # NonSense mutation.
        if shared == original_len:
            # Is this correct?
            return ("p.(=)", 0, 0, 0)
        description = "p.(%s%i*)" % (seq3(s1[shared]), shared + 1)
        return (description, shared, len(s1), shared)

    if shared == original_len:
        # Extension past the original stop.
        # http://www.hgvs.org/mutnomen/FAQ.html#nostop
        stop = str(abs(original_len - mutated_len)) if mutated_has_stop else "?"
        description = "p.(*%i%sext*%s)" % (
            original_len + 1, seq3(s2[original_len]), stop)
        return (description, original_len, len(s1), len(s2))

    # Frame shift.
    # http://www.hgvs.org/mutnomen/FAQ.html#nostop
    stop = str(mutated_len - shared + 1) if mutated_has_stop else "?"
    description = "p.(%s%i%sfs*%s)" % (
        seq3(s1[shared]), shared + 1, seq3(s2[shared]), stop)
    return (description, shared, len(s1), len(s2))
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two proteins.

    Also give the position at which the proteins start to differ and the
    end positions (to be compatible with the in_frame_description function).

    >>> out_of_frame_description('MTAPQQMT*', 'MTAQQMT*')
    ('p.(Pro4Glnfs*5)', 3, 9, 8)
    >>> out_of_frame_description('MTAPQQMT*', 'MTAQMT*')
    ('p.(Pro4Glnfs*4)', 3, 9, 7)
    >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT*')
    ('p.(Pro4Glnfs*5)', 3, 8, 8)
    >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT')
    ('p.(Pro4Glnfs*?)', 3, 8, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    # Trailing stop symbols are ignored for the comparison only.
    s1_seq = s1.rstrip('*')
    s2_seq = s2.rstrip('*')
    n1, n2 = len(s1_seq), len(s2_seq)
    lcp = len(longest_common_prefix(s1_seq, s2_seq))

    if lcp == n2:
        # NonSense mutation.
        if lcp == n1:
            # Is this correct?
            return ('p.(=)', 0, 0, 0)
        return ('p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1),
                lcp, len(s1), lcp)

    if lcp == n1:
        # http://www.hgvs.org/mutnomen/FAQ.html#nostop
        if '*' in s2:
            stop = unicode(abs(n1 - n2))
        else:
            stop = '?'
        extension = 'p.(*%i%sext*%s)' % (n1 + 1, seq3(s2[n1]), stop)
        return (extension, n1, len(s1), len(s2))

    # http://www.hgvs.org/mutnomen/FAQ.html#nostop
    if '*' in s2:
        stop = unicode(n2 - lcp + 1)
    else:
        stop = '?'
    frame_shift = 'p.(%s%i%sfs*%s)' % (
        seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), stop)
    return (frame_shift, lcp, len(s1), len(s2))
def process_aa_position(self, bdb, gene, r, ra, record, genepos):
    """Validate an amino-acid change and attach the matching Effect to ``ra``.

    The reference residue ``r.AAref`` is checked against the translation of
    ``record.seq`` at ``r.AApos``.  When ``genepos`` is given, ``r.ALT`` is
    spliced into the sequence and the resulting residue is checked against
    ``r.AAalt`` (frameshifts, ``"fs"``, are validated on the length of
    ``r.ALT`` instead).  An existing ``Effect`` row with the same transcript,
    gene, organism, position and residues is reused; otherwise a new one is
    created and saved.

    Returns:
        The ``Effect`` assigned to ``ra.effect``, or ``None`` when a
        validation fails (a message is written to ``self.stderr``).
    """
    ref_ann = str(record.seq.translate())[r.AApos - 1]
    if r.AAref != ref_ann:
        self.stderr.write((
            "Reference AA does not match the current annotation '%s' != '%s' "
            % (ref_ann, r.AAref)) + json.dumps(r.to_dict()))
        return None

    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; the alternative-allele checks are kept
    # under ``if genepos`` because they depend on ``alt_ann`` - confirm.
    if genepos:
        # NOTE(review): the suffix starts at genepos - 2 + len(ALT); confirm
        # this offset against the coordinate convention of ``genepos``.
        mut = record.seq[:genepos - 1] + Seq(
            r.ALT) + record.seq[genepos - 2 + len(r.ALT):]
        alt_ann = str(mut.translate())[r.AApos - 1]
        if r.AAalt == "fs" and (len(r.ALT) - 1) % 3 != 0:
            self.stderr.write(("Invalid FS" + json.dumps(r.to_dict())))
            return None
        if (r.AAalt != "fs") and (alt_ann != r.AAalt):
            # Report the values that were actually compared (the message
            # previously printed the reference residues again).
            self.stderr.write((
                "Alternative AA does not match the nucleotide mutation effect '%s' != '%s' "
                % (alt_ann, r.AAalt)) + json.dumps(r.to_dict()))
            return None

    effect_query = Effect.objects.filter(transcript=r.LocusTag, gene=gene,
                                         ref_organism=bdb, aa_pos=r.AApos,
                                         aa_ref=r.AAref, aa_alt=r.AAalt)
    if not effect_query.exists():
        if "fs" == r.AAalt:
            variant_type = 'frameshift_variant'
        elif "STOP" == r.AAalt:
            variant_type = 'stop_gained'
        elif r.AAref != r.AAalt:
            variant_type = "missense_variant"
        else:
            variant_type = 'synonymous_variant'

        new_effect = Effect(transcript=r.LocusTag, ref_organism=bdb,
                            variant_type=variant_type, gene=gene)
        new_effect.aa_pos = r.AApos
        new_effect.aa_ref = r.AAref
        # Frameshifts and stops are both stored as "*".
        new_effect.aa_alt = "*" if r.AAalt in ["fs", "STOP"] else r.AAalt
        new_effect.hgvs_p = (
            (seq3(new_effect.aa_ref) if new_effect.aa_ref != "*" else "*")
            + str(new_effect.aa_pos)
            + (seq3(new_effect.aa_alt) if new_effect.aa_alt != "*" else "*"))
        new_effect.save()
    else:
        new_effect = effect_query.get()

    ra.effect = new_effect
    ra.save()
    return new_effect
def new_cc(sequences, coords):
    """Build a ``PDB.Model.Model`` with one chain per sequence.

    Every residue gets five atoms (N, CA, C, O, CB) whose coordinates are
    taken from consecutive rows of ``coords``.  Chains are lettered from
    ``'A'`` onwards and residues are numbered from 1.  The number of residues
    per chain is limited by both the length of the first sequence and the
    number of coordinate rows available per chain.
    """
    # (atom name, padded full name, element) for each atom of a residue.
    atom_specs = (
        ('N', ' N ', 'N'),
        ('CA', ' CA ', 'C'),
        ('C', ' C ', 'C'),
        ('O', ' O ', 'O'),
        ('CB', ' CB ', 'C'),
    )
    bfactor = 30
    occupancy = 1
    altloc = ' '
    segid = ' '

    n_cc_helices = len(sequences)
    seq_len = int(
        min(len(sequences[0]) * 5, coords.shape[0] / n_cc_helices) / 5)

    new_model = PDB.Model.Model(0)
    serial_number = 1
    first_chain_code = ord('A')
    for chain_i, sequence in enumerate(sequences):
        new_chain = PDB.Chain.Chain(chr(first_chain_code + chain_i))
        for res_i in range(1, seq_len + 1):
            resname = seq3(sequence[res_i - 1]).upper()
            new_res = PDB.Residue.Residue((' ', res_i, ' '), resname, segid)
            for name, fullname, element in atom_specs:
                # Serial numbers are 1-based; coordinate rows are 0-based.
                new_res.add(
                    PDB.Atom.Atom(name, coords[serial_number - 1], bfactor,
                                  occupancy, altloc, fullname, serial_number,
                                  element))
                serial_number += 1
            new_chain.add(new_res)
        new_model.add(new_chain)
    return new_model
def mutate_codon(codon_in, codon_use_table):
    """Select a synonymous codon in accordance with the frequency of use
    in the host organism.

    Args:
        codon_in (Bio.Seq.Seq): A single codon.
        codon_use_table (dict{str, list[list, list]}): A dictionary with
            each amino acid three-letter code as keys, and a list of two
            lists as values. The first list is the synonymous codons that
            encode the amino acid, the second is the frequency with which
            each synonymous codon is used.

    Returns:
        Bio.Seq.Seq: A new codon. ``codon_in`` is returned unchanged when
        no other synonymous codon can be drawn.
    """
    AA = seq3(
        CodonTable.standard_dna_table.forward_table[str(codon_in)]).upper()

    synonymous_codons, codon_use_freq = codon_use_table[AA]
    if len(synonymous_codons) == 1:
        return codon_in

    # The rejection sampling below only terminates if some *other* codon
    # can be drawn; without this guard it would spin forever when every
    # alternative has zero weight.
    has_alternative = any(
        freq > 0
        for codon, freq in zip(synonymous_codons, codon_use_freq)
        if not (codon_in == codon))
    if not has_alternative:
        return codon_in

    # pick new codon
    codon_out = codon_in
    while codon_in == codon_out:
        codon_out = random.choices(synonymous_codons, codon_use_freq).pop()

    logger.detail("Mutating {} codon from {} to {}".format(
        AA, codon_in, codon_out))

    return codon_out
def get_protein():
    '''
    Translate the DNA sequence entered by the user.

    Parameters
    __________
    None; the global variable sequence_text is read.

    Return
    ______
    Nothing. A showinfo box displays the three-letter amino acid sequence
    with residues separated by dashes; a showerror box is shown instead when
    the input is empty or its length is not a multiple of 3.'''
    entered_dna = str(sequence_text.get())
    if len(entered_dna) % 3 != 0 or len(entered_dna) == 0:
        messagebox.showerror(
            "!",
            "Sequence needs to be a multiple of 3 or sequence should not be 0."
        )
        return

    protein = Seq(entered_dna, IUPAC.unambiguous_dna).transcribe().translate()
    three_letter_aa = str(seq3(protein))
    # Insert a dash between consecutive three-letter codes.
    dashed = "-".join(
        three_letter_aa[i:i + 3] for i in range(0, len(three_letter_aa), 3))
    messagebox.showinfo("!", "Final Converted Protein is: " + dashed)
def back_translate(self):
    """Return the DNA sequence from an amino acid sequence by creating a new Seq object.

    Each residue is replaced by the first codon listed for it in
    ``CodonUsage.SynonymousCodons``, so the result has three nucleotides
    per residue.  Residues are matched case-insensitively.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein
    Seq('MAIVMGR', IUPACProtein())
    >>> len(my_protein.back_translate())
    21

    Trying to back-translate a DNA or RNA sequence raises an exception:

    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUG", IUPAC.unambiguous_rna)
    >>> messenger_rna.back_translate()
    Traceback (most recent call last):
       ...
    ValueError: Nucleic acids cannot be back translated!
    """
    base = Bio.Alphabet._get_base_alphabet(self.alphabet)
    if not isinstance(base, Bio.Alphabet.ProteinAlphabet):
        raise ValueError("Nucleic acids cannot be back translated!")

    # right now this just uses the most-prevalent codon for each AA
    # TODO: select codons with a weighted average using random.choice
    # Upper-case first: seq3() only recognises upper-case one-letter codes,
    # so a lower-case residue would otherwise end in a KeyError.
    return Seq(
        "".join(
            CodonUsage.SynonymousCodons[seq3(AA).upper()][0]
            for AA in str(self).upper()
        ),
        IUPAC.unambiguous_dna,
    )
def back_translate(self):
    """Return the DNA sequence from an amino acid sequence by creating a new Seq object.

    The first codon in the synonymous codons list is always chosen for each
    amino acid; codon optimization is required after back translation.
    Residues are upper-cased before the lookup.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein
    Seq('MAIVMGR', IUPACProtein())
    >>> len(my_protein.back_translate())
    21

    Trying to back-translate a DNA or RNA sequence raises an exception:

    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUG", IUPAC.unambiguous_rna)
    >>> messenger_rna.back_translate()
    Traceback (most recent call last):
       ...
    ValueError: Nucleic acids cannot be back translated!
    """
    base_alphabet = Bio.Alphabet._get_base_alphabet(self.alphabet)
    if not isinstance(base_alphabet, Bio.Alphabet.ProteinAlphabet):
        raise ValueError("Nucleic acids cannot be back translated!")

    # always use the first codon in the synonymous codons list for each AA
    codons = []
    for residue in str(self).upper():
        three_letter_code = seq3(residue).upper()
        codons.append(CodonUsage.SynonymousCodons[three_letter_code][0])
    return Seq("".join(codons), IUPAC.unambiguous_dna)
def show(seq, start=0, stop=None, width=None, peprep=3):
    """
    Display a nucleotide or peptide sequence together with a graduated ruler.

    arguments:
        seq: Seq or SeqRecord to display.
        start and stop: limits of the displayed region. Both must be
            integers (int).
        width: width of the display. width must be a multiple of 10.
        peprep: 1 or 3: representation of the amino acids. By default
            peptide sequences are shown with 3 letters per residue.

    Nucleotide sequences are delegated to shown(), peptide sequences to
    showp(); None is returned for anything else.
    """
    alphabet = testalpha(seq)
    if (alphabet == generic_nucleotide
            or alphabet == IUPAC.unambiguous_dna
            or alphabet == IUPAC.unambiguous_rna):
        return shown(seq, start=start, stop=stop, width=width)

    letters_per_residue = langage(seq)
    if letters_per_residue == 1 or letters_per_residue == 3:
        # A one-letter peptide is expanded when 3-letter output is requested.
        if peprep == 3 and letters_per_residue == 1:
            seq = seq3(toSeq(seq))
        return showp(seq, start=start, stop=stop, width=width)
    return None
def write_pdb(self, pdb_path):
    """
    Write a pdb file by threading the query sequence on the template
    backbone (N, CA and C) coordinates.

    Aligned positions where either the query or the template residue is
    a gap ("-") are skipped.

    Args:
        pdb_path (str): Path of the pdb file to create.
    """
    atom_line = (
        "{:6s}{:5d} {:^4s} {:>3s}{:>2s}{:4d}{:>12.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}{:>12s}\n"
    )
    # (atom name, template residue attribute, element) in output order.
    backbone = (("N", "n_atom", "N"), ("CA", "ca_atom", "C"),
                ("C", "c_atom", "C"))

    with open(pdb_path, "w") as file:
        # Extra informations on the template used to generate the pdb file
        file.write(
            "REMARK Threading of query sequence on the {:s} template #{:d}.\n"
            .format(self.template.name, self.num))

        count_atom = 1
        residue_numbers = range(self.query.first, self.query.last + 1)
        for ind, count_res in enumerate(residue_numbers):
            res_t = self.template.residues[ind]
            res_q = self.query.residues[ind]
            if res_q.name == "-" or res_t.name == "-":
                continue
            for atom_name, attribute, element in backbone:
                coords = getattr(res_t, attribute).coords
                file.write(
                    atom_line.format("ATOM", count_atom, atom_name,
                                     seq3(res_q.name).upper(), "A", count_res,
                                     coords[0], coords[1], coords[2], 1.00, 0,
                                     element))
                count_atom += 1

        # The last line of the created pdb file
        file.write("END\n")
def prepare_output(AAs, count_enriched_AAs):
    """Return the enriched count of every amino acid in ``AAs``, in order.

    Each one-letter code is converted to its upper-case three-letter code and
    looked up in ``count_enriched_AAs``; amino acids without an entry count
    as 0.
    """
    return [count_enriched_AAs.get(seq3(AA).upper(), 0) for AA in AAs]
def PrintMutationPair(ccd_pair):
    """Write every possible mutation of every codon to the file "mut_tmp".

    ``ccd_pair`` is an iterable of pairs whose first item is an identifier
    and whose second item is a nucleotide sequence.  The sequence is cut into
    consecutive triplets; for each triplet ``GenerateAllPossibleMutation``
    supplies the original amino acid and its possible mutated amino acids.
    One tab-separated line (identifier, index, triplet, original residue,
    mutated residue - both as upper-case three-letter codes) is written per
    mutation.  Triplets whose original amino acid is "*" are skipped and do
    not advance the index.
    """
    fileout = "mut_tmp"
    # The context manager closes the file even if a pair fails to process.
    with open(fileout, "w") as fileobj:
        for pair in ccd_pair:
            uid = pair[0]
            seq = pair[1]
            idx = 1
            # range() works on both Python 2 and 3 (xrange is Python 2 only).
            for start in range(0, len(seq), 3):
                nu3 = seq[start:(start + 3)]
                orig_aa, mutate_aa = GenerateAllPossibleMutation(nu3)
                if orig_aa == "*":
                    continue
                for each_mutate in mutate_aa:
                    content = [uid, idx, nu3, seq3(orig_aa).upper(),
                               seq3(each_mutate).upper()]
                    fileobj.write("\t".join(map(str, content)) + "\n")
                idx = idx + 1
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two proteins.

    Also give the position at which the proteins start to differ and the
    end positions (to be compatible with the in_frame_description function).

    >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT')
    ('p.(Pro4Glnfs*5)', 3, 8, 7)
    >>> out_of_frame_description('MTAPQQMT', 'MTAQMT')
    ('p.(Pro4Glnfs*4)', 3, 8, 6)
    >>> out_of_frame_description('MTAPQQT', 'MTAQQMT')
    ('p.(Pro4Glnfs*5)', 3, 7, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    len1 = len(s1)
    len2 = len(s2)
    lcp = len(longest_common_prefix(s1, s2))

    if lcp == len2:
        # NonSense mutation.
        if lcp == len1:
            # Is this correct?
            return ('p.(=)', 0, 0, 0)
        nonsense = 'p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1)
        return (nonsense, lcp, len1, lcp)

    if lcp == len1:
        # The mutated protein continues past the end of the original one.
        extension = 'p.(*%i%sext*%i)' % (
            len1 + 1, seq3(s2[len1]), abs(len1 - len2))
        return (extension, len1, len1, len2)

    frame_shift = 'p.(%s%i%sfs*%i)' % (
        seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), len2 - lcp + 1)
    return (frame_shift, lcp, len1, len2)
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two proteins.

    Also give the position at which the proteins start to differ and the
    end positions (to be compatible with the in_frame_description function).

    >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT')
    ('p.(Pro4Glnfs*5)', 3, 8, 7)
    >>> out_of_frame_description('MTAPQQMT', 'MTAQMT')
    ('p.(Pro4Glnfs*4)', 3, 8, 6)
    >>> out_of_frame_description('MTAPQQT', 'MTAQQMT')
    ('p.(Pro4Glnfs*5)', 3, 7, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    prefix_length = len(longest_common_prefix(s1, s2))
    reaches_end_of_s1 = prefix_length == len(s1)
    reaches_end_of_s2 = prefix_length == len(s2)

    if reaches_end_of_s2 and reaches_end_of_s1:
        # Is this correct?
        return ('p.(=)', 0, 0, 0)

    if reaches_end_of_s2:
        # NonSense mutation.
        return ('p.(%s%i*)' % (seq3(s1[prefix_length]), prefix_length + 1),
                prefix_length, len(s1), prefix_length)

    if reaches_end_of_s1:
        # Extension: s2 is s1 followed by additional residues.
        added = abs(len(s1) - len(s2))
        return ('p.(*%i%sext*%i)' % (len(s1) + 1, seq3(s2[len(s1)]), added),
                len(s1), len(s1), len(s2))

    # Frame shift starting right after the common prefix.
    new_stop = len(s2) - prefix_length + 1
    return ('p.(%s%i%sfs*%i)' % (seq3(s1[prefix_length]), prefix_length + 1,
                                 seq3(s2[prefix_length]), new_stop),
            prefix_length, len(s1), len(s2))
def labeling(self, label):
    """Return ``label``, or build a default label when ``label`` is None.

    With a single amino acid the default is its three-letter code followed
    by the 1-based position.  With several amino acids, ``self.amino_acids``
    is sorted in place and the label has the form
    ``"<features> (<aa>, <aa> or <aa>) <position>"``, where the features are
    the distinct entries of ``self.sthereochemistry`` (entries containing
    'Similar' are cut at their first "(").
    """
    # Identity test: ``== None`` would invoke a custom __eq__ on the label.
    if label is not None:
        return label

    if len(self.amino_acids) == 1:
        return seq3(self.amino_acids[0]) + str(self.position + 1)

    self.amino_acids = sorted(self.amino_acids)
    sets = sorted({
        x.split('(')[0] if 'Similar' in x else x
        for x in self.sthereochemistry
    })
    feature = ', '.join(sets) if len(sets) > 1 else sets[0]
    return '%s (%s or %s) %d' % (
        feature, ', '.join(map(seq3, self.amino_acids[:-1])),
        seq3(self.amino_acids[-1]), self.position + 1)
def _generate_template(self):
    """Load the peptide backbone template and store it in ``self._tpl``.

    The template is ``self.custom_template`` when set, otherwise the file
    ``<N>mer.pdb`` inside ``_TEMPLATES_DIR`` where N is the length of
    ``self.pep_seq``.  Residues are renamed after ``self.pep_seq``
    (upper-case three-letter codes), only the backbone atoms C, O, CA, N
    and OXT are kept, and the chain identifier is set to 'B'.
    """
    bbnames = ['C', 'O', 'CA', 'N', 'OXT']
    lgt = len(self.pep_seq)
    if self.custom_template is None:
        # Build the file name first: "/" binds tighter than "+", so the
        # unparenthesised form tried to add a str to the joined path.
        tpl = str(_TEMPLATES_DIR / (str(lgt) + 'mer.pdb'))
    else:
        tpl = self.custom_template
    tpl = prody.parsePDB(tpl)
    for r, newname in zip(tpl.iterResidues(), self.pep_seq):
        r.setResname(seq3(newname).upper())

    tpl = tpl.select('name ' + ' '.join(bbnames)).copy()
    tpl.setChids('B')
    self._tpl = tpl
def __proteinDescription(self):
    """
    Give the HGVS description of the raw variant stored in this class.

    Note that this function relies on the absence of values to make the
    correct description. Also see the comment in the class definition.

    @returns: The HGVS description of the raw variant stored in this class.
    @rtype: unicode
    """
    if self.type == "unknown":
        return "?"
    if not self.start:
        return "="

    # Leading residue(s): the deleted residues if any, "*" for an
    # extension, the start residue otherwise.
    if self.deleted:
        pieces = ["%s" % seq3(self.deleted)]
    elif self.type == "ext":
        pieces = ['*']
    else:
        pieces = ["%s" % seq3(self.startAA)]

    pieces.append("%i" % self.start)
    if self.end:
        pieces.append("_%s%i" % (seq3(self.endAA), self.end))
    # These types are expressed through the residues, not a keyword.
    if self.type not in ["subst", "stop", "ext", "fs"]:
        pieces.append(self.type)
    if self.inserted:
        pieces.append("%s" % seq3(self.inserted))

    descr = "".join(pieces)
    if self.type == "stop":
        return descr + '*'
    if self.term:
        return descr + "%s*%i" % (self.type, self.term)
    return descr
def to_string_aminoacids3(item, group_indices='all', check=True):
    """Convert a one-letter amino acid string into three-letter codes.

    When ``check`` is true the arguments are first passed through
    ``digest_item`` and ``digest_group_indices``.  The conversion itself is
    done by ``Bio.SeqUtils.seq3``.

    Raises:
        LibraryNotFoundError: if biopython cannot be imported.
    """
    if check:
        digest_item(item, 'string:aminoacids1')
        group_indices = digest_group_indices(group_indices)

    try:
        from Bio.SeqUtils import seq3
    except ImportError:
        # Only a failed import means the library is missing; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        raise LibraryNotFoundError('biopython')

    tmp_item = seq3(item)

    return tmp_item
def mutate_codon(codon_in, codon_use_table):
    """Pick a different synonymous codon, weighted by host codon usage.

    ``codon_use_table`` maps the upper-case three-letter amino acid code to
    a pair ``(synonymous codons, usage frequencies)``.  ``codon_in`` is
    returned unchanged when its amino acid has a single codon or when no
    other synonymous codon has a positive frequency.
    """
    AA = seq3(
        CodonTable.standard_dna_table.forward_table[str(codon_in)]).upper()

    synonymous_codons, codon_use_freq = codon_use_table[AA]
    if len(synonymous_codons) == 1:
        return codon_in

    # Without another drawable codon the sampling loop below never ends.
    if not any(freq > 0
               for codon, freq in zip(synonymous_codons, codon_use_freq)
               if not (codon_in == codon)):
        return codon_in

    # pick new codon
    codon_out = codon_in
    while codon_in == codon_out:
        codon_out = random.choices(synonymous_codons, codon_use_freq).pop()

    # Log the whole input codon (indexing it with [1] printed one base only).
    logger.detail("mutating [{0}] codon from {1} to {2}".format(
        AA, codon_in, codon_out))

    return codon_out
def resample_codons(dna_sequence, codon_use_by_aa):
    """Build a new DNA sequence by redrawing a codon for every residue.

    Args:
        dna_sequence: Sequence object whose ``translate()`` result is
            iterated residue by residue.
        codon_use_by_aa: Mapping from upper-case three-letter amino acid
            code to the positional arguments of ``random.choices``
            (the candidate codons, then their weights).

    Returns:
        Seq: The concatenated codons, with the ``IUPAC.unambiguous_dna``
        alphabet.
    """
    drawn_codons = []
    for AA in dna_sequence.translate():
        choice_args = codon_use_by_aa[seq3(AA).upper()]
        drawn_codons.append(random.choices(*choice_args)[0])
    return Seq("".join(drawn_codons), IUPAC.unambiguous_dna)
def get_protein():
    """Translate the DNA typed by the user and show it in a message box.

    Reads the global ``sequence_text``.  An error box is shown when the
    input is empty or its length is not a multiple of 3; otherwise an info
    box shows the three-letter residues separated by dashes.
    """
    str_sequence_text = str(sequence_text.get())
    usable = len(str_sequence_text) != 0 and len(str_sequence_text) % 3 == 0
    if not usable:
        messagebox.showerror(
            "!",
            "Sequence needs to be a multiple of 3 or sequence should not be 0."
        )
        return

    dna = Seq(str_sequence_text, IUPAC.unambiguous_dna)
    protein = dna.transcribe().translate()
    three_letter_aa = str(seq3(protein))

    # Cut the string into three-letter residues and join them with dashes.
    residues = []
    for offset in range(0, len(three_letter_aa), 3):
        residues.append(three_letter_aa[offset:offset + 3])
    messagebox.showinfo(
        "!", "Final Converted Protein is: " + "-".join(residues))
def get_percentage_aa(value):
    """Return one bar trace of amino acid percentages per FASTA record.

    The FASTA file is selected with ``choose_fasta(value)``.  For each
    record the amino acid fractions from ``ProteinAnalysis`` are scaled to
    percentages and plotted against the three-letter residue names.  The
    trace is named after the second ":"-separated field of the first
    "|"-separated part of the record id.
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for record in SeqIO.parse(file_fasta, "fasta"):
            chain_fields = record.id.split("|")[0].split(":")
            analysis = pp.ProteinAnalysis(str(record.seq))
            fractions = analysis.get_amino_acids_percent()

            aa_names = [seq3(letter) for letter in fractions]
            aa_percentages = [fraction * 100
                              for fraction in fractions.values()]
            traces.append(
                go.Bar(x=aa_names, y=aa_percentages,
                       name="Chain " + chain_fields[1]))
    return traces
def mutate_codon(codon_in, codon_use_table):
    """Select a synonymous codon in accordance with the frequency of use
    in the host organism.

    Args:
        codon_in (Bio.Seq.Seq): A single codon.
        codon_use_table (dict): Maps each upper-case three-letter amino
            acid code to a pair of lists: the synonymous codons and the
            frequency with which each of them is used.

    Returns:
        Bio.Seq.Seq: A new codon, or ``codon_in`` itself when there is no
        other synonymous codon with a positive frequency.
    """
    amino_acid = seq3(
        CodonTable.standard_dna_table.forward_table[str(codon_in)]).upper()
    synonymous_codons, codon_use_freq = codon_use_table[amino_acid]
    if len(synonymous_codons) == 1:
        return codon_in

    # The loop below redraws until the codon changes, so it can only end
    # when some other codon has a non-zero chance of being drawn.
    alternatives = [
        freq for codon, freq in zip(synonymous_codons, codon_use_freq)
        if not (codon_in == codon)
    ]
    if not any(freq > 0 for freq in alternatives):
        return codon_in

    # pick new codon
    codon_out = codon_in
    while codon_in == codon_out:
        codon_out = random.choices(synonymous_codons, codon_use_freq).pop()

    return codon_out
def resample_codons(dna_sequence, codon_use_table):
    """Generate a new DNA sequence by swapping synonymous codons.

    Codons are selected in accordance with their frequency of occurrence in
    the host organism.

    Args:
        dna_sequence (Bio.Seq.Seq): A read-only representation of
            the DNA sequence.
        codon_use_table (dict{str, list[list, list]}): A dictionary with
            each amino acid three-letter code as keys, and a list of two
            lists as values. The first list is the synonymous codons that
            encode the amino acid, the second is the frequency with which
            each synonymous codon is used.

    Returns:
        Bio.Seq.Seq: A read-only representation of the new DNA sequence.
    """

    def draw_codon(residue):
        # One weighted draw among the codons of this residue.
        return random.choices(*codon_use_table[seq3(residue).upper()])[0]

    protein = dna_sequence.translate()
    resampled_dna = "".join(draw_codon(AA) for AA in protein)
    return Seq(resampled_dna, IUPAC.unambiguous_dna)
# Second filtering pass: drop duplicate records, then append the survivors to
# the GFF file of this sequence.
# NOTE(review): statement nesting was reconstructed from a
# whitespace-collapsed source - confirm against the original layout.
list_of_records = accepted
if verbose:
    print len(list_of_records), 'in the middle'

# Keep the first occurrence of every record; two records are duplicates when
# their first six fields are equal.
accepted = []
while len(list_of_records) > 0:
    r = list_of_records.pop(0)
    duplicate = False
    for q in accepted:
        if r[0:6] == q[0:6]:
            duplicate = True
            break
    if not duplicate:
        accepted.append(r)

# Highest value of field 3 first.
list_of_records = sorted(accepted, key=lambda x: float(x[3]), reverse=True)
if verbose:
    print len(list_of_records), 'after filter'

# Append one tab-separated tRNA line per record to "<seqid>.gff".
with open(seqid + ".gff", 'a') as gff:
    for rec in list_of_records:
        strand = rec[5]
        start = rec[1]
        end = rec[2]
        # Attribute column: product (three-letter code of the first character
        # of field 0), anticodon, anticodon position, label and structure.
        aux = 'product=tRNA-' + seq3(
            rec[0][0]) + ';anticodon=(' + rec[4].lower(
            ) + ');anticodon_position=' + str(
                rec[6]) + ';label=' + rec[0] + ';structure=' + rec[7]
        linia_output = seqid + '\t' + 'infernal\ttRNA' + '\t' + start + '\t' + end + '\t' + str(
            rec[3]) + '\t' + strand + '\t.\t' + aux + '\n'
        # Records starting beyond the end of the sequence are not written.
        if int(start) <= sequence_len:
            gff.write(linia_output)
#Method 2: Direct Translation aa = dna_seq.translate() print(aa) #Custom stop codon print(mrna.translate(stop_symbol="@")) #Back transcribe to DNA, same as dna_seq print(mrna.back_transcribe()) #Join steps print(dna_seq.transcribe().translate()) #3 letter version of protein three_letter_aa = seq3(str(aa)) print(three_letter_aa) #Back to 1 letter version of protein same as original aa one_letter_aa = seq1(str(three_letter_aa)) print(one_letter_aa) #Methods on Bio.Data print(dir(CodonTable)) #CodonTable for DNA by name print(CodonTable.unambiguous_dna_by_name["Standard"]) #CodonTable for RNA by name print(CodonTable.unambiguous_rna_by_name["Standard"])
def load_cdna_and_polyA(paths,organism,pep_dict,exon_info,failed):
    """Locate each protein inside its cDNA and record polyA information.

    Every cDNA record of the FASTA file ``paths["cdna"]`` whose
    (gene, transcript id) pair is a key of ``pep_dict`` is compared with the
    proteins stored under that key.  The coding region is located in one of
    three ways:

    * easy: the protein equals the translation of the whole cDNA (frame 0);
    * cutting: the protein occurs inside the translation of one of the
      three frames;
    * alignment: otherwise blastp is run against each frame and the
      highest-scoring HSP is used; if there is no hit, a line is written
      to ``failed``.

    For a located region, ``findPolyA`` and ``grab_AAA_information`` are
    called with the output file "./Data/<organism>_polyA.data".  A
    ``schema.Cdna`` row is added to ``db.session``.  Progress is printed in
    steps of 10%.

    NOTE(review): statement nesting was reconstructed from a
    whitespace-collapsed source; in particular the level of the final
    ``schema.Cdna`` / ``db.session.add`` statements should be confirmed.
    NOTE(review): ``organism_out`` is not closed in the code shown here.
    """
    organism_out = open("./Data/"+organism+"_polyA.data",'w')
    cdna = list(SeqIO.parse(open(paths["cdna"],'r'),"fasta"))
    cdna_size = len(cdna)
    cdna_counter = 0
    next_step = 0
    for cd in cdna:
        # Progress report every 10%.
        cdna_counter += 1
        if (float(cdna_counter*100)/cdna_size) >= next_step:
            next_step += 10
            print str(int(float(cdna_counter*100)/cdna_size))+"%"
        # "key:value" words of the FASTA description (first word skipped).
        description = dict([z.split(":",1) for z in cd.description.split()[1:] if ":" in z])
        cdna_transcript_id = cd.id
        cdna_gene = description["gene"]
        gt = (cdna_gene,cdna_transcript_id)
        if gt in pep_dict:
            for p in pep_dict[gt]:
                protein_id = p.id
                gene_id = str(gt[0])
                # Three-letter codes are used on both sides, so an offset in
                # these strings equals an offset in nucleotides.
                pep_sequence = str(seq3(p.seq))
                cdna_sequence = str(cd.seq)
                cdna_translated_list = []
                cdna_start = 0
                cdna_stop = 0
                # Translate the three forward frames; each is padded with
                # "N" before translation.
                for x in range(3):
                    cdna_translated_list.append(seq3(str(Seq(cdna_sequence[x:]+"N"*(3-len(cdna_sequence[x:])%3)).translate())))
                # Frames whose translation contains the protein.
                cut_found = [v for v in range(len(cdna_translated_list)) if pep_sequence in cdna_translated_list[v]]
                #easy
                if pep_sequence == cdna_translated_list[0]:
                    cdna_start = 0
                    cdna_stop = len(cdna_sequence)
                    proper_seq = cdna_sequence
                    AAA_list = findPolyA(proper_seq)
                    grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                #cutting
                elif cut_found:
                    for c in cut_found:
                        # Frame offset plus position of the protein within
                        # that frame's translation.
                        idx = c+ cdna_translated_list[c].find(pep_sequence)
                        cdna_start = idx
                        cdna_stop = idx+len(pep_sequence)
                        proper_seq = cdna_sequence[idx:(idx+len(pep_sequence))]
                        AAA_list = findPolyA(proper_seq)
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                #alignment
                else:
                    # Write the protein (one-letter codes) as blastp query.
                    prot_seq = SeqRecord(Seq(seq1(pep_sequence)),id = "prot_seq")
                    y = open(organism+"prot.fasta",'w')
                    SeqIO.write(prot_seq, y, "fasta")
                    y.close()
                    best = []
                    for i in range(3):
                        # Translation of frame i, trimmed to whole codons,
                        # with stops written as "W"; used as blastp subject.
                        cdna_seq = SeqRecord(Seq(cdna_sequence[i:len(cdna_sequence)-((len(cdna_sequence)-i)%3)]).translate(stop_symbol="W"),id="cdna_seq")
                        k = open(organism+"cdna.fasta",'w')
                        SeqIO.write(cdna_seq, k , "fasta")
                        k.close()
                        output = NcbiblastpCommandline(query=organism+"prot.fasta", subject=organism+"cdna.fasta", outfmt=5)()[0]
                        blast_result_records = list(NCBIXML.parse(StringIO(output)))
                        for bl_res in blast_result_records:
                            for z in bl_res.alignments:
                                for h in z.hsps:
                                    best.append((h.query,h.sbjct,i,h.sbjct_start, h.query_start,h.score))
                    if best:
                        # Highest score wins; convert the subject position
                        # (residues) back to nucleotides in frame l[2].
                        l = sorted(best,key=lambda x:x[-1])[-1]
                        proper_seq = cdna_sequence[l[2]+(int(l[3])-1)*3:l[2]+((int(l[3])-1)+len(l[1]))*3]
                        AAA_list = findPolyA(proper_seq)
                        cdna_start = l[2]+(int(l[3])-1)*3
                        cdna_stop = l[2]+((int(l[3])-1)+len(l[1]))*3
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                    else:
                        failed.write(",".join([protein_id,cdna_transcript_id,gene_id,pep_sequence,cdna_sequence])+"\n")
                    # Remove the temporary blast input files.
                    os.remove(organism+"cdna.fasta")
                    os.remove(organism+"prot.fasta")
                # Note: rebinding ``cdna`` here does not affect the running
                # ``for cd in cdna`` iteration.
                cdna = schema.Cdna(transcript_id = cdna_transcript_id, gene_id = cdna_gene, nucleotide_sequence=str(cd.seq),organism_name =organism, cdna_start = cdna_start, cdna_stop =cdna_stop)
                db.session.add(cdna)
def get_aa_comp(v, alphabet, seq):
    """Summarise the amino acid composition of the selection ``seq[v[0]:v[1]]``.

    Returns:
        * ``''`` when ``v`` is None or has fewer than two items;
        * an empty ``html.Table`` when slicing raises ``TypeError``;
        * an error string when a 'dna'/'rna' selection cannot be translated;
        * otherwise an ``html.Table`` with one row per distinct amino acid
          (three-letter code and count, most frequent first).  For 'dna' and
          'rna' with a non-empty summary the table is returned in a list,
          preceded by a line showing the translated protein.
    """
    if v is None or len(v) < 2:
        return ''

    try:
        subsequence = seq[v[0]:v[1]]
    except TypeError:
        return html.Table([])

    # default - file represents a protein
    aa_string = subsequence
    needs_translation = alphabet in ('dna', 'rna')
    if alphabet == 'dna' or alphabet == 'rna':
        # remove partial codons
        leftover = len(subsequence) % 3
        if leftover != 0:
            subsequence = subsequence[:-leftover]
        if alphabet == 'dna':
            s = Seq(subsequence, generic_dna)
            failure = "Sequence does not represent DNA."
        else:
            s = Seq(subsequence, generic_rna)
            failure = "Sequence does not represent RNA."
        try:
            aa_string = str(s.translate())
        except TranslationError:
            return failure

    # all unique amino acids
    amino_acids = list(set(aa_string))
    aa_counts = []
    for aa in amino_acids:
        aa_counts.append({'aa': seq3(aa), 'count': aa_string.count(aa)})

    # sort by most common AA in sequence
    aa_counts.sort(key=lambda entry: entry['count'], reverse=True)

    summary = []
    for aac in aa_counts:
        summary.append(
            html.Tr([html.Td(aac['aa']), html.Td(str(aac['count']))]))

    # include explanation for translation if necessary
    if needs_translation and len(summary) > 0:
        return [
            '(Protein translated from {}: {})'.format(
                alphabet.upper(), aa_string),
            html.Table(summary),
        ]
    return html.Table(summary)
def get_all_codons(vcf_rec,fasta_dict,three_letter_abbr=True):
    """
    Takes a vcf and ref_fasta_dictionary, and returns a list of Record objects
    with codons and aminoacid translations using the standard alphabet.

    CRITICALLY, this functions assumes that all sequences are inframe. It will
    work well for CDS and handcurated genes.

    Each record gains the attributes CODON_POS, CODON_START, CODON_STOP,
    CODON_REF, CODON_ALT, AMINO_REF and AMINO_ALT.  Amino acids are given as
    three-letter codes only when ``three_letter_abbr`` is ``True``; otherwise
    the translations are stored as returned by ``translate()``.

    :param vcf_rec: vcf iterator obj from PyVCF4.1
    :param fasta_dict: Dictionary of Bio.Seq objs ident
    :param three_letter_abbr: use three-letter amino acid codes (default True)
    :return: list of modified Record objects that can used write out a file of SNPs
    """
    ret_list = []
    for rec in vcf_rec:
        # NOTE(review): looks like leftover debug output; kept so that the
        # function's console output is unchanged.
        print(rec.FILTER)

        # 1-based position of the SNP inside its codon:
        #   POS % 3 == 1 -> [1]23, == 2 -> 1[2]3, == 0 -> 12[3]
        codon_pos = (rec.POS - 1) % 3 + 1
        # 0-based, end-exclusive slice of the reference covering the codon.
        start = rec.POS - codon_pos
        stop = start + 3

        # get codons and amino acids for mutations
        rec.CODON_POS = codon_pos
        rec.CODON_START = start
        rec.CODON_STOP = stop
        rec.CODON_REF = fasta_dict[rec.CHROM].seq[start:stop]
        rec.CODON_ALT = []
        rec.AMINO_REF = rec.CODON_REF.translate()
        if three_letter_abbr is True:
            rec.AMINO_REF = seq3(rec.AMINO_REF)
        rec.AMINO_ALT = []

        # ALT alleles come in a list, so alt codons must follow suit.
        for a in rec.ALT:
            # codon_pos - 1 adjusts for 0-based indexing when replacing the
            # reference base with the ALT allele.
            alt_bases = list(rec.CODON_REF)
            alt_bases[codon_pos - 1] = a
            alt_codon = Seq(''.join([str(nuc) for nuc in alt_bases]),)
            rec.CODON_ALT.append(alt_codon)

            alt_amino = alt_codon.translate()
            if three_letter_abbr is True:
                alt_amino = seq3(alt_amino)
            # Previously both branches appended the three-letter code, so
            # three_letter_abbr=False was ignored for ALT alleles.
            rec.AMINO_ALT.append(alt_amino)

        ret_list.append(rec)
    return ret_list
def in_frame_description(s1, s2):
    """
    Give a description of an inframe difference of two proteins. Also give
    the position at which the proteins start to differ and the positions at
    which they are the same again.

        >>> in_frame_description('MTAPQQMT*', 'MTAQQMT*')
        ('p.(Pro4del)', 3, 4, 3)
        >>> in_frame_description('MTAPQQMT*', 'MTAQMT*')
        ('p.(Pro4_Gln5del)', 3, 5, 3)
        >>> in_frame_description('MTAPQQT*', 'MTAQQMT*')
        ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ*')
        ('p.(*9Metext*2)', 8, 9, 11)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ')
        ('p.(*9Metext*?)', 8, 9, 10)

    Trailing ``*`` characters are stripped from both proteins before they are
    compared; whether ``s2`` contained a ``*`` at all is remembered and only
    used when describing an extension.

    @arg s1: The original protein.
    @type s1: str
    @arg s2: The mutated protein.
    @type s2: str

    @return: A tuple of:
        - str ; Protein description of the change.
        - int ; First position of the change.
        - int ; Last position of the change in the first protein.
        - int ; Last position of the change in the second protein.
    @rtype: tuple(str, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    @todo: Refactor this code (too many return statements).
    """
    s2_stop = '*' in s2
    s1 = s1.rstrip('*')
    s2 = s2.rstrip('*')

    if s1 == s2:
        # Nothing happened.
        return ('p.(=)', 0, 0, 0)

    # Trim the shared prefix, then the shared suffix of what remains; the
    # differing region is s1[lcp:s1_end] versus s2[lcp:s2_end].
    lcp = len(longest_common_prefix(s1, s2))
    lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:]))
    s1_end = len(s1) - lcs
    s2_end = len(s2) - lcs

    # Insertion / Duplication / Extension: nothing of s1 is left between the
    # common prefix and the common suffix.
    if not s1_end - lcp:
        if len(s1) == lcp:
            # The whole of s1 is a prefix of s2: the protein is extended.
            # http://www.hgvs.org/mutnomen/FAQ.html#nostop
            # The integer is formatted through %s, so no py2-only unicode()
            # conversion is needed.
            stop = abs(len(s1) - len(s2)) if s2_stop else '?'

            return ('p.(*%i%sext*%s)' %
                    (len(s1) + 1, seq3(s2[len(s1)]), stop),
                    len(s1), len(s1) + 1, len(s2) + (1 if s2_stop else 0))

        ins_length = s2_end - lcp

        # The inserted residues equal the residues directly preceding the
        # insertion point: describe it as a duplication.
        if (lcp - ins_length >= 0 and
                s1[lcp - ins_length:lcp] == s2[lcp:s2_end]):
            if ins_length == 1:
                return ('p.(%s%idup)' %
                        (seq3(s1[lcp - ins_length]), lcp - ins_length + 1),
                        lcp, lcp, lcp + 1)
            return ('p.(%s%i_%s%idup)' %
                    (seq3(s1[lcp - ins_length]), lcp - ins_length + 1,
                     seq3(s1[lcp - 1]), lcp),
                    lcp, lcp, lcp + ins_length)

        return ('p.(%s%i_%s%iins%s)' %
                (seq3(s1[lcp - 1]), lcp, seq3(s1[lcp]), lcp + 1,
                 seq3(s2[lcp:s2_end])),
                lcp, lcp, s2_end)

    # Deletion / Inframe stop: nothing of s2 is left between the common
    # prefix and the common suffix.
    if not s2_end - lcp:
        if len(s2) == lcp:
            # The whole of s2 is a prefix of s1: s1 was cut short.
            return ('p.(%s%i*)' % (seq3(s1[len(s2)]), len(s2) + 1),
                    lcp, len(s1) + 1, len(s2) + 1)

        if lcp + 1 == s1_end:
            return ('p.(%s%idel)' % (seq3(s1[lcp]), lcp + 1),
                    lcp, lcp + 1, lcp)
        return ('p.(%s%i_%s%idel)' %
                (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end),
                lcp, s1_end, lcp)

    # Substitution: exactly one residue differs on both sides.
    if s1_end == s2_end and s1_end == lcp + 1:
        return ('p.(%s%i%s)' % (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp])),
                lcp, lcp + 1, lcp + 1)

    # InDel.
    if lcp + 1 == s1_end:
        return ('p.(%s%idelins%s)' %
                (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp:s2_end])),
                lcp, lcp + 1, s2_end)
    return ('p.(%s%i_%s%idelins%s)' %
            (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end,
             seq3(s2[lcp:s2_end])),
            lcp, s1_end, s2_end)
def get_3letter_aa(one_letter_aa):
    """Return the three-letter rendering of ``one_letter_aa`` via ``seq3``.

    ``seq3`` is called with a custom map of ``{"*": "***"}`` and an undefined
    code of ``'---'``.

    NOTE(review): ``seq3`` is presumably Bio.SeqUtils.seq3, in which case
    ``*`` is rendered as ``***`` and unrecognised letters as ``---`` --
    confirm against the file's imports.
    """
    stop_overrides = {"*": "***"}
    unknown_placeholder = '---'
    return seq3(
        one_letter_aa,
        custom_map=stop_overrides,
        undef_code=unknown_placeholder,
    )
file.write(json.dumps(family))  # Writes dictionary to file for safe keeping.
file.close()

# Write the catalytic residue information for later use, in three-letter code:
# one row per family member that is not already listed in `template`.
with open('%s_family_residues.txt' % (args.family), 'w') as g:
    g.write('ID Res1 Pos1 Res2 Pos2 Res3 Pos3\n')
    for i in family:
        if i not in template:
            g.write(i)
            for residue in family[i]:
                # presumably each residue looks like "<one-letter code><position>"
                # (e.g. "H57") -- verify against where `family` is built.
                three = seq3(residue[0:1])  # Converts one- to three-letter code.
                position = str(residue[1:])
                three_res = "%s %s" % (three, position)
                g.write(" %s" % (three_res.upper()))
            g.write('\n')
# No explicit g.close(): the with-statement closes the file on exit.

os.chdir('../templates/')

#########################PART ii: Catalytic constraints generator#############################

template = []
residue1 = []
residue2 = []
pairing = []
CA1CA2 = []
CB1CB2 = []