def scoreAlignmentList(alignmentList):
    """Score every alignment on its translated, gap-free sequences.

    Each alignment is passed through ``gaplessStrings``; element 0 is used
    as the reference and element 1 as the query. Both are translated and,
    when neither protein contains a stop ('*'), the pair is scored with
    ``score_pairwise`` (using ``blosum62`` and the arguments -5 and -1) and
    the summed score is recorded.

    Alignments with at least one stop are not scored directly. After all
    alignments were seen, each of them contributes
    ``number_of_stops * lowest_clean_score`` instead.

    Note that the penalty entries are appended after the directly computed
    scores, so the returned list is NOT in the order of ``alignmentList``.
    """
    clean_scores = []
    stop_counts = []
    # Starts at zero, so only negative scores can ever lower it.
    lowest = 0
    for alignment in alignmentList:
        gapless = gaplessStrings(alignment)
        ref_protein = translate(gapless[0])
        query_protein = translate(gapless[1])
        n_stops = ref_protein.count('*') + query_protein.count('*')
        if n_stops:
            stop_counts.append(n_stops)
            continue
        score = sum(score_pairwise(ref_protein, query_protein, blosum62, -5, -1))
        clean_scores.append(score)
        if score < lowest:
            lowest = score
    clean_scores.extend(n_stops * lowest for n_stops in stop_counts)
    return clean_scores
def break_up_frame(s):
    """Yield (offset, nucleotides, protein) for each candidate in one frame.

    The sequence is cut after every in-frame match of ``re_stops``. Each
    piece is either chopped to its first in-frame start and translated as a
    CDS (``options.ftype == "CDS"``) or translated as-is up to the stop.
    Pieces whose protein is shorter than ``options.min_len`` are dropped.
    When ``options.ends == "open"`` the trailing piece without a stop codon
    is processed too, after trimming it to whole codons.
    """
    want_cds = options.ftype == "CDS"
    start = 0
    for match in re_stops.finditer(s):
        index = match.start() + 3
        if index % 3:
            # Stop codon is not in this reading frame.
            continue
        n = s[start:index]
        if want_cds:
            offset, n, t = start_chop_and_trans(n)
        else:
            offset, t = 0, translate(n, options.table, to_stop=True)
        if n and len(t) >= options.min_len:
            yield start + offset, n, t
        start = index
    if options.ends != "open":
        return
    # No stop codon, Biopython's strict CDS translate will fail
    n = s[start:]
    # Ensure we have whole codons by dropping the 1 or 2 trailing bases.
    # TODO - Try appending N instead?
    n = n[:len(n) - len(n) % 3]
    if want_cds:
        offset, n, t = start_chop_and_trans(n, strict=False)
    else:
        offset, t = 0, translate(n, options.table, to_stop=True)
    if n and len(t) >= options.min_len:
        yield start + offset, n, t
def get_frame(geneseq, gene_HXB2, genename, VERBOSE=0):
    '''Get the frame by aligning the proteins

    Parameters:
       geneseq: iterable of characters, the gene sequence to be framed
       gene_HXB2: iterable of characters, the reference gene
       genename: gene name; for 'tat1'/'rev1' the reference is trimmed at
         its end to whole codons, for 'tat2'/'rev2' at its start
       VERBOSE: accepted for interface compatibility, not used here

    Returns:
       the offset (0, 1 or 2) whose translation has the best local
       alignment score against the translated reference, as returned by
       numpy.argmax
    '''
    from seqanpy import align_local
    from Bio.Seq import translate
    from numpy import argmax

    geneseq = ''.join(geneseq)
    gene_HXB2 = ''.join(gene_HXB2)

    if genename in ('tat1', 'rev1'):
        gene_HXB2 = gene_HXB2[:len(gene_HXB2) - (len(gene_HXB2) % 3)]
    elif genename in ('tat2', 'rev2'):
        gene_HXB2 = gene_HXB2[len(gene_HXB2) % 3:]

    prot_HXB2 = translate(gene_HXB2)

    scores = []
    # range() instead of xrange() so this also runs on Python 3
    for frame in range(3):
        tmp = geneseq[frame:]
        # Drop the trailing partial codon before translating
        tmp = tmp[:len(tmp) - (len(tmp) % 3)]
        tmp = translate(tmp)
        (score, _, _) = align_local(prot_HXB2, tmp)
        scores.append(score)

    return argmax(scores)
def test(dna,AA,codeline,iAA,fAA,cds,strand):
    """Sanity-check one codon substitution described by ``codeline``.

    ``codeline`` is comma separated. The text between the first and second
    comma is the codon start site, between the second and third comma the
    initial codon, between the third and the LAST comma the codon end site,
    and everything after the last comma the final codon. The sites are used
    as ``dna[start - 1:end]``.

    Returns True only if all of these hold, otherwise False:
      * ``AA`` equals the first annotated translation of ``cds``;
      * ``dna`` carries the initial codon at the stated position;
      * the initial codon translates to ``iAA``;
      * the final codon translates to ``fAA``.
    For ``strand == -1`` both codons are reverse complemented before being
    translated.
    """
    firstBreak = codeline.find(',')
    secondBreak = codeline.find(',', firstBreak + 1)
    thirdBreak = codeline.find(',', secondBreak + 1)
    fourthBreak = codeline.rfind(',')

    codonStartSite = int(codeline[firstBreak + 1:secondBreak])
    initialCodon = codeline[secondBreak + 1:thirdBreak]
    codonEndSite = int(codeline[thirdBreak + 1:fourthBreak])
    finalCodon = codeline[fourthBreak + 1:]

    TranslatableInitialCodon = initialCodon
    TranslatableFinalCodon = finalCodon
    if strand == -1:
        TranslatableInitialCodon = reverse_complement(initialCodon)
        TranslatableFinalCodon = reverse_complement(finalCodon)

    # TEST CASES
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if AA != cds.qualifiers['translation'][0]:
        # protein seqs must match up
        print("AA seqs not equal")
        return False
    observedCodon = dna[codonStartSite - 1:codonEndSite]
    if observedCodon != initialCodon:
        # codon that is being modified must be where it is supposed to be
        print(observedCodon + '!=' + initialCodon + " :so codeline doesnt match up")
        return False
    if translate(TranslatableInitialCodon) != iAA:
        # starting codon must encode what it is supposed to
        return False
    if translate(TranslatableFinalCodon) != fAA:
        # final codon must encode what it is supposed to
        return False
    return True
def six_frame_translations(seq, genetic_code=1):
    """Formatted string showing the 6 frame translations and GC content.

    nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    e.g.
    from Bio.SeqUtils import six_frame_translations
    print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))

    The header carries the current local time, so the output is not
    reproducible between calls.
    """
    from Bio.Seq import reverse_complement, translate

    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        # Translate whole codons only; a trailing partial codon adds nothing
        # to the protein.
        fragment_length = 3 * ((length - i) // 3)
        frames[i + 1] = translate(seq[i : i + fragment_length], genetic_code)
        frames[-(i + 1)] = reverse(translate(anti[i : i + fragment_length], genetic_code))

    # create header
    if length > 20:
        short = "%s ... %s" % (seq[:10], seq[-10:])
    else:
        short = seq
    # TODO? Remove the date as this would spoil any unit test...
    date = time.strftime("%y %b %d, %X", time.localtime(time.time()))
    header = "GC_Frame: %s, " % date
    for nt in ["a", "t", "g", "c"]:
        header += "%s:%d " % (nt, seq.count(nt.upper()))

    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (short.lower(), length, GC(seq))
    res = header

    for i in range(0, length, 60):
        subseq = seq[i : i + 60]
        csubseq = comp[i : i + 60]
        # Floor division: p is used as a slice index and must stay an int.
        p = i // 3
        res = res + "%d/%d\n" % (i + 1, p + 1)
        # str.join iterates the characters itself; map(None, ...) only
        # worked on Python 2.
        res = res + " " + " ".join(frames[3][p : p + 20]) + "\n"
        res = res + " " + " ".join(frames[2][p : p + 20]) + "\n"
        res = res + " ".join(frames[1][p : p + 20]) + "\n"
        # seq
        res = res + subseq.lower() + "%5d %%\n" % int(GC(subseq))
        res = res + csubseq.lower() + "\n"
        # - frames
        res = res + " ".join(frames[-2][p : p + 20]) + " \n"
        res = res + " " + " ".join(frames[-1][p : p + 20]) + "\n"
        res = res + " " + " ".join(frames[-3][p : p + 20]) + "\n\n"
    return res
def check_translation(sequence, translation, table=None):
    """Check three ways of translating ``sequence`` against ``translation``.

    Compared are the Seq method, the ``translate`` function on the object
    and the ``translate`` function on its string form. With ``table=None``
    every call is made without a table argument, otherwise ``table`` is
    passed positionally.

    Returns True when all three agree with ``translation``; raises
    ValueError on the first disagreement.
    """
    table_args = () if table is None else (table,)
    if (translation == str(sequence.translate(*table_args))
            and translation == str(translate(sequence, *table_args))
            and translation == translate(str(sequence), *table_args)):
        return True
    if table is None:
        raise ValueError("%s -> %s" % (sequence, translation))
    raise ValueError("%s -> %s (table %s)"
                     % (sequence, translation, table))
def get_our_costs_at_Rihn(costs_Rihn, costs_ours, data, aa_mutation_rates):
    '''Get our costs at their positions

    Looks up our 'median' cost at every position listed in costs_Rihn,
    converts the entries to numbers ('<0.001' becomes 0, '>0.1' becomes 1)
    and, for every mutation in costs_Rihn, also computes a target-specific
    cost via fitness_cost_mutation.

    Returns:
       (comp, c_IQD_target_specfic): a DataFrame with the columns 'ours'
       and 'Rihn', and a numpy array with one fitness_cost_mutation result
       per mutation.
    '''
    rihn_by_pos = costs_Rihn.set_index('pos')
    c = costs_ours.set_index('pos').loc[costs_Rihn['pos']]['median']

    # Entries outside the measured range are stored as these two strings.
    censored = {'<0.001': 0, '>0.1': 1}
    # Shift applied to the position parsed from the mutation label before
    # indexing the 'pol' data -- NOTE(review): presumably the offset of the
    # protein within pol; confirm.
    pos_shift = 714

    c_float = []
    c_IQD_target_specfic = []
    for (_, mut), ci in zip(rihn_by_pos.iterrows(), c):
        c_float.append(censored[ci] if ci in censored else float(ci))

        # Label layout: first char, integer position, last char.
        label = mut['mut']
        cons = label[0]
        target_aa = label[-1]
        ipos = int(label[1:-1]) + pos_shift
        print(cons, mut['NL4-3'], translate(data['init_codon']['pol']['p2'][ipos]))
        cost = fitness_cost_mutation('pol', data, aa_mutation_rates,
                                     ipos, target_aa, nbootstraps=100)
        c_IQD_target_specfic.append(cost)

    c[:] = c_float
    c_IQD_target_specfic = np.array(c_IQD_target_specfic)
    comp = pd.concat([c, costs_Rihn.set_index('pos')['cost']], axis=1)
    comp = comp.rename(columns={'median': 'ours', 'cost': 'Rihn'})
    return comp, c_IQD_target_specfic
def find_frame(read):
    """Frame is the one with the smallest number of stop codons

    The read is first cut to a multiple of three. Each of the three offsets
    is then translated over a window one codon shorter than the read and
    its stop codons are counted. If Biopython raises a TranslationError,
    ``gap_translation`` is used on the remainder of the read instead.

    Returns the 1-based frame (1, 2 or 3) with the fewest stops; ties go to
    the lower frame. A warning is issued if the best frame still contains
    stops, and another one if a second frame has none either.
    """
    from Bio.Seq import translate
    from Bio.Data.CodonTable import TranslationError

    # cut read at multiple of three length
    rem = len(read) % 3
    if rem:
        read = read[:-rem]
    assert len(read) % 3 == 0, read
    read_len = len(read) - 3
    try:
        counts = [(translate(read[f : read_len + f]).count("*"), f + 1) for f in range(3)]
    except TranslationError:
        counts = [(gap_translation(read[f:]).count("*"), f + 1) for f in range(3)]
    sor_cnt = sorted(counts)
    stop_codons, frame = sor_cnt[0]
    if stop_codons > 0:
        warnings.warn("The sequence %s contains %dstop codons" % (read, stop_codons))
    if sor_cnt[1][0] == 0:
        # The runner-up is stop-free as well, so the choice is ambiguous.
        warnings.warn("Two frames are possible! %d and %d" % (frame, sor_cnt[1][1]))
    return frame
def PTRA(DNA,protein):
    """Return every listed genetic code table that turns DNA into protein.

    Tables 1-6 and 9-15 are tried; translation stops at the first stop
    codon. The result is the list of matching table numbers, possibly
    empty.
    """
    candidate_tables = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15]
    return [
        table_id
        for table_id in candidate_tables
        if translate(DNA, table=table_id, stop_symbol='*', to_stop=True) == protein
    ]
def get_local_codon(self, mutations, pos, mut=None):
    """Translate the codon that contains position ``pos``.

    The codon is assembled from the 'ref' entries of the three items of
    ``mutations`` that share a codon with ``pos`` (codons start at
    multiples of three). If ``mut`` is given, it replaces the character at
    ``pos`` within the codon before translation.
    """
    offset = pos % 3
    first = pos - offset
    codon = ''.join(entry['ref'] for entry in mutations[first:first + 3])
    if mut:
        codon = codon[:offset] + mut + codon[offset + 1:]
    return translate(codon)
def translate_with_gaps(seq):
    '''Translate sequence with gaps

    Codons made of three gaps ('---') become a single '-'; any other codon
    is translated with Bio.Seq.translate.

    Parameters:
       seq: string, Seq, SeqRecord, or any other sequence of characters
         whose length is a multiple of 3

    Returns:
       the protein in the same kind of container: a string, a Seq, a
       SeqRecord (id, name and description are copied), or otherwise a
       numpy array of single characters (dtype 'S1').

    Raises:
       ValueError: if the length is not a multiple of 3 or a codon mixes
         gaps and nucleotides.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq, translate
    try:
        from Bio.Alphabet.IUPAC import protein
    except ImportError:
        # Bio.Alphabet no longer exists in recent Biopython; Seq objects
        # carry no alphabet there.
        protein = None
    try:
        string_types = basestring
    except NameError:
        # Python 3 has no basestring
        string_types = str

    def make_seq(text):
        '''Wrap the protein string, with the alphabet where supported'''
        if protein is None:
            return Seq(text)
        return Seq(text, protein)

    L = len(seq)
    if L % 3:
        raise ValueError('The sequence length is not a multiple of 3')

    seqstr = ''.join(seq)
    prot = []
    for i in range(L // 3):
        codon = seqstr[3 * i: 3 * (i+1)]
        if codon == '---':
            prot.append('-')
        elif '-' in codon:
            raise ValueError('Non-aligned gaps found')
        else:
            prot.append(translate(codon))
    prot = ''.join(prot)

    # Output in various formats
    if isinstance(seq, string_types):
        return prot
    elif isinstance(seq, Seq):
        return make_seq(prot)
    elif isinstance(seq, SeqRecord):
        return SeqRecord(make_seq(prot), id=seq.id, name=seq.name,
                         description=seq.description)
    else:
        import numpy as np
        # Same content as np.fromstring(prot, 'S1'), without relying on the
        # deprecated binary mode of fromstring.
        return np.array(list(prot), dtype='S1')
def get_translation(self,sequence=None):
    """Translate ``sequence`` (or the spliced sequence) codon by codon.

    When ``sequence`` is empty or None, ``self.get_spliced_seq()`` is used.
    Trailing bases that do not fill a codon are ignored.

    Returns the translation, or None when there is no sequence or it is
    shorter than one codon.
    """
    translation = None
    seq = sequence if sequence else self.get_spliced_seq()
    if seq:
        # Floor division keeps the slice bound an int on Python 3 as well.
        seqlen = len(seq) // 3 * 3
        if seqlen >= 3:
            translation = translate(seq[:seqlen])
    return translation
def main(filename):
    """Print the first genetic code table that translates the DNA to the protein.

    ``filename`` must contain exactly two lines: the DNA string and the
    protein string. Stop codons are dropped from the translation before
    comparing.

    Raises ValueError if none of the tables known to Biopython matches.
    """
    from Bio.Data import CodonTable

    with open(filename) as fin:
        dna, prot = fin.read().strip().split('\n')
    # Table ids are not contiguous (e.g. there is no table 7 or 8), so walk
    # the ids that exist instead of counting upwards from 1.
    for table_id in sorted(CodonTable.ambiguous_generic_by_id):
        if translate(dna, table=table_id, stop_symbol='') == prot:
            print(table_id)
            return
    raise ValueError("no genetic code table translates the DNA to the protein")
def check_translation(sequence, translation, table=None):
    """Check three ways of translating ``sequence`` against ``translation``.

    ``table=None`` means table 1. Compared are the Seq method, the
    ``translate`` function on the object and on its string form.

    Returns True when all agree. Otherwise a ValueError is raised that
    names the first codon whose translation differs from the expected amino
    acid, or the whole sequence if no single codon could be blamed.
    """
    t = 1 if table is None else table
    if (translation == str(sequence.translate(t))
            and translation == str(translate(sequence, t))
            and translation == translate(str(sequence), t)):
        return True
    # More details...
    for index, amino in enumerate(translation):
        first = index * 3
        codon = sequence[first:first + 3]
        if amino != str(codon.translate(t)):
            raise ValueError("%s -> %s not %s (table %s)"
                             % (codon, amino, codon.translate(t), t))
    # Shouldn't reach this line:
    raise ValueError("%s -> %s (table %s)" % (sequence, translation, t))
def translate_read(self, read, start, end):
    """Translate the part of ``read`` that falls into [start, end).

    The read is trimmed with ``self.trim_read(..., codon=True)``, which also
    reports the offset at which translation has to begin.

    Returns (protein, protein_start), where protein_start is the codon
    index of the read relative to ``start``, clamped to 0.
    """
    trimmed_read, offset = self.trim_read(read, start, end, codon=True)
    prot_seq = translate(trimmed_read[offset:])
    # Floor division keeps this an integer index on Python 3, matching what
    # "/" did for ints on Python 2.
    prot_start = max((read.pos + offset - start) // 3, 0)
    return prot_seq, prot_start
def start_chop_and_trans(s, strict=True):
    """Return (offset, trimmed nucleotides, protein) from the first in-frame start.

    ``s`` must consist of whole codons and, in strict mode, end with one of
    ``stops``. The first match of ``re_starts`` that lies in frame is
    used: everything before it is chopped off and the rest is translated,
    as a complete CDS in strict mode, otherwise as "M" followed by the
    translation of the remaining codons up to the first stop.

    Returns (None, None, None) when there is no in-frame start.
    """
    if strict:
        assert s[-3:] in stops, s
    assert len(s) % 3 == 0
    for match in re_starts.finditer(s):
        start = match.start()
        if start % 3:
            # Start codon is out of frame
            continue
        n = s[start:]
        assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
        if not strict:
            # Use when missing stop codon,
            return start, n, "M" + translate(n[3:], options.table, to_stop=True)
        return start, n, translate(n, options.table, cds=True)
    return None, None, None
def frame(self, seq, frame, translation_table = 1):
    """Translate ``seq`` in frame 1.

    A ``frame`` outside -3..-1 and 1..3 is treated as 1. Any other frame
    than 1 raises NotImplementedError.
    """
    on_reverse = -3 <= frame <= -1
    on_forward = 1 <= frame <= 3
    if not (on_reverse or on_forward):
        frame = 1
    if frame == 1:
        return translate(seq, table=translation_table)
    #TODO - Support the frame argument
    #The old code didn't, but I can guess from
    #the code the expected 1,2,3 for the forward
    #strands and -1,-2,-3 for the reverse.
    raise NotImplementedError
def dNdS(self, reference, start, end):
    """Print the ratio dS/dN estimated from the reads over [start, end).

    ``reference`` is indexed relative to ``start`` and walked codon by
    codon. For every codon, each read that fully covers it (as returned by
    ``self.trim_read``) contributes the number of its synonymous and
    non-synonymous single-base differences, normalised by the number of
    synonymous / non-synonymous sites of the reference codon. The per-codon
    means are averaged over all codons of the region and converted with
    ``-3/4 * ln(1 - 4p/3)``.

    Codons without synonymous sites and codons not covered by any read add
    nothing to the sums, but still count in the number of codons. Reads are
    fetched from the reference named 'CONSENSUS_B_GAG_POL'.

    Nothing is returned; the value ds/dn is printed.
    """
    ps, pn = 0, 0
    # Floor division: same codon count as integer "/" gave on Python 2.
    n_codon = (end - start) // 3
    for i in range(start, end, 3):
        ref_codon = reference[i-start:i-start+3]
        ref_aa = translate(ref_codon)
        s_i, n_i = self._dNdS_sites(ref_codon)
        if s_i == 0:
            continue
        m_i = 0
        inner_s, inner_n = 0, 0
        reads = self.samfile.fetch('CONSENSUS_B_GAG_POL', start, end)
        for read in reads:
            trimmed_read = self.trim_read(read, i, i+3, codon=False)
            if len(trimmed_read) < 3:
                # Read does not span the whole codon.
                continue
            m_i += 1
            sij, nij = 0, 0
            for j, nt in enumerate(trimmed_read):
                if nt == ref_codon[j]:
                    continue
                mut_codon = ref_codon[:j] + nt + ref_codon[j+1:]
                if translate(mut_codon) == ref_aa:
                    sij += 1
                else:
                    nij += 1
            inner_s += sij / s_i
            inner_n += nij / n_i
        if m_i == 0:
            # No read covers this codon; there is nothing to average and
            # dividing would raise ZeroDivisionError.
            continue
        ps += inner_s / m_i
        pn += inner_n / m_i
    ps /= float(n_codon)
    pn /= float(n_codon)
    ds = -.75 * np.log(1 - 4*ps/3)
    dn = -.75 * np.log(1 - 4*pn/3)
    print(ds/dn)
def _dNdS_sites(self, codon): syn = 0 non = 0 alphabet = 'ACGT' aa = translate(codon) if len(codon) < 3: return syn, non for i in range(3): for mut in alphabet: if mut == codon[i]: continue mut_codon = codon[:i] + mut + codon[i+1:] syn_flag = (aa == translate(mut_codon)) syn += syn_flag non += (not syn_flag) syn /= 3. non /= 3. assert syn + non == 3 return syn, non
def transl(seq):
    """Translate ``seq`` codon by codon, keeping gap and padding codons.

    The codons come from ``codons(seq)``. '---' becomes '-', '...' becomes
    '.', and every other codon is translated.
    """
    placeholders = (('---', '-'), ('...', '.'))
    protein = ""
    for codon in codons(seq):
        for pattern, symbol in placeholders:
            if codon == pattern:
                protein += symbol
                break
        else:
            protein += translate(codon)
    return protein
def six_frames(seq, genetic_code=1):
    '''
    input a DNA sequence
    each frame is cut down to a whole number of codons before translation
    return the six protein translations in a dict with the keys
    '+0', '+1', '+2' (forward strand) and '-0', '-1', '-2' (reverse
    complement), where the digit is the number of skipped leading bases
    '''
    rev = reverse_complement(seq)
    total = len(seq)
    frames = {}
    for skip in (0, 1, 2):
        remaining = total - skip
        stop = skip + remaining - remaining % 3
        frames['%+d' % skip] = translate(seq[skip:stop], genetic_code)
        frames['-%d' % skip] = translate(rev[skip:stop], genetic_code)
    return frames
def writeSTF():
    """Search all six translated frames of the record for every feature variation.

    The record sequence is cut to whole codons and translated in three
    rotations (the bases skipped at the front are appended at the end), on
    both the sequence and its reverse complement. Every translation is
    extended by its own first ``len(feature) - 1`` residues so that matches
    running over the end are found as well.

    Each variation's sequence is used as a regular expression. For every
    match, ``addFeatureSTF`` (forward frames) or ``addFeatureComplSTF``
    (reverse complement frames) is called; those read the current match and
    variation from the module globals set here.
    """
    global difference, seqRecordToCheck, seqRecordToCheckComplement, variation, featureName, featureSeq, seqLength, m

    difference = len(record.seq) % 3
    full_sequence = str(record.seq)
    seqRecordToCheck = full_sequence[:-difference] if difference != 0 else full_sequence
    seqRecordToCheckComplement = str(reverse_complement(seqRecordToCheck))

    def rotated_frames(nucleotides):
        # Frames 1-3; the skipped leading bases are moved to the end.
        return [
            translate(nucleotides),
            translate(nucleotides[1::] + nucleotides[0]),
            translate(nucleotides[2::] + nucleotides[0:2]),
        ]

    # Reading Frames
    forward_frames = rotated_frames(seqRecordToCheck)
    # Reading Frames (reverseComplement)
    complement_frames = rotated_frames(seqRecordToCheckComplement)

    for variation in featureStatistic_container[feature]:
        featureName = variation.note
        featureSeq = str(variation.seq)
        featureLength = len(variation.seq)
        seqLength = len(seqRecordToCheck)
        wrap = featureLength - 1

        # Find Matches -- all searches are set up before any callback runs.
        forward_hits = [
            re.finditer(featureSeq, protein + protein[0:wrap])
            for protein in forward_frames
        ]
        complement_hits = [
            re.finditer(featureSeq, protein + protein[0:wrap])
            for protein in complement_frames
        ]

        for hits in forward_hits:
            for m in hits:
                addFeatureSTF()
        for hits in complement_hits:
            for m in hits:
                addFeatureComplSTF()
def ptra():
    """Print the DNA, the protein and the first genetic code table linking them.

    Reads the DNA string and the protein string from the first two lines of
    "rosalind_ptra.txt". Translation stops at the first stop codon.

    Raises ValueError if none of the tables known to Biopython matches.
    """
    from Bio.Data import CodonTable

    with open("rosalind_ptra.txt") as f:
        dna = f.readline().strip()
        prot = f.readline().strip()
    print(dna)
    print(prot)
    # Table ids are not contiguous (e.g. there is no table 7 or 8), so walk
    # the ids that exist instead of counting upwards from 1.
    for table in sorted(CodonTable.ambiguous_generic_by_id):
        if translate(dna, table=table, to_stop=True) == prot:
            print(table)
            return
    raise ValueError("no genetic code table translates the DNA to the protein")
def align_codon_pairwise(seqstr, refstr, **kwargs):
    '''Pairwise alignment via codons

    Both sequences are translated, the proteins are aligned globally, and
    the protein alignment is mapped back to nucleotides: an aligned amino
    acid becomes its codon, a protein gap becomes '---'.

    Parameters:
       seqstr: first nucleotide sequence, length a multiple of 3
       refstr: second nucleotide sequence, length a multiple of 3
       **kwargs: passed down to SeqAn alignment function

    Returns:
       (aliseq, aliref): the two codon-aligned nucleotide strings

    Raises:
       ValueError: if a sequence length is not a multiple of 3
    '''
    from Bio.Seq import translate
    from seqanpy import align_global

    if len(seqstr) % 3:
        raise ValueError('The length of the first sequence is not a multiple of 3')
    elif len(refstr) % 3:
        raise ValueError('The length of the second sequence is not a multiple of 3')

    seqpr = translate(seqstr)
    refpr = translate(refstr)
    (_, alis, alir) = align_global(seqpr, refpr, **kwargs)

    aliseq = []
    aliref = []
    poss = 0
    posr = 0
    # zip() instead of itertools.izip, which does not exist on Python 3
    for aas, aar in zip(alis, alir):
        if aas == '-':
            aliseq.append('---')
        else:
            aliseq.append(seqstr[poss: poss+3])
            poss += 3

        if aar == '-':
            aliref.append('---')
        else:
            aliref.append(refstr[posr: posr+3])
            posr += 3

    aliseq = ''.join(aliseq)
    aliref = ''.join(aliref)

    return (aliseq, aliref)
def writeBothToFile(seqFileName, newSeqLength, randFileName, transFileName):
    """Write a random gene and its translate/untranslate round trip to two files.

    The gene comes from ``generateRandomGene(seqFileName, newSeqLength)``
    and is written to ``randFileName``. The result of
    ``untranslate(translate(gene))`` is written to ``transFileName``. Each
    file starts with a one-line header stating the length.
    """
    # "with" closes both files even if generating or writing the sequence fails.
    with open(randFileName, "w") as randOutFile, open(transFileName, "w") as transOutFile:
        randOutFile.write("> Randomly generated gene of length %d \n" % int(newSeqLength))
        transOutFile.write("> AA translation and untranslation of randomly generated gene of length %d\n" % int(newSeqLength))
        sequence = generateRandomGene(seqFileName, newSeqLength)
        randOutFile.write(sequence)
        seqPerm = untranslate(translate(sequence))
        transOutFile.write(seqPerm)
def get_protein(record, feature):
    """Build a dict describing the protein encoded by ``feature``.

    Keys of the result:
      'identifier'  - fasta identifier derived from the feature qualifiers
      'description' - the 'product' qualifier, or '' if absent
      'sequence'    - the first annotated 'translation' qualifier if there
                      is one, otherwise our own translation of the feature
                      location on ``record``

    A message is written to stderr when the resulting sequence is empty.
    The feature itself is left unmodified.
    """
    protein = {}
    protein['identifier'] = fasta_identifier(flatten(feature.qualifiers))
    protein['description'] = mygetattr(feature.qualifiers, 'product', '')
    # prefer the annotated translation versus our own one
    if 'translation' in feature.qualifiers:
        # what if more than one translation?
        # Read the first one without popping it, so that the feature keeps
        # its annotation and repeated calls give the same answer.
        protein['sequence'] = feature.qualifiers['translation'][0]
    else:
        protein['sequence'] = translate(get_seq_0_based(record.seq.data,
                                                        feature.location.nofuzzy_start,
                                                        feature.location.nofuzzy_end,
                                                        feature.strand))
    if not len(protein['sequence']):
        sys.stderr.write("could not translate %s" % (protein['identifier']) + "\n")
    return protein
def test_the_translation_of_stops(self):
    """Check obj.translate() method with stop codons."""
    misc_stops = "TAATAGTGAAGAAGG"
    sequences = [
        Seq(misc_stops),
        Seq(misc_stops, generic_nucleotide),
        Seq(misc_stops, generic_dna),
        Seq(misc_stops, unambiguous_dna),
    ]
    # (expected protein, positional args, keyword args) for Seq.translate()
    method_cases = [
        ("***RR", (), {}),
        ("***RR", (1,), {}),
        ("***RR", ("SGC0",), {}),
        ("**W**", (), {"table": 2}),
        ("**WRR", (), {"table": 'Yeast Mitochondrial'}),
        ("**WSS", (), {"table": 5}),
        ("**WSS", (), {"table": 9}),
        ("**CRR", (), {"table": 'Euplotid Nuclear'}),
        ("***RR", (), {"table": 11}),
        ("***RR", (), {"table": '11'}),
        ("***RR", (), {"table": 'Bacterial'}),
        ("**GRR", (), {"table": 25}),
        ("", (), {"to_stop": True}),
        ("O*ORR", (), {"table": special_table}),
        ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
    ]
    # (expected protein, keyword args) for the Bio.Seq.translate() function
    function_cases = [
        ("*QWRR", {"table": Chilodonella_uncinata_table}),
        ("O*ORR", {"table": special_table}),
        ("", {"to_stop": True}),
        ("***RR", {"table": 'Bacterial'}),
        ("***RR", {"table": '11'}),
        ("***RR", {"table": 11}),
        ("**W**", {"table": 2}),
    ]
    for nuc in sequences:
        for expected, args, kwargs in method_cases:
            self.assertEqual(expected, str(nuc.translate(*args, **kwargs)))
        # These test the Bio.Seq.translate() function - move these?:
        for expected, kwargs in function_cases:
            self.assertEqual(expected, translate(str(nuc), **kwargs))

    # Single codons in upper, mixed and lower case
    codon_cases = [
        ("TAT", "Y"), ("TAR", "*"), ("TAN", "X"), ("NNN", "X"),
        ("TAt", "Y"), ("TaR", "*"), ("TaN", "X"), ("nnN", "X"),
        ("tat", "Y"), ("tar", "*"), ("tan", "X"), ("nnn", "X"),
    ]
    for codon, amino in codon_cases:
        self.assertEqual(str(Seq(codon).translate()), amino)
def translations(self):
    """
    Yield all six translations of a nucleotide sequence.

    The three frames of the sequence come first, followed by the three
    frames of its reverse complement.

    @return: A generator that produces six L{TranslatedRead} instances.
    """
    rc = self.reverseComplement().sequence
    for reverseComplemented in False, True:
        for frame in 0, 1, 2:
            if reverseComplemented:
                seq = rc
            else:
                seq = self.sequence
            # Skip 0, 1, or 2 initial bases, depending on the frame. The
            # slice is a copy, so padding it does not touch the original.
            suffix = seq[frame:]
            overhang = len(suffix) % 3
            if overhang:
                # Pad with 'N' bases up to a whole number of codons.
                suffix += 'N' * (3 - overhang)
            yield TranslatedRead(self, translate(suffix), frame,
                                 reverseComplemented)
def apply_operation():
    """Do the selected operation.

    Reads the selected codon table, the input sequence (with all whitespace
    removed) and the chosen operation from the widgets, logs them, and
    replaces the content of the output widget with the result. An unknown
    operation produces an empty result.
    """
    codon_table = codon_list.get(codon_list.curselection())
    print('Code: {}'.format(codon_table))

    seq = ''.join(input_text.get(1.0, tk.END).split())
    print('Input sequence: {}'.format(seq))

    operation = transform_var.get()
    print('Operation: {}'.format(operation))

    # Deferred calls, so that only the selected operation is executed.
    handlers = {
        'transcribe': lambda: transcribe(seq),
        'translate': lambda: translate(seq, table=codon_table, to_stop=True),
        'back transcribe': lambda: back_transcribe(seq),
    }
    handler = handlers.get(operation)
    result = handler() if handler is not None else ''

    output_text.delete(1.0, tk.END)
    output_text.insert(tk.END, result)
    print('Result: {}'.format(result))
    return
def process_seq(header, seq):
    """Print every peptide between stops found in the six frames of ``seq``.

    The sequence and its reverse complement are each translated in three
    frames without stopping at stop codons. Every translation is split at
    '*', a '*' is appended to each piece, and the pieces are stored under
    their "start:stop" nucleotide coordinates (descending for the reverse
    strand). Pieces of at least ``args['minlength']`` characters, counting
    the appended '*', are printed in FASTA style with the first word of
    ``header`` as identifier.

    Returns the number of peptides printed.
    """
    hits = 0
    # Identifier is the header up to the first blank.
    seq_id = header.split(" ", 1)[0]

    seq = Seq(seq)
    # direction1 is the direction we originally have had, 2 is the antisense strand
    # then TRANSLATE ALL POSSIBLE ORFs, do not stop at STOP codons
    dna_sequence_direction1 = seq
    dna_sequence_direction2 = dna_sequence_direction1.reverse_complement()
    translations = {}
    translations['+1'] = translate(dna_sequence_direction1)
    translations['-1'] = translate(dna_sequence_direction2)
    translations['+2'] = translate(dna_sequence_direction1[1:])
    translations['-2'] = translate(dna_sequence_direction2[1:])
    translations['+3'] = translate(dna_sequence_direction1[2:])
    translations['-3'] = translate(dna_sequence_direction2[2:])

    # get all polypeptides between stops, filter out those shorter than minlength
    polypeptides = {}
    for frame, translation in translations.items():
        peptides = translation.split('*')
        # Coordinates run downwards on the antisense strand.
        step = -1 if int(frame) < 0 else 1
        if step < 0:
            startpos = len(seq) + 1 + int(frame)
        else:
            startpos = int(frame)
        for peptide in peptides:
            peptide += '*'
            stoppos = startpos - step + step * (3 * len(peptide))
            # str() replaces Seq.tostring(), which newer Biopython lacks.
            polypeptides[str(startpos) + ':' + str(stoppos)] = str(peptide)
            startpos = stoppos + step

    for key, pepseq in polypeptides.items():
        if len(pepseq) < args['minlength']:
            continue
        startpos, stoppos = [int(e) for e in key.split(":")]
        hits += 1
        print(">%s[%s:%s]" % (seq_id, startpos, stoppos))
        print(pepseq)
    return hits
def translate_dna_prot(dna,prot):
    """Return the first id in ``ncbi_ids`` whose table translates dna to prot.

    Stop codons are dropped from the translation before comparing. Returns
    None if no table matches.
    """
    matching = (
        table_id
        for table_id in ncbi_ids
        if translate(dna, stop_symbol="", table=table_id) == prot
    )
    return next(matching, None)
def frame(self, seq, frame, translation_table=1):
    """Translate ``seq`` in the given reading frame.

    A negative ``frame`` selects the reverse complement. ``abs(frame) - 1``
    leading bases are skipped before translating.
    """
    strand = reverse_complement(seq) if frame < 0 else seq
    skipped = abs(frame) - 1
    return translate(strand[skipped:], table=translation_table)
# if we want to transcribe from the template strand (3' -> 5'): transcribe(template_dna.reverse_complement()) # transcribing back to DNA: from Bio.Seq import Seq, back_transcribe back_transcribe( messenger_rna) # just changes U -> T and gives the coding strand # 3.8 Translation (mRNA -> Protein) # Uses standard genetic code from Bio.Seq import Seq, translate from Bio.Alphabet import IUPAC messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna) translate(messenger_rna) # Direct translation (DNA -> Protein from Bio.Seq import Seq, translate from Bio.Alphabet import IUPAC coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) translate(coding_dna) # we can specify other translation tables by name translate(coding_dna, table="Vertebrate Mitochondrial") # or by NCBI number translate(coding_dna, table=2) # 3.9 Transcription and Translation
def six_frame_translations(seq, genetic_code=1):
    """Return pretty string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    The result starts with a header (base counts, abbreviated sequence,
    length and GC content), followed by one section per 60 bases: the three
    forward frames, the sequence with its GC percentage, the complement and
    the three reverse frames.

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))  # doctest: +NORMALIZE_WHITESPACE
    GC_Frame: a:5 t:0 g:8 c:5
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
    G H C N G P L
    W P L * W A A
    M A I V M G R *
    auggccauuguaaugggccgcuga 54 %
    uaccgguaacauuacccggcgacu
    A M T I P R Q
    H G N Y H A A S
    P W Q L P G S
    <BLANKLINE>
    <BLANKLINE>

    """  # noqa for pep8 W291 trailing whitespace
    from Bio.Seq import reverse_complement, reverse_complement_rna, translate

    is_rna = "u" in seq.lower()
    if is_rna:
        anti = reverse_complement_rna(seq)
    else:
        anti = reverse_complement(seq, inplace=False)  # TODO: remove inplace=False
    comp = anti[::-1]
    length = len(seq)

    frames = {}
    for offset in range(0, 3):
        # Only whole codons are translated.
        usable = 3 * ((length - offset) // 3)
        frames[offset + 1] = translate(seq[offset : offset + usable], genetic_code)
        frames[-(offset + 1)] = translate(anti[offset : offset + usable], genetic_code)[::-1]

    def row(frame_key, first, indent=""):
        # One line of amino acids for 20 codons, starting at codon `first`.
        return indent + " ".join(frames[frame_key][first : first + 20]) + "\n"

    # create header
    short = "%s ... %s" % (seq[:10], seq[-10:]) if length > 20 else seq
    res = "GC_Frame:"
    for nt in ["a", "t", "g", "c"]:
        res += " %s:%d" % (nt, seq.count(nt.upper()))
    res += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (
        short.lower(),
        length,
        GC(seq),
    )

    for i in range(0, length, 60):
        subseq = seq[i : i + 60]
        csubseq = comp[i : i + 60]
        p = i // 3
        res += "%d/%d\n" % (i + 1, p + 1)
        # + frames
        res += row(3, p, " ")
        res += row(2, p, " ")
        res += row(1, p)
        # seq
        res += subseq.lower() + "%5d %%\n" % int(GC(subseq))
        res += csubseq.lower() + "\n"
        # - frames
        res += row(-2, p)
        res += row(-1, p, " ")
        res += row(-3, p, " ") + "\n"
    return res
def dna_to_amino_acid(dna_seq):
    """Return the translation of ``dna_seq`` with the default settings of translate()."""
    amino_acids = translate(dna_seq)
    return amino_acids
from Bio.Seq import translate

# Translate the sequence stored in rosalind_prot.txt and print the protein
# without stop symbols.
with open("rosalind_prot.txt") as p:
    # Strip the trailing newline so that it is not handed to translate() as
    # part of the sequence.
    myfile = p.read().strip()

print(translate(myfile, stop_symbol=""))
def translation(DNA):
    """Return the translation of ``DNA`` up to, and excluding, the first stop codon."""
    protein = translate(DNA, to_stop=True)
    return protein
import warnings

from Bio.Alphabet import IUPAC
from Bio.Seq import Seq, translate

warnings.filterwarnings('ignore')

# Read the DNA string and prepare both strands.
with open('rosalind_orfr.txt', 'r') as f:
    seq_text = f.read().strip()

seq = Seq(seq_text, IUPAC.unambiguous_dna)
DNAs = [seq, seq.reverse_complement()]

# Translate every frame of both strands up to its first stop codon, in the
# order frame 0 forward, frame 0 reverse, frame 1 forward, ...
candidates = (
    translate(DNA[i:], to_stop=True)
    for i in range(3)
    for DNA in DNAs
)

# Keep the longest translation; on ties the earliest one wins.
longest_protein = ''
for protein in candidates:
    if len(protein) > len(longest_protein):
        longest_protein = protein

print(longest_protein)
from Bio.Seq import translate

# Read the DNA string; "with" makes sure the file is closed again.
with open("rosalind_orfr.txt", "r") as handle:
    DNA_seq = handle.read().rstrip()

# Reverse complement: replacing into lower case first keeps an already
# swapped base from being swapped back by a later replace().
rc_seq = DNA_seq.replace("A", 't').replace("C", 'g').replace("G", 'c').replace("T", 'a')[::-1].upper()

# Positions of every ATG on both strands
forward_orf_ind = [i for i in range(len(DNA_seq)-3+1) if DNA_seq[i:i+3] == 'ATG']
reverse_orf_ind = [i for i in range(len(rc_seq)-3+1) if rc_seq[i:i+3] == 'ATG']

# Translate from every ATG up to the first stop codon
list_of_AASeq = []
for idx in forward_orf_ind:
    list_of_AASeq.append(translate(DNA_seq[idx:], to_stop=True))

for idx in reverse_orf_ind:
    list_of_AASeq.append(translate(rc_seq[idx:], to_stop=True))

# print() with a single argument works on Python 2 and 3 alike
print(max(list_of_AASeq, key=len))
def check_emboss_translate(self, sequence, table=None, frame=None):
    """Run EMBOSS transeq on the sequence and compare it with Biopython.

    Sequences shorter than 100 letters are passed on the command line
    ("asis:"), longer ones through the temporary file
    "Emboss/temp_transeq.txt", which is removed again afterwards.

    The transeq output must be identical to ``sequence.translate(table)``
    and to both forms of the ``translate`` function (table 1 when ``table``
    is None), and each amino acid must match the translation of its codon.
    Nothing is returned; failures are reported as test assertions.
    """
    # TODO - Support transeq in Bio.Emboss.Applications?
    # (doesn't seem worthwhile as Biopython can do translations)

    # Setup,
    cline = exes["transeq"]
    if len(sequence) < 100:
        filename = None
        cline += " -sequence asis:%s" % sequence
    else:
        # There are limits on command line string lengths...
        # use a temp file instead.
        filename = "Emboss/temp_transeq.txt"
        SeqIO.write(SeqRecord(sequence, id="Test"), filename, "fasta")
        cline += " -sequence %s" % filename

    cline += " -auto"  # no prompting
    cline += " -filter"  # use stdout
    if table is not None:
        cline += " -table %s" % str(table)
    if frame is not None:
        cline += " -frame %s" % str(frame)

    try:
        # Run the tool,
        child = subprocess.Popen(
            str(cline),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        out, err = child.communicate()

        msg = "cline='%s'" % cline
        # Check no error output:
        self.assertEqual(err, "", msg=msg)

        # Check we could read its output
        record = SeqIO.read(StringIO(out), "fasta")

        result = child.wait()
        self.assertEqual(result, 0, msg=msg)
    finally:
        # Remove the temp file even when one of the checks above failed.
        if filename:
            os.remove(filename)

    expected_prefix = "Test" if filename else "asis"
    self.assertTrue(record.id.startswith(expected_prefix), msg=msg)

    translation = record.seq
    if table is None:
        table = 1
    self.assertEqual(translation, sequence.translate(table))
    self.assertEqual(translation, translate(sequence, table))
    self.assertEqual(translation, translate(str(sequence), table))
    # More details...
    for i, amino in enumerate(translation):
        codon = sequence[i * 3 : i * 3 + 3]
        msg = "codon %s, table %s" % (codon, table)
        self.assertEqual(amino, codon.translate(table), msg=msg)
def dna2prot(seq):
    """Translate DNA sequence to protein sequence

    Returns the protein as a plain string.
    """
    # str() gives the same text as the former Seq.tostring(), which is no
    # longer available in recent Biopython.
    return str(translate(Seq(seq)))
def collect_ccds_record(listObject, data_dict, rev=True):
    """Fetch and translate the CCDS segments of every gene in ``listObject``.

    For each gene found in ``data_dict``, the "start-stop" strings in its
    "pos" entry are sorted (descending when ``rev`` is True) and each
    segment is downloaded with Entrez.efetch, retrying for as long as the
    request fails with an IOError or HTTP exception. Segments are reverse
    complemented when ``rev`` is True, padded with "N" to whole codons
    (taking over the remainder of the previous segment at the front) and
    translated; the record's sequence is replaced by the protein.

    A gene is abandoned and reported as soon as the padded first segment
    contains a stop, or a later segment contains more than one.

    Returns a tuple of
      * dict gene -> list of translated records,
      * set of abandoned genes,
      * dict gene -> list of (start, stop) tuples in processing order,
      * dict gene -> list of [record id, nucleotide string as translated].
    """
    orderCCDS = dict()
    record_with_frame = dict()
    record_original = dict()
    # Must be a list: genes are appended below and turned into a set at the
    # end. (A dict has no append().)
    new_gene_list = list()

    for geneName in listObject:
        record_with_frame[geneName] = list()
        record_original[geneName] = list()
        try:
            ccds_object = data_dict[geneName]
        except KeyError:
            continue

        if rev == True:
            ccds_positions = sorted([(int(x.split("-")[0]), int(x.split("-")[1])) for x in ccds_object["pos"]])[::-1]
        else:
            ccds_positions = sorted([(int(x.split("-")[0]), int(x.split("-")[1])) for x in ccds_object["pos"]])

        orderCCDS[geneName] = ccds_positions
        # Number of bases by which the previous segment overshot a codon
        remaining = 0
        first_flag = False

        for seq_coord in ccds_positions:
            # Retry until the download succeeds
            flagThing = False
            while flagThing != True:
                try:
                    handle_in = Entrez.efetch(db="nucleotide", id=ccds_object["id"],
                                              rettype="fasta", strand=+1,
                                              seq_start=seq_coord[0] + 1,
                                              seq_stop=seq_coord[1] + 1)
                    record = SeqIO.read(handle_in, "fasta")
                    flagThing = True
                except (IOError, httplib.HTTPException):
                    continue

            if first_flag == False:
                # First segment of the gene
                if len(record.seq) % 3 != 0:
                    if rev == True:
                        sequenceObj = record.seq.reverse_complement() + Seq(
                            "N" * (3 - len(record.seq) % 3), generic_dna)
                        record_original[geneName].append(
                            [record.id, str(sequenceObj)])
                    else:
                        sequenceObj = record.seq + Seq(
                            "N" * (3 - len(record.seq) % 3), generic_dna)
                        record_original[geneName].append(
                            [record.id, str(sequenceObj)])
                    inseq = translate(sequenceObj)
                    if "*" in inseq:
                        new_gene_list.append(geneName)
                        break
                    remaining = len(record.seq) % 3
                    record.seq = inseq
                else:
                    if rev == True:
                        sequenceObj = record.seq.reverse_complement()
                        record_original[geneName].append(
                            [record.id, str(sequenceObj)])
                    else:
                        sequenceObj = record.seq
                        record_original[geneName].append(
                            [record.id, str(sequenceObj)])
                    remaining = len(record.seq) % 3
                    record.seq = translate(sequenceObj)
                first_flag = True

            elif first_flag == True:
                if remaining != 0:
                    # Pad the front by the overshoot of the previous segment
                    if rev == True:
                        sequenceObj = Seq(
                            "N" * (remaining), generic_dna) + record.seq.reverse_complement()
                    else:
                        sequenceObj = Seq("N" * (remaining), generic_dna) + record.seq
                    remaining = len(sequenceObj) % 3
                    if len(sequenceObj) % 3 != 0:
                        sequenceObj = sequenceObj + Seq(
                            "N" * (3 - len(sequenceObj) % 3), generic_dna)
                    record_original[geneName].append(
                        [record.id, str(sequenceObj)])
                    inseq = translate(sequenceObj)
                    record.seq = inseq
                elif len(record.seq) % 3 != 0:
                    if rev == True:
                        sequenceObj = record.seq.reverse_complement() + Seq(
                            "N" * (3 - len(record.seq) % 3), generic_dna)
                    else:
                        sequenceObj = record.seq + Seq(
                            "N" * (3 - len(record.seq) % 3), generic_dna)
                    record_original[geneName].append(
                        [record.id, str(sequenceObj)])
                    inseq = translate(sequenceObj)
                    remaining = len(record.seq) % 3
                    record.seq = inseq
                else:
                    if rev == True:
                        sequenceObj = record.seq.reverse_complement()
                    else:
                        sequenceObj = record.seq
                    record_original[geneName].append(
                        [record.id, str(sequenceObj)])
                    remaining = len(sequenceObj) % 3
                    record.seq = translate(sequenceObj)

                if record.seq.count("*") > 1:
                    new_gene_list.append(geneName)
                    break

            print(geneName, record.id, record.seq)
            record_with_frame[geneName].append(record)
            handle_in.close()

    return record_with_frame, set(new_gene_list), orderCCDS, record_original
from Bio import SeqIO
from Bio.Seq import Seq, transcribe, translate
import sys

file_out = 'gene_seq_out.fasta'

if __name__ == '__main__':
    # Input FASTA files are the command-line arguments.  argv[0] is this
    # script, so it is skipped (the previous slice, argv[:1], selected only
    # the script itself).
    file_in = sys.argv[1:]
    # Open the output file once; every record of every input is written to it.
    with open(file_out, 'w') as f_out:
        # file_in is a list, so there may be more than one file to process.
        for filename in file_in:
            # Materialise the records so an empty file can be detected; each
            # record is composed of seq, id, name and description.
            with open(filename, mode='r') as f_in:
                my_sequence_recorded_iterable = list(SeqIO.parse(f_in, 'fasta'))
            # An input without records only produces a warning on stderr.
            if not my_sequence_recorded_iterable:
                print(f'WARNING file: {filename}: Empty file', file=sys.stderr)
            for seq_record in my_sequence_recorded_iterable:
                # The description originally contains the id; it could be
                # trimmed here with
                # seq_record.description=' '.join(seq_record.description.split()[1:])
                # Write each record unchanged to the output file as FASTA.
                SeqIO.write(seq_record, f_out, 'fasta')
                # Print the record id and its translation to stdout.
                print('>' + seq_record.id)
                print(translate(seq_record.seq))
#!/usr/bin/python
''' translate nucleotide sequences into protein sequences '''

import sys,argparse
import rjvbio.seq
import Bio.SeqIO,Bio.SeqRecord
from Bio.Seq import translate, Seq

ap = argparse.ArgumentParser(description=__doc__,formatter_class=argparse.ArgumentDefaultsHelpFormatter)
ap.add_argument('--inp',nargs='+',required=True,type=str,help='input sequence file(s)')
ap.add_argument('--inpformat',default='fasta',type=str,help='format of input file(s), eg fasta,fastq,genbank see http://biopython.org/wiki/SeqIO#File_Formats for details')
ap.add_argument('--out',default='STDOUT',type=str,help='output FASTA file')
conf = ap.parse_args()

# FASTA is a text format, so the output file is opened in text mode; a
# binary handle rejects the str data written to it on Python 3.
if conf.out == 'STDOUT':
    fout = sys.stdout
else:
    fout = open(conf.out,'w')

try:
    for fname in conf.inp:
        for rec in Bio.SeqIO.parse(fname,conf.inpformat):
            # Each output record keeps the input id and gets an empty description.
            seq = translate(rec.seq)
            newrec = Bio.SeqRecord.SeqRecord(seq, id=rec.id,description='')
            Bio.SeqIO.write(newrec,fout,"fasta")
finally:
    # Close the file we opened, even if parsing or translation fails;
    # stdout is left alone.
    if conf.out != 'STDOUT':
        fout.close()
# (at your option) any later version.
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>
from Bio.Seq import translate
import sys

# Line 1 of the input is the coding DNA, line 2 the expected protein.  Each
# listed genetic-code table is tried in turn; the script exits as soon as one
# table's translation (cut at the first stop codon) equals the protein.
with open('c:/Users/Weka/Downloads/rosalind_ptra(2).txt') as f:
    for i, line in enumerate(f):
        stripped = line.strip()
        print(stripped)
        if i == 0:
            coding_dna = stripped
        elif i == 1:
            out = stripped
            for table in (1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15):
                print(table)
                translated = translate(coding_dna, table=table, to_stop=True)
                # if len(translated)==len(out):
                if translated == out:
                    print(table, translated)
                    sys.exit()
                print('Not matched: ' + translated)
def translate(self, codon_table):
    """Translate the text in ``self.src_text`` and show it in ``self.dest_text``.

    All whitespace is removed from the source text first.  The cleaned
    sequence is printed, the destination widget is cleared, and the
    translation up to the first stop codon under ``codon_table`` is written
    to it.
    """
    source_text = self.src_text.GetValue()
    compact = "".join(source_text.split())  # remove whitespace
    print(compact)
    self.dest_text.Clear()
    # Inside this method ``translate`` resolves to the module-level
    # function, not to this method.
    protein = translate(compact, table=codon_table, to_stop=True)
    self.dest_text.SetValue(protein)
from Bio.Seq import translate dna = "ATGCAGACCAAACACGAGCAGTTCCGTGCGCAATCCGCGACCGCCGGATCTTCTAGTCACTGGACCACCTCGACCACCTTCGGAACCGAGTCTCTGGCGGATAAGCGTGGGCTGGATATCCCGACGTCCCGATTCGCAGTTACACCAGTATCCGCCAACGGACACGATGGTTTTGAGATCGCTCACTCCGACGCTCCTCGCCACCAAGGCGCAATATTGCCGAACACAGCCGTGAAAGCAACCTGCCTTTTTGATGAGCTAGTTACAGTCACTGTGAATCCGCCGCGCCGAAAGCTGATTTATTCCGGTTACCTAGAATGGATGAATATACTCGAACATTATTATGAAACAGTATCTGGTCCTAAGTTGGCGTTGTATATTTATCTGCCCTGTCTCGTCACTGGATCAACTTCCTGTATCGTTAACTCTCCTTGATCTCTTATCAATATTGGGGCTCTAAAGCTACGTTTTCGAGTGGGAAACGCTGCGGCTCCTTATGTGAAGCGCAAAAGTCTTTTAACGATAAGTTGCATCACGACACACCCGGTGCAGAAAGAATCTTCCCCACCCTCCCGTGGTGAATGTAAACCTGGGGCCGGTTATCCCGGTGGTGAATACTTGAATATCCATCCTGGATTAGTGGTACGCACTCCTGGTAGCGCCCGTCCCTTATACTTACCGTGAGATTCTTCGTCGACCTCTGGCGTCGCCAATAAGTACAAATGCTTCAGTGCTTCTAACATTGACGATAACGGAAAACTTGTCGGAGAGGCGGGGTTACGGACACCGAACTTAACGCCTTACCATATGTTACAATTACGATCGGCCATCCCAATCTGCACACTCGATAAACGCCAAACGAGTGCAAACCCGAGTAGCATTAGCCTTCCCGCTAAACTGCCGCGAATCACCGCCGCCATCTGCCCCTTCCGGGAGCAATCTTCCTCATGAAATAAACCACCGCCGTCGAGTCTATGCACACTATGAGTCCTCTGGAAGTACCTGATGGGCTGCCGGGAAAATCCCCCCGCGTTGAGTGCGGATCAGCGTTGAAATCCGGCTCACGGCTCACTGGTGAGTATCTTCACATGAACATGGGGTCACGCTCGTGTAGACCTGGCGGACGTCGTGGACGAGCCACCTATAGTCAATGGGCTAGCAGAAACCATACCTCAATATTCAAACAACACGACGTACGGGGATAATCTACTGCCGACGAGTATGGCGTACGATGAACCTACCTGAATCGCCACGCTCATATTACTTACGGGGTGTTGAGTCCTCCTTTACTTGAGCACTAGCCCACACGAGTTGTTGTTAGTCGATTCCCTCGTTCTCAGTGACTTAGACTTGTCCACATGCCCTGAGAAGAGCAACGACGGTGAATACCCTTGCCGCTATATAAAGCCGCGGGAAAAAGACCTGGGCACCGTTAAACAGTCGTTTGCAATTTCTTTACGCCTATTAATGTCAAACCATTGGATGGCCCCGTCGCCGGAGGGGCCGAAAAGTTTGCGGTCTTATGGCCTCTTTACTCGGAACTGGAGTATTTATTCACCCTGACTTGCGCCTGTATATGGCTATTCCACATTCTTTAATATTCATTCGGTCGAGCTTCCGTTGCCCTCTAAGTCACCACTGTCCATCGTGTCCTCAACTAGCCTCTTATCCGTCTGTCTACGTACTACAACCCGTTCGGGGTGATACTATGATTGGATACTTTCACTGCTAGGCTGGCAGGATTCGTTCAACTATTTGTTCCAGGCCACGGCGACGTCCGACGTAAACGAGCTTTGTCGAAAATGAATCTTTTGGTCTTTACTACGATGCTACCCAACAGAAGATAAGAATGGGAACAATTGAAGTCCATTTGCCATTAATCCTCCGGCTAAAAAATCGGCGAGTCCCGTACCCCCCTCATTTGTAGACTGAGACGAGGTTGTCACCATTCCACCTGATGGCA
TCTACTCCCAAGCCTCATTTGGGGAGGCTCTGGGACTTAATAGCGCGCCTAACCGAATTCGCAACGCTATTCCCAATAAGGCGGAGCGAACACCTACCTCGATAGGATCCATTAGCTGTGTAGATGACCACATGGCGGAGATTACGGGACAACAGAGCCCCTTAAATCGGTATGAAGTAGCAGGTTGATACCCTGCTGAAACTTGAATCTATGTAAAGGACAAGGTTAATGCTTACATCTGTTTGGCTCACCTGTTCCATAGTCATCCTTCGGTATCTTATCTCAGCGGGCCTAACACATACCGTCTCATCCGGTCAATAAATTCGCGGTCAGAAATTCTATGGCCCGCTTATATCCCCCCGGTCGGTTGCCACCGCAGCATGATTCAAGGATCACTTCCCGATCAACACGCACATCTGCGTACGTGCTGTCAACTACTAAGTAGTGCTGCACGCGACCTGGTTTTGAGTGGCCTAGGGTTTCCGGTTAACTGATATTTACACACATGCAGTCATTCTTGGATCCCCACGCGAAAGAATCAGAGTATGTGGGATCTAGGTACGGTGTCGGGTCGACTACAAGTGGGCCTAGGGTGCTCATCAGGCCGGGGTATGGGTTCCTACAAAGGGCAGGAGGGCATGAACGCATTTTCACGTGAGGGTGTGCGCCGACAGGGAAATCCGCGAGATCATGCGCGTGAAGTTCACCGGAACGCGACACGATCAATCCACTGTAGCGAACGTGTGACGGGTACTCACGAACTCGGTTGCCCCGCGTACATCCCTATACTCCGGCCCCATGATTTGAAACATGGCAATCCACACCGCTGCGATTACGAAGGACCCCTTAGTCCTCGTGCAACCCTGAGCGACGTGCGCTGAGTAAGCTGCCGCAGCTATGAATTAATATCTGATGTGGGACCCGCGGGAGTGTTTGAGCGATCGACCACGTATATTGTGTCACTGGCCACCGGGAACCTCTGTCCACGACTTGAAAACATTCGGGTTGGGCGCCGAACTCTAGTTGCCATGGATATGTCGCGGGTTATATTAAAACGCGACACAGCTAACGGGTGCCTCACACGCGCTGCTATGACTGACTTGAGCTCTTGACAACAAGGAATCGTATTCCGGTCATCTTTCTTGACAGAGCGACACAACGACTGCTTCCTCGGTGACACCCGGCTTGAGTGGCAGGAGGTCGTCGCACTACCTGCTCATTTTTCCGCTAAAAACAATGTACCGCTGACACACTACGCTAGTCAACAGTGCTATTATTTACAGCCCTGCTTGGGGACCGATATAATTGCTGTCAGTTTCACCGATCCAAAGGGGCGTCCAGCACCGGGAACTACGAGCCTCCCAGACGACCGTCGGGCCCGCTCTGGCACTAAGTTCGATGAGGATGCCAATTCCGCCAAACCGGACGACCGCATACCCAAGGATCGTTCTTTACCTAAGCTAACCTCACTGGGGTTCTGCGTGACTCAATCACCATATGTTTTTTTTGTTGAAGTGTCAGAACATTCCTGGTGCTTGATGTCTTACCATTTGTGACCCTCGTGTTCTTTGTATAACATGGGCACGTTAGCAATTATGCTTCCCGAACGACCTCTAGCCGGAAAGCTCGCTCGACCCGGAATTAAATGTCCGGCGGTCCGGGAAATCAAGTTTCGTCCAATCGGTTGCGGGATCTTTGTAAAGGTCCCCTTCTCTCGATGTGCGCGTGGTCAATTCTCGGGGAACTTCGTCGGGTCCCGATGGCCGTTCGGATATTCGGCCCGAGGCGACATCGCGCGTAAATGATATGAGTTGGACCCGTCGGACGGCACGTCGCGCTTCCCAGTGTGACTGCACTTTGTCATAAGCAGTGGCTCTCACCCGGGAAAGTGTTTGGACCCGCAACAGCAGTTTGAAGTACACTTCGCCGGGACGTTTTCTTTATTCGTGGTTACCACAGCGAGCTACTGAGACTGGGGGCCTCAATGCAATCTATGTATC
AGTGAATGTTGTGGCCAAGAGCTCTCCTTATGTCGTGATAACGTTATTTATTTCACGGAAACACAAGACATGTGGCTGTGTGCTGGATTACAGGGGGCTGCTCGCCTCATAACACTTTGAGAGCAGTCTGTAGCCGAGAGCAGTACAGTTCCCTGATATCGGTGAATTGTCCGATGTCCCTATAAACCAGACGGATTTTTGGGGTTTGCAGACCACGGTCCATACATACTATATTATCGCATTTCCGGGGGCGCCAGCGATACTAACTCGGAACATCGTGGAGGGGAACTCATGTCAGAGCGAGGCGTTAACGAAGACTATTCATCGATATTCGATATGAGTTCGTGTGGGAACGACCAGCGCAAGATGCGCAATCCGGGATCCTGATGGTACTATGAAGTCATACCTCGTGCAGGTGTCCGTCTTTTGCGTATCTGGTGCAGCGTAGGGCTATGTTATCCCTCGGGACATAATAGTATCCCAGCTAGTAGTGACATCTTTAGCCCCGGGACTGTGCAATTATCATGAACTCAAGTTCTACTCTCTCACTCTCCAAAGACCAATTGGGTAAAATTTATTTCCCGCCTGTCAGACTTGTTGGCTCAAAGCTTCTGCATAAAGGCAAGCAGCGCATCATGAAATGGCGGCCTATGGGCGGACAAGGCGGTTACTGCCATCTATCGTATGTTTAGCGCTGACCTGTCGCGAGGCATTGCTGGATTAACTTTAACAGGAAATCGAACCATAAATCGGTTGGATCTTTCGTTGATCCACAGCTTCGGGGCCACTTCCATGTACTGCGACTCACGTTTTATGTGATTTCCTGCTTACGTGGTATATGGCGCCGTCCGGCGATGAACTAACTACGAGACTAGCGGTTGAACATTCCACTACGCTTCGGTAGCCATTACTTGTGCTAATACCTTTGTCGGATCTCAAGGTATACTCGGTTTCTCCAATATATCATTGCTGCCGCTCGACCGGCCCGTGGCGTCCCCACCCCTATACCGGATGTATGAGCGTTCTGGGAGTAAAGCGCGCCACACACCGAGCCCATTCGACGCGCAAAATGACGGACGTCATCATCGGTCACTTACCTTCGAGTTGTTTTCGGGAGTTGGCTTATTAACTTGATTCAGTGGGTTACAGCGGGAACTAGACCAGCACGCCACTGACCTATTCATAATAGCTGTGGAACCGATGATCATACTGTTGTTTGCTTATATGGTAGAATTTCTGTCCAGTGGGCTGAATCTAAGCATTGCGCGATTTGCTGAAGAGCCCACCCGGTCTTGAATTATGCTACCCGCCTCCTGCGAGGATGATCTATGAATGCGATCGCGTAGCTCAGAATTTTGATCCCACGAAATATTGCTGTTAGATGCAAGCTCTGGCGCTTATGTGTATAAAGCTTGAACCTCCCGAGGCGCTGTGTCAAGTACGAGTTTCGGCCGCATTTGGGTGCCCACTACCTTATTGGGAATACTTCCGTGAAGTAGCGTCAATCATCAGTCACGGACCTCGCCCCCCTTTGTACTTGCGTCAATGGGACTGACCGCAATGCAAGATTGATCACTCGCCTATGGGCAATGGCACATAGAGCACTGGTTTCTAATTTCGAACCGGGTCCGGCGTTGCGACTCGAGTCCGGAGCCTGGCTGTGTATTTCGACCCGTCGTCGGGGGGTGCCCTATGTCTGCCTGTTTGGAACATTCCGATCTTAAGAGTCGCTTAATCCCTGCTGATAACAGCTCACGGGGCGGTCGTCAGTCGTATGCGACCCTCCACAACACTGGAAACTTGTGAGCAACCCGTCATACCGTTGTTGGTCGGACTATCCGCTGCAATGCCCTCTCTCAGCTATTTTGTCGCGCGTTTAACAATTATGTGATGACTGAGAGCTCCCGTCCAAAGCATTCGGATGCAAAACTGTCAAAGGGCGGCTTGATGTCCTTAAAAATCATCGAAGTTATGCGCTCCATAGCCAGTTCAGACCCCTA
CGCCGGGAGCATTACTCTTTTGTTTAAACAGAACACCACCCGAGCACTGATAGTGTACATACTGGAAGAACTAAACTTTGTAGGGCAATCTCGACATGGGTCAGATCCGGCCGATCTTATTTTCTTCTGCTCGCCGTGACTCGGTCAGCAACGCAATAAGCATATTACCCCACACTTGACGGTTGTGGGATGTCGCGAATATCTCATTTACGACCTAACCTCAAACCCAGAAGTTGCTGGAATCCGATTAAACAACACCGAATCTTCAACCTGTTCTTTTTCTTCCGGCCATCACAGCGCTGGTGTTTTGACGAGCAGTTCCGGCAATCTCCCAGGCAGTGGCCATTGCGACTTGCATGGCAAAAGCGAAATGGTAGTTGCAGTAAGCCACGGGAAACTGCAGACCTGGCGAGTTGGAGTGGCGCGAGCCGTAGGTTACACCATCCCTAAGTGGACCACAATCGGGGTGCATTCACAAATCCACAAGTTCAATCGGGGTGGGATAGCATGGGATCGAAAACATCAAGGGCAAGCATTCCACTTGCTCCTAGGACTTGTTTGGTACGGGGCACTAGTACATCGCCTAAAGGACGGCACCTACGTTCATGCTAATATTGGACAACGTCTCTGCCTGGGCAATCGGACGGAGACGTTAGTTCGACCCGACAACTCCATTGGCCTCTGACCCTTTGCAATAAATATGAGCTCGGGCTCGATGTATCGGTTTTTAGAAGCCCGAGCAGCATACCATCATGATTGCTTGGGTCGCTCATTGTGTTCCATCGCCGAGCCGTGACACGTTTGCGCGGCGTCTCTTGATATGATCGCAACAACCCAAAAAACGTCGTCGAATCGGCGGGCCTGTCCGTTCACGGAGTGCTCCGGACAAGATGCTCAATTACATGTTGCTCTATTCCCATTGCTCGCCCAACCGCACTCTTGAATCAGTTATCGCTTCTGTGGGGTTCACTGGGTGTTGTATAACTCCAACCACTTCCCCACTCCCTTTATCCGTGAACAAAAAACGACGAAAAATAAAGTCAACGTAACCTTGTATTCGTGGCAGGGGTCCGTTGTCGGAGCGATACCATGGTCTGTTATCCACTTTTTCTACTTACGCGCAGCACGTTTAGCTGTTGAAACCTGGGGAACCGGAAAGTTAGCTTTTATGTATCCATTGCATTACGTGCTCTGGGATAATACACAAGCCATTTTGTACCGGCCGAAATCTTTAGCACCTACGTCAATCATGCCCAACCACTGCTTCGCTCTGACTGGAGCTATAATTGAACTCTTCTTGATATGATATACAGCACTACGTCAATCAGCTAATATAGCCCTTGCTGTACATCGCCAGTTATGACCGCGGAAAACTGCTGAGCCGAAAGCGACCCACCAGAAGGGGGCCACGGATGAACATACTTGGGAGCTTTGCCCTCCCGGTCGTCCGATCCGACTAGAAGGCTGGCTAAGCGTGCTCCGGAAGCAAAAAGGCCAGCGGGCGATAGCAATGACCCGGTGGCCGGAGTGGGATTCACGTGTTACACGTTTAAGTCGAGTTCTTGGGTTCTGTGGACGCTTGAAGAACCTACGATGCTTCCGTTTATTTCATCGTAGTCTCCCCAACTGGCGATTGCGACATCAGAGTCAGGATCGTTTGGTACTATTGTGGGGAGACATGTATCACCTGTATGACCGTTACTGAGGTGGTCACCAAATTTTTTGGGATTCGAACCATGCAGTACTCTACTCGGATGGGCCACCGACGTGTGTATCATTTTCGACTGAGAATGCATCTTCGATGCCACGGGATTCCCCGACATCCACAACCCGCCTCATTTGGTCTAGTAGCCTTTCCTGGTGTCCTACACCGTCGCAGGGGGTCGATGGCACGCCCGATAACGGGTGGCGATCGTCAGCGCCTCAAACTTTACGTGTGTCGAAGCACCTTTGATCGGACGTGCACTGACCTGCGGGCGGTAATTCTCGCGACTTTTCCG
GAGGGGCAGATGGCATGGGAGTCGAACGCCAGGGATCGGGTTCTGTGTACGTCGTCCTGGCCCCACGTAACTATCTTCGTCCCATTCTGCGGACCGGCCGGATTTCGTGCACCGCGCAGGCGCGGATACTGAAAGTGGTTAGTGAGATTTGAATGGCATTCACGTTTTCGATCTATGGGTTCGCGAGCTTATATGCACTAAACGAAATCGAGACAGCAAAGCCATTTAAGCGAAAGGATGCGTGGTGGCTTTATGCCGGAGTTAACTCCCTCCGGTGGGAACAAATAACCGTTCTCCCGCTGCAGAATTGCCTGCTTCTTGTGTGGGAGTATCCCTTCGTACCCGAAGCGGGCGTCGGTGACGTAGGCTCTATTCCGATCTACATAGCCGAGTGTTGCCCCCACTACATACAGGTAAACATCATAGCCGGGGCAAACGGAACGGCCCTTACATTCCCGCTCCTCTCTAGCGGTGTGCGCACAGCTACCGGGCGAATTCATCGAAAGTCGTATAGCCTACTTCCCCAACAAGTTATCCTCTTGTATTGAAATCACGCACTCGCACTGGCCACGTGGTGAACTGGTGCTCGTCAGTGCACCGCCGCGGAATTCCGCGTCCCTATGACACGGTTGCGGGTATTAAGCTGGGTTCAGACTCCTGCCGCTGGAGTTCTTAAGCTGCACGTACAAGCCTCATGTCAACCGCATCACGAGTCTGCCAACCGGCTCCTCCCGGGGGTAGTTCCGCCTGTTCGTGCCCAGCTACTATTCGTTGCACGAGACCGTCCTTCTCTGGGTGCCGCCCCCATAGCATTAAGTACAGAGTTAATCCCGGTCGACCTTATAACGAAATGAGAACCACTTCTAATGGATGAGGGCGCTAGCCGACATGTTACTCACGGCAACCTTCCTGACATATATACAGGTTGGGGAGAGACGTGGCGTTTGCCAACGCAGGGGTTTGAGCTCGAGAGCCAGCACTTCGTTTGATTCGTTGAACTCTCAGCACCGGCTCGTACACGGAAAGCTCATCGAAGCTGCGCTACCAAAGGTTTTCGTGGGCCGCCCACCGGCCCATCATTGCTCTACCCTATGCTCAAACCTCGCCAGGTGGAGGTGCCCTCCAGCGATACGGTTACAGTTGTACGATCGCAAATTTCCTTATCAGGGGGCCAGGTCACCGCGATTCTACGTGTACCACGGAAGCGTTTACCTTGCTTTAATTCGGCTGTGGCTTCATGATGTGGTAATTTTGCCCTCCGCCGTGGTCATTGTTTTACCTCAGGACGGTACAAGAACAAGCGACCAGCCCTGTCCACTGTCTGGGGTTCGAAAATTTCTCCTGTTATCTACAATAATAAACAATTGACGTGAACGCGCCAATTGAACTGTGAATGTATAAAATTAGTCCACGGGACAAATCCGCAAAGCCTCCATGTACGTGCCGTCGCCGAGAACCGACGAATTGCCAATGGCGCCCCGCCAACACTAGTGTTGTCACATCCGCGCTTTATAGATGGCTCCTACCTACACTATTGGCACTCCCGATCGCAGCATGTCTCAAAGTTCGCGTCTCATCTGGCACAATATCCCGTTAATTCTTGTCAGTGTTTGCATGGTGCACGATTCCTATTGATCAACAACGCCGTACGCAATTATAGTGTATATACTGCCGGTATAAATTTACCGAACTCCTGTACTATATACTTGACCCAAGGGGGCACGTATCCCATCACCCGACCTGCGGTCAATGTTACGCTTAAATGCGCTGAGACTGTGGTGGAGAATTTCCCATATCTTGGACACCGAGTCCCTCGTGACAAGCCAACCTGTTACCTATGGGGAGTCCGCCTTCGGGGCGCGGCGATTTAG" prot = 
"MQTKHEQFRAQSATAGSSSHWTTSTTFGTESLADKRGLDIPTSRFAVTPVSANGHDGFEIAHSDAPRHQGAMLPNTAVKATCLFDELVTVTVNPPRRKLIYSGYLEWMNMLEHYYETVSGPKLALYIYLPCLVTGSTSCIVNSPWSLINIGALKLRFRVGNAAAPYVKRKSLLTMSCITTHPVQKESSPPSRGECKPGAGYPGGEYLNIHPGLVVRTPGSARPLYLPWDSSSTSGVANKYKCFSASNIDDNGKLVGEAGLRTPNLTPYHMLQLRSAIPICTLDKRQTSANPSSISLPAKLPRITAAICPFREQSSSWNKPPPSSLCTLWVLWKYLMGCRENPPALSADQRWNPAHGSLVSIFTWTWGHARVDLADVVDEPPMVNGLAETMPQYSNNTTYGDNLLPTSMAYDEPTWIATLMLLTGCWVLLYLSTSPHELLLVDSLVLSDLDLSTCPEKSNDGEYPCRYMKPREKDLGTVKQSFAISLRLLMSNHWMAPSPEGPKSLRSYGLFTRNWSIYSPWLAPVYGYSTFFNIHSVELPLPSKSPLSIVSSTSLLSVCLRTTTRSGWYYDWMLSLLGWQDSFNYLFQATATSDVNELCRKWIFWSLLRCYPTEDKNGNNWSPFAINPPAKKSASPVPPSFVDWDEVVTIPPDGIYSQASFGEALGLNSAPNRIRNAIPNKAERTPTSMGSISCVDDHMAEITGQQSPLNRYEVAGWYPAETWIYVKDKVNAYICLAHLFHSHPSVSYLSGPNTYRLIRSMNSRSEILWPAYIPPVGCHRSMIQGSLPDQHAHLRTCCQLLSSAARDLVLSGLGFPVNWYLHTCSHSWIPTRKNQSMWDLGTVSGRLQVGLGCSSGRGMGSYKGQEGMNAFSREGVRRQGNPRDHAREVHRNATRSIHCSERVTGTHELGCPAYIPMLRPHDLKHGNPHRCDYEGPLSPRATLSDVRWVSCRSYELMSDVGPAGVFERSTTYIVSLATGNLCPRLENIRVGRRTLVAMDMSRVMLKRDTANGCLTRAAMTDLSSWQQGIVFRSSFLTERHNDCFLGDTRLEWQEVVALPAHFSAKNNVPLTHYASQQCYYLQPCLGTDMIAVSFTDPKGRPAPGTTSLPDDRRARSGTKFDEDANSAKPDDRMPKDRSLPKLTSLGFCVTQSPYVFFVEVSEHSWCLMSYHLWPSCSLYNMGTLAIMLPERPLAGKLARPGIKCPAVREIKFRPIGCGIFVKVPFSRCARGQFSGNFVGSRWPFGYSARGDIARKWYELDPSDGTSRFPVWLHFVMSSGSHPGKCLDPQQQFEVHFAGTFSLFVVTTASYWDWGPQCNLCISECCGQELSLCRDNVIYFTETQDMWLCAGLQGAARLMTLWEQSVAESSTVPWYRWIVRCPYKPDGFLGFADHGPYMLYYRISGGASDTNSEHRGGELMSERGVNEDYSSMFDMSSCGNDQRKMRNPGSWWYYEVMPRAGVRLLRIWCSVGLCYPSGHNSIPASSDIFSPGTVQLSWTQVLLSHSPKTNWVKFISRLSDLLAQSFCMKASSASWNGGLWADKAVTAIYRMFSADLSRGIAGLTLTGNRTMNRLDLSLIHSFGATSMYCDSRFMWFPAYVVYGAVRRWTNYETSGWTFHYASVAITCANTFVGSQGMLGFSNMSLLPLDRPVASPPLYRMYERSGSKARHTPSPFDAQNDGRHHRSLTFELFSGVGLLTWFSGLQRELDQHATDLFMMAVEPMIMLLFAYMVEFLSSGLNLSIARFAEEPTRSWIMLPASCEDDLWMRSRSSEFWSHEMLLLDASSGAYVYKAWTSRGAVSSTSFGRIWVPTTLLGMLPWSSVNHQSRTSPPFVLASMGLTAMQDWSLAYGQWHMEHWFLISNRVRRCDSSPEPGCVFRPVVGGCPMSACLEHSDLKSRLIPADNSSRGGRQSYATLHNTGNLWATRHTVVGRTIRCNALSQLFCRAFNNYVMTESSRPKHSDAKLSKGGLMSLKIIEVMRSMASSDPYAGSITLLFKQN
TTRALMVYMLEELNFVGQSRHGSDPADLIFFCSPWLGQQRNKHITPHLTVVGCREYLIYDLTSNPEVAGIRLNNTESSTCSFSSGHHSAGVLTSSSGNLPGSGHCDLHGKSEMVVAVSHGKLQTWRVGVARAVGYTIPKWTTIGVHSQIHKFNRGGMAWDRKHQGQAFHLLLGLVWYGALVHRLKDGTYVHANIGQRLCLGNRTETLVRPDNSIGLWPFAMNMSSGSMYRFLEARAAYHHDCLGRSLCSIAEPWHVCAASLDMIATTQKTSSNRRACPFTECSGQDAQLHVALFPLLAQPHSWISYRFCGVHWVLYNSNHFPTPFIREQKTTKNKVNVTLYSWQGSVVGAMPWSVIHFFYLRAARLAVETWGTGKLAFMYPLHYVLWDNTQAILYRPKSLAPTSIMPNHCFALTGAMIELFLMWYTALRQSANMALAVHRQLWPRKTAEPKATHQKGATDEHTWELCPPGRPIRLEGWLSVLRKQKGQRAMAMTRWPEWDSRVTRLSRVLGFCGRLKNLRCFRLFHRSLPNWRLRHQSQDRLVLLWGDMYHLYDRYWGGHQIFWDSNHAVLYSDGPPTCVSFSTENASSMPRDSPTSTTRLIWSSSLSWCPTPSQGVDGTPDNGWRSSAPQTLRVSKHLWSDVHWPAGGNSRDFSGGADGMGVERQGSGSVYVVLAPRNYLRPILRTGRISCTAQARMLKVVSEIWMAFTFSIYGFASLYALNEIETAKPFKRKDAWWLYAGVNSLRWEQMTVLPLQNCLLLVWEYPFVPEAGVGDVGSIPIYMAECCPHYMQVNIMAGANGTALTFPLLSSGVRTATGRIHRKSYSLLPQQVILLYWNHALALATWWTGARQCTAAEFRVPMTRLRVLSWVQTPAAGVLKLHVQASCQPHHESANRLLPGVVPPVRAQLLFVARDRPSLGAAPMALSTELIPVDLMTKWEPLLMDEGASRHVTHGNLPDMYTGWGETWRLPTQGFELESQHFVWFVELSAPARTRKAHRSCATKGFRGPPTGPSLLYPMLKPRQVEVPSSDTVTVVRSQISLSGGQVTAILRVPRKRLPCFNSAVASWCGNFALRRGHCFTSGRYKNKRPALSTVWGSKISPVIYNNKQLTWTRQLNCECMKLVHGTNPQSLHVRAVAENRRIANGAPPTLVLSHPRFMDGSYLHYWHSRSQHVSKFASHLAQYPVNSCQCLHGARFLLINNAVRNYSVYTAGMNLPNSCTMYLTQGGTYPITRPAVNVTLKCAETVVENFPYLGHRVPRDKPTCYLWGVRLRGAAI" for i in xrange(1, 16): if prot in translate(dna, table=i): print i break
#!/usr/bin/env python from __future__ import print_function import os from Bio.Seq import translate if __name__ == "__main__": with open(os.path.join('data', 'rosalind_ptra.txt')) as dataset: dna_string = dataset.readline().rstrip() protein_string = dataset.readline().rstrip() translation = translate(dna_string) print(translation.find(protein_string) + 1)
""" # manually taken list from CodonTable.py (Biopython source), # the list at http://www.bioinformatics.org/JaMBW/2/3/TranslationTables.html # is incomplete (Last update of the Genetic Codes: Sep 26, 1996) valid_tables = [1,2,3,4,5,6,9,10,11,12,13,14,15,16,21,22,23] """ # ok, it is better to get the valid tables list programatically # print(CodonTable.unambiguous_dna_by_id) valid_tables = [k for k, v in CodonTable.unambiguous_dna_by_id.items()] # a list of the codes possibly used for translating our dna to our protein # (yet empty) used_codes = [] # now we translate using all valid tables and check whether the resulting # protein is the same as our given master protein for t in valid_tables: # 'stop_symbol=""' and 'to_stop=False' to IGNORE STOP CODONS # 'cds=False' to ignore coding sequence checking (whether the sequence # starts with START, whether the sequence length is a multiple of three... protein = translate(dna, table=t, stop_symbol="", to_stop=False, cds=False) if protein == master_protein: used_codes.append(t) # if we had found some possible codes for our protein, print the first one # otherwise, print None if used_codes: print(used_codes[0]) else: print(None)
def count_codons(haps):
    """Tally amino-acid usage and mutation signatures over aligned haplotypes.

    Every element of ``haps`` is passed to ``align_codons``, whose result is
    unpacked as ``(start, residues, freq)`` with a 1-based nucleotide
    ``start``.  The nucleotides are placed into the HXB2 protease reference,
    translated in frame, and the resulting protein haplotype is aligned
    against the wild-type protease to derive a comma-separated mutation
    signature.

    Side effects: writes ``all.dat`` (one reconstructed nucleotide haplotype
    per line) and ``degeneracy.pck`` (pickled signature -> haplotype count),
    removes the ``tmp`` alignment file after each haplotype, prints progress
    and summary lines, and calls ``plot_variation`` with the per-position
    residue frequencies.

    Returns:
        List of ``(signature, summed frequency)`` pairs sorted by decreasing
        frequency.
    """
    import pickle
    from Bio.Seq import translate
    from operator import itemgetter
    from pythonlib import Alignment
    from pythonlib import mystats

    latex = False  # print latex table
    count = [{} for i in range(102)]
    hap_freq = {}
    degeneracy = {}
    mask_mupos = []  #[10, 11, 22, 25, 32, 46, 58, 62, 67, 74, 89]
    mupos = []
    # These sequences are HXB2 proteases
    wt_protease = 'PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'
    wt_protease_nt = (
        'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTA'
        'TTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTA'
        'TAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT')
    protease = wt_protease

    # The context manager closes all.dat even if a haplotype raises.
    with open('all.dat', 'w') as oh:
        ac_res = map(align_codons, haps)
        for start, residues, freq in ac_res:  # start here is human (from 1)
            # Skip empty alignments *before* the arithmetic below; the check
            # used to come after ``start -= 1`` and could never be reached.
            if start is None and residues is None:
                continue
            start -= 1  # start here is pythonic (from 0)
            oh.write('%d %s\n' %
                     (round(freq), wt_protease_nt[:start] + residues +
                      wt_protease_nt[len(residues) + start:]))
            # Trim the read so that it begins on a codon boundary.
            frame_shift = start % 3
            if frame_shift == 0:
                read = residues
            elif frame_shift == 1:
                read = residues[2:]
            else:
                read = residues[1:]
            try:
                aa = translate(read)  # Biopython
            except Exception:
                print('error: read %s' % (read,))
                continue
            # 1-based index of the first translated residue; a trimmed read
            # starts one codon later.
            start_a = start // 3 + (1 if frame_shift == 0 else 2)
            stop_a = len(aa) + start_a + 1
            this_hap = str(protease[:start_a - 1] + aa + protease[stop_a - 2:])
            # this is used for resistance prediction, whole haplotype and reads
            print('%s %s' % (this_hap.ljust(100), str(freq).ljust(8)))
            for i, c in enumerate(this_hap):
                count[i + 1][c] = count[i + 1].get(c, 0) + freq
            Alignment.needle_align('asis:%s' % wt_protease,
                                   'asis:%s ' % this_hap, 'tmp', 10.0, 0.5)
            d = Alignment.alignfile2dict(['tmp'], 'n', 10.0, 0.5,
                                         Verbose=False)['asis']['asis']
            os.remove('tmp')
            mutations = []
            for i, c in enumerate(zip(d.seq_a, d.seq_b)):
                pos = i + 1
                if '-' in c:
                    continue
                if c[0] != c[1]:
                    mutations.append(c[0] + str(pos) + c[1])
                    if pos not in mask_mupos:
                        mupos.append(pos)
            signature = ', '.join(mutations)
            hap_freq[signature] = hap_freq.get(signature, 0.0) + freq
            degeneracy[signature] = degeneracy.get(signature, 0) + 1

    print('')
    for k, v in hap_freq.items():
        print('%s %s %s' % (str(v).ljust(15), ' ', k))
    mupos = sorted(mupos)
    spos = {}
    for i, j in enumerate(mupos):
        spos[j] = i
    hf_sorted = sorted(hap_freq.items(), key=itemgetter(1), reverse=True)
    tot_reads = sum([h[1] for h in haps])
    tot_hap = sum(hap_freq.values())
    print('Tot reads after %s' % (tot_reads,))
    print('Tot %s' % (tot_hap,))
    print('Simpson\'s index on amino acid sequences = %f +/- %f' %
          mystats.Simpson(list(hap_freq.values())))
    # Pickles are byte streams, so the file is opened in binary mode.
    with open('degeneracy.pck', 'wb') as pck:
        pickle.dump(degeneracy, pck)
    # Turn the per-position residue tallies into fractions of the position total.
    for c in count:
        ts = sum(c.values())
        for k in c.keys():
            c[k] /= ts
    plot_variation(count)
    if not latex:
        return hf_sorted
    print('')
    print('|c' * (1 + len(spos)))
    # One table cell per mutated position, space separated on a single line.
    print(' '.join(['%s%d & ' % (wt_protease[i - 1], i) for i in mupos] + ['']))
    return hf_sorted
from Bio.Seq import translate


def read_strings(fname):
    """Return all lines of *fname* as a list, line terminators included."""
    # The context manager closes the file; it used to be left open.
    with open(fname, 'r') as f:
        return f.readlines()


if __name__ == '__main__':
    # Line 1 of the dataset is the coding DNA, line 2 the expected protein.
    dataset = read_strings('ptra.txt')
    coding_dna = dataset[0].replace('\n','')
    result_protein = dataset[1].replace('\n','')
    protein = translate(coding_dna)
    # Single-argument print calls behave the same on Python 2 and 3.
    print(protein.find(result_protein) % 3 + 1)
    print(result_protein)
    print(protein)
from Bio.Seq import translate
from Bio.Data import CodonTable

# Line 1 of the dataset is the DNA string, line 2 the protein it encodes.
with open("rosalind_ptra.txt", "r") as f:
    seq = f.readline().replace("\n", "")
    res = f.readline().replace("\n", "")

# Only try table ids that Biopython actually defines.  The NCBI numbering has
# gaps (there is no table 7 or 8), so a plain range(1, 16) fails on the lookup
# before any table above 6 can be tested.
for i in sorted(CodonTable.unambiguous_dna_by_id):
    # The DNA is expected to end in a stop codon, hence the trailing "*".
    if translate(seq, table=i) == res + "*":
        print(i)
        break
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem from the Armory problem area, which focuses on using prebuilt bioinformatics packages, in this case BioPython.

Problem Title: Finding Genes with ORFs
Rosalind Armory ID: ORFR
Rosalind Armory #: 015
URL: http://rosalind.info/problems/orfr/
'''

from Bio.Alphabet import IUPAC
from Bio.Seq import Seq, translate
from re import finditer

with open('data/armory/rosalind_orfr.txt') as input_data:
    dna = Seq(input_data.read().strip(),IUPAC.unambiguous_dna)

# Translate from every ATG up to the first stop codon, on the forward strand
# and then on the reverse complement.  The reverse complement is built once
# here instead of once per ATG match.
ORFs = []
for strand in (dna, dna.reverse_complement()):
    ORFs += [translate(strand[x.start():], table = 1, stop_symbol = '', to_stop= True)
             for x in finditer('ATG', str(strand))]

# Find the longest ORF; without any ATG there is none, so report an empty
# string instead of letting max() fail on an empty sequence.
longest_orf = max(map(str, ORFs), key=len) if ORFs else ''

# Print and save the answer.
print(longest_orf)
with open('output/armory/Armory_015_ORFR.txt', 'w') as output_data:
    output_data.write(longest_orf)
def frame1(self, seq, translation_table=1):
    """Return the translation of ``seq`` read from its first base.

    ``translation_table`` is forwarded to ``translate`` as its ``table``
    argument.
    """
    protein = translate(seq, table=translation_table)
    return protein
codonStatus = list() codonStatusMN = list() codonStatusMS = list() snpStatus = list() nsList = list() majorCount = list() snpPosition = list() for i in range(0, aln.get_alignment_length(), 3): if i % 1000 == 0: print >> sys.stderr, i codons = aln[:, i:i + 3] codonSet = list(set([str(codon.seq) for codon in codons])) codonList.append(codonSet) aaList.append([translate(codon) for codon in codonSet]) codonCount.append(len(codonSet)) nsList.append(np.array([NSDict[str(codon.seq)] for codon in codons])) #Conserved codons if len(codonSet) == 1: codonStatus.append("C") snpStatus.extend("CCC") #Multiple codons have NS status determined by all pairs within a single mutation of each other if len(codonSet) > 2: #For each codon in the set, calculate the distance to each other codon and determine NS status if 1 snp changeSet = list() for j in range(0, len(codonSet)): aa = translate(codonSet[j]) for k in range(j, len(codonSet)):
def which_table(inp, out):
    """Print each genetic-code table id whose translation of ``inp`` is ``out``.

    ``inp`` is translated up to the first stop codon with table ids 1-6,
    9-16 and 21-23; a ``Match: <id>`` line is printed for every table whose
    result equals ``out``.  Nothing is returned.
    """
    # Build the id list from real lists: range objects cannot be added
    # together on Python 3.
    table_ids = list(range(1, 7)) + list(range(9, 17)) + list(range(21, 24))
    for i in table_ids:
        trans = translate(inp, table=i, to_stop=True)
        #if len(trans) == len(out):
        if out == trans:
            # Single-argument print call: same output on Python 2 and 3.
            print('Match: {0:d}'.format(i))
def codonChange(codon):
    """Label each codon returned by ``nearCodons(codon)`` as ``"S"`` or ``"N"``.

    A neighbour gets ``"S"`` when its translation equals the translation of
    ``codon`` and ``"N"`` otherwise.  The labels are returned as a list in
    the order ``nearCodons`` produced the neighbours.
    """
    reference_aa = translate(codon)
    neighbours = nearCodons(codon)
    neighbour_aas = [translate(neighbour) for neighbour in neighbours]
    labels = []
    for neighbour_aa in neighbour_aas:
        labels.append("S" if neighbour_aa == reference_aa else "N")
    return labels
def six_frame_translations(seq, genetic_code=1):
    """Return a formatted report of the six reading frames and the GC content.

    The report starts with a header giving the counts of A, T, G and C, a
    shortened form of the sequence, its length and its GC percentage.  The
    sequence is then shown in rows of 60 nucleotides: the three forward
    frame translations, the sequence with the GC percentage of the row, its
    complement, and the three reverse frame translations.

    Code from xbbtools, similar to DNA Striders six-frame translation.

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
    G H C N G P L
    W P L * W A A
    M A I V M G R *
    auggccauuguaaugggccgcuga 54 %
    uaccgguaacauuacccggcgacu
    A M T I P R Q
    H G N Y H A A S
    P W Q L P G S
    <BLANKLINE>
    <BLANKLINE>

    """
    from Bio.Seq import reverse_complement, translate

    reverse_strand = reverse_complement(seq)
    complement = reverse_strand[::-1]
    seq_len = len(seq)

    # Frames +1..+3 come from the forward strand, -1..-3 from the reverse
    # strand; the reverse translations are flipped so they line up with the
    # forward sequence.
    frames = {}
    for offset in range(3):
        stop = offset + 3 * ((seq_len - offset) // 3)
        frames[offset + 1] = translate(seq[offset:stop], genetic_code)
        frames[-(offset + 1)] = translate(reverse_strand[offset:stop],
                                          genetic_code)[::-1]

    # create header
    short = '%s ... %s' % (seq[:10], seq[-10:]) if seq_len > 20 else seq
    base_counts = ''.join('%s:%d ' % (nt, seq.count(nt.upper()))
                          for nt in ('a', 't', 'g', 'c'))
    res = 'GC_Frame: ' + base_counts
    res += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),
                                                         seq_len, GC(seq))

    for row_start in range(0, seq_len, 60):
        subseq = seq[row_start:row_start + 60]
        csubseq = complement[row_start:row_start + 60]
        p = row_start // 3
        window = slice(p, p + 20)  # the amino acids belonging to this row
        res += '%d/%d\n' % (row_start + 1, p + 1)
        # + frames
        res += ' ' + ' '.join(frames[3][window]) + '\n'
        res += ' ' + ' '.join(frames[2][window]) + '\n'
        res += ' '.join(frames[1][window]) + '\n'
        # seq
        res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
        res += csubseq.lower() + '\n'
        # - frames
        res += ' '.join(frames[-2][window]) + ' \n'
        res += ' ' + ' '.join(frames[-1][window]) + '\n'
        res += ' ' + ' '.join(frames[-3][window]) + '\n\n'
    return res
def frame(self, seq, frame, translation_table=1):
    """Translate ``seq`` in the reading frame selected by ``frame``.

    A negative ``frame`` selects the reverse complement of ``seq``.  The
    first ``abs(frame) - 1`` bases of the selected strand are skipped before
    translating with ``translation_table``.
    """
    strand = reverse_complement(seq) if frame < 0 else seq
    skip = abs(frame) - 1
    return translate(strand[skip:], table=translation_table)
def frame1(self, seq, translation_table=1):
    """Translate ``seq`` from its first base using ``translation_table``."""
    first_frame_protein = translate(seq, table=translation_table)
    return first_frame_protein