def scoreAlignmentList(alignmentList):
    """Score every alignment by the similarity of its translated sequences.

    For each alignment the gap-free reference and query (items 0 and 1 of
    ``gaplessStrings``) are translated.  When neither translation holds a
    stop codon ('*'), the score is the sum of ``score_pairwise`` computed
    with ``blosum62``, gap open -5 and gap extend -1.  Alignments that do
    contain stop codons are scored afterwards as
    ``stop_count * lowest_score``, where ``lowest_score`` is the smallest
    stop-free score seen, never above 0.

    Returns a list with all stop-free scores first (in input order)
    followed by the stop-codon penalties, so indices only line up with
    ``alignmentList`` when no alignment contains a stop codon.
    """
    clean_scores = []
    stop_counts = []
    lowest = 0
    for alignment in alignmentList:
        gapless = gaplessStrings(alignment)
        reference, query = gapless[0], gapless[1]
        ref_protein = translate(reference)
        query_protein = translate(query)
        n_stops = (sum(1 for aa in ref_protein if aa == '*')
                   + sum(1 for aa in query_protein if aa == '*'))
        if n_stops:
            stop_counts.append(n_stops)
            continue
        score = sum(score_pairwise(ref_protein, query_protein, blosum62, -5, -1))
        clean_scores.append(score)
        lowest = min(lowest, score)
    # Penalties go at the end, scaled by the worst stop-free score.
    return clean_scores + [count * lowest for count in stop_counts]
Ejemplo n.º 2
0
def break_up_frame(s):
    """Yield ``(offset, nucleotides, protein)`` for candidates in one frame.

    The sequence is cut after every in-frame match of ``re_stops``.  Each
    piece is translated either with ``start_chop_and_trans`` (when
    ``options.ftype`` is "CDS") or with a plain ``translate`` up to the
    first stop.  A piece is yielded only if it is non-empty and its
    protein is at least ``options.min_len`` long.  When ``options.ends``
    is "open", the trailing piece without a stop codon is handled too.
    """
    start = 0
    for stop in re_stops.finditer(s):
        end = stop.start() + 3
        if end % 3:
            # This stop codon is not in the frame being scanned.
            continue
        nuc = s[start:end]
        if options.ftype == "CDS":
            shift, nuc, prot = start_chop_and_trans(nuc)
        else:
            shift, prot = 0, translate(nuc, options.table, to_stop=True)
        if nuc and len(prot) >= options.min_len:
            yield start + shift, nuc, prot
        start = end

    if options.ends != "open":
        return

    # No stop codon in the tail, so Biopython's strict CDS translation
    # would fail; keep whole codons only and translate leniently.
    nuc = s[start:]
    nuc = nuc[:len(nuc) - len(nuc) % 3]
    if options.ftype == "CDS":
        shift, nuc, prot = start_chop_and_trans(nuc, strict=False)
    else:
        shift, prot = 0, translate(nuc, options.table, to_stop=True)
    if nuc and len(prot) >= options.min_len:
        yield start + shift, nuc, prot
def get_frame(geneseq, gene_HXB2, genename, VERBOSE=0):
    '''Get the reading frame of a gene by aligning translated proteins.

    Parameters:
       geneseq: iterable of characters (joined into a string) to be framed
       gene_HXB2: iterable of characters with the reference gene
       genename: gene name; for 'tat1'/'rev1' the reference is trimmed at
           its end to a whole number of codons, for 'tat2'/'rev2' it is
           trimmed at its start
       VERBOSE: accepted for interface compatibility, not used here

    Returns:
       The index (0, 1 or 2, as returned by numpy.argmax) of the frame of
       geneseq whose translation gets the best local alignment score
       against the translated reference.
    '''
    from seqanpy import align_local
    from Bio.Seq import translate
    from numpy import argmax

    geneseq = ''.join(geneseq)
    gene_HXB2 = ''.join(gene_HXB2)

    if genename in ('tat1', 'rev1'):
        gene_HXB2 = gene_HXB2[:len(gene_HXB2) - (len(gene_HXB2) % 3)]
    elif genename in ('tat2', 'rev2'):
        gene_HXB2 = gene_HXB2[len(gene_HXB2) % 3:]

    prot_HXB2 = translate(gene_HXB2)

    scores = []
    # range() instead of the Python-2-only xrange(); identical for 3 items.
    for frame in range(3):
        # Drop the leading `frame` bases, then any trailing partial codon.
        tmp = geneseq[frame:]
        tmp = tmp[:len(tmp) - (len(tmp) % 3)]
        tmp = translate(tmp)
        (score, ali1, ali2) = align_local(prot_HXB2, tmp)
        scores.append(score)

    return argmax(scores)
Ejemplo n.º 4
0
def test(dna, AA, codeline, iAA, fAA, cds, strand):
    """Check that one codon-change record is consistent with its CDS.

    ``codeline`` is a comma-separated record; the fields after the first
    comma are read as: codon start site, initial codon, codon end site
    and (after the last comma) the final codon.

    Returns True only if all of the following hold:
      * ``AA`` equals the first annotated translation of ``cds``;
      * ``dna[start - 1:end]`` equals the initial codon;
      * the initial codon translates to ``iAA``;
      * the final codon translates to ``fAA``.
    For ``strand == -1`` both codons are reverse-complemented before
    translation.  The first two failures also print a diagnostic.
    """
    first = codeline.find(',')
    second = codeline.find(',', first + 1)
    third = codeline.find(',', second + 1)
    last = codeline.rfind(',')

    codon_start = int(codeline[first + 1:second])
    initial_codon = codeline[second + 1:third]
    codon_end = int(codeline[third + 1:last])
    final_codon = codeline[last + 1:]

    translatable_initial = initial_codon
    translatable_final = final_codon
    if strand == -1:
        translatable_initial = reverse_complement(initial_codon)
        translatable_final = reverse_complement(final_codon)

    # Protein sequences must match up.
    if AA != cds.qualifiers['translation'][0]:
        print("AA seqs not equal")
        return False
    # The codon being modified must sit where the record says it does.
    observed = dna[codon_start - 1:codon_end]
    if observed != initial_codon:
        print(observed + '!=' + initial_codon + "  :so codeline doesnt match up")
        return False
    # Starting codon must encode the expected amino acid.
    if translate(translatable_initial) != iAA:
        return False
    # Final codon must encode the expected amino acid.
    if translate(translatable_final) != fAA:
        return False
    return True
Ejemplo n.º 5
0
def six_frame_translations(seq, genetic_code=1):
    """Formatted string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools,
    similar to DNA Strider's six-frame translation.

    The result starts with a header (timestamp, base counts, a shortened
    copy of the sequence, its length and GC percentage) followed by one
    section per 60 nucleotides showing the three forward frames, the
    sequence, its complement and the three reverse frames.

    e.g.
    from Bio.SeqUtils import six_frame_translations
    print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    """
    from Bio.Seq import reverse_complement, translate

    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        # Translate whole codons only; a trailing partial codon carries no
        # amino acid and only triggers a Biopython warning.
        fragment_length = 3 * ((length - i) // 3)
        frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code)
        # Reverse frames are shown right-to-left under the complement.
        frames[-(i + 1)] = translate(anti[i:i + fragment_length], genetic_code)[::-1]

    # create header
    if length > 20:
        short = "%s ... %s" % (seq[:10], seq[-10:])
    else:
        short = seq
    # TODO? Remove the date as this would spoil any unit test...
    date = time.strftime("%y %b %d, %X", time.localtime(time.time()))
    header = "GC_Frame: %s, " % date
    for nt in ["a", "t", "g", "c"]:
        header += "%s:%d " % (nt, seq.count(nt.upper()))

    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (short.lower(), length, GC(seq))
    res = header

    for i in range(0, length, 60):
        subseq = seq[i:i + 60]
        csubseq = comp[i:i + 60]
        # Integer division: p is used as a slice index into the proteins.
        p = i // 3
        res = res + "%d/%d\n" % (i + 1, p + 1)
        res = res + "  " + "  ".join(frames[3][p:p + 20]) + "\n"
        res = res + " " + "  ".join(frames[2][p:p + 20]) + "\n"
        res = res + "  ".join(frames[1][p:p + 20]) + "\n"
        # seq
        res = res + subseq.lower() + "%5d %%\n" % int(GC(subseq))
        res = res + csubseq.lower() + "\n"
        # - frames
        res = res + "  ".join(frames[-2][p:p + 20]) + " \n"
        res = res + " " + "  ".join(frames[-1][p:p + 20]) + "\n"
        res = res + "  " + "  ".join(frames[-3][p:p + 20]) + "\n\n"
    return res
Ejemplo n.º 6
0
def check_translation(sequence, translation, table=None) :
    """Verify three translation routes all give ``translation``.

    The routes are the ``translate`` method of ``sequence``, the
    ``translate`` function applied to ``sequence``, and the ``translate``
    function applied to ``str(sequence)``.  ``table`` is passed on only
    when it is not None.

    Returns True when all three agree; raises ValueError on the first
    disagreement.
    """
    extra = () if table is None else (table,)
    # Thunks keep the routes lazy: later ones are skipped after a mismatch.
    routes = (
        lambda: str(sequence.translate(*extra)),
        lambda: str(translate(sequence, *extra)),
        lambda: translate(str(sequence), *extra),
    )
    if any(translation != route() for route in routes):
        if table is None:
            raise ValueError("%s -> %s" % (sequence, translation))
        raise ValueError("%s -> %s (table %s)"
                         % (sequence, translation, table))
    return True
def get_our_costs_at_Rihn(costs_Rihn, costs_ours, data, aa_mutation_rates):
    '''Collect our fitness costs at the positions listed in costs_Rihn.

    Our 'median' costs are looked up by 'pos' and converted to numbers
    ('<0.001' becomes 0, '>0.1' becomes 1, anything else goes through
    float()).  For each row of costs_Rihn the 'mut' label is split into
    consensus residue, position and target residue; the position is
    shifted by 714 (NOTE(review): presumably an offset into pol - confirm)
    and a target-specific cost is computed with fitness_cost_mutation.

    Returns a tuple (comp, target_specific): comp is a DataFrame with the
    columns 'ours' and 'Rihn', target_specific a numpy array with one
    fitness_cost_mutation result per row.
    '''
    c = costs_ours.set_index('pos').loc[costs_Rihn['pos']]['median']
    rihn_by_pos = costs_Rihn.set_index('pos')

    numeric_costs = []
    target_specific = []
    for (_, mut), raw in zip(rihn_by_pos.iterrows(), c):
        numeric_costs.append(
            0 if raw == '<0.001' else 1 if raw == '>0.1' else float(raw))

        label = mut['mut']
        cons, target_aa = label[0], label[-1]
        ipos = int(label[1:-1]) + 714
        print(cons, mut['NL4-3'], translate(data['init_codon']['pol']['p2'][ipos]))
        target_specific.append(
            fitness_cost_mutation('pol', data, aa_mutation_rates, ipos,
                                  target_aa, nbootstraps=100))

    # Overwrite the looked-up costs in place with their numeric values.
    c[:] = numeric_costs

    comp = pd.concat([c, rihn_by_pos['cost']], axis=1)
    comp = comp.rename(columns={'median': 'ours', 'cost': 'Rihn'})
    return comp, np.array(target_specific)
Ejemplo n.º 8
0
def find_frame(read):
    """Return the 1-based frame with the smallest number of stop codons.

    The read is first cut to a multiple of three.  Each of the three
    frames is translated and its stop codons ('*') counted; if Biopython
    rejects the read with a TranslationError, ``gap_translation`` is used
    instead.  Ties are resolved in favour of the lower frame number.

    Warns when the best frame still contains stop codons, and when a
    second frame is also free of stop codons.
    """
    from Bio.Seq import translate
    from Bio.Data.CodonTable import TranslationError

    # Cut the read at a multiple-of-three length.
    rem = len(read) % 3
    if rem:
        read = read[:-rem]

    # Every frame gets the same number of whole codons.
    read_len = len(read) - 3
    try:
        counts = [(translate(read[f:read_len + f]).count("*"), f + 1) for f in range(3)]
    except TranslationError:
        counts = [(gap_translation(read[f:]).count("*"), f + 1) for f in range(3)]

    sor_cnt = sorted(counts)
    stop_codons, frame = sor_cnt[0]
    if stop_codons > 0:
        warnings.warn("The sequence %s contains %d stop codons" % (read, stop_codons))
    if sor_cnt[1][0] == 0:
        warnings.warn("Two frames are possible! %d and %d" % (frame, sor_cnt[1][1]))
    return frame
Ejemplo n.º 9
0
def PTRA(DNA, protein):
    """Return the genetic-code table ids under which DNA encodes protein.

    Each id in the fixed candidate list is tried; DNA is translated up to
    the first stop codon and compared with ``protein``.
    """
    candidate_tables = (1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15)
    return [
        table_id
        for table_id in candidate_tables
        if translate(DNA, table=table_id, stop_symbol='*', to_stop=True) == protein
    ]
Ejemplo n.º 10
0
 def get_local_codon(self, mutations, pos, mut=None):
     """Translate the codon that contains position ``pos``.

     ``mutations`` is indexed by position and each entry provides the
     reference base under the key 'ref'.  When ``mut`` is truthy it
     replaces the base at ``pos`` before translation.
     """
     offset = pos % 3
     first = pos - offset
     bases = ''.join(entry['ref'] for entry in mutations[first:first + 3])
     if mut:
         bases = bases[:offset] + mut + bases[offset + 1:]
     return translate(bases)
Ejemplo n.º 11
0
def translate_with_gaps(seq):
    '''Translate a sequence codon by codon, keeping whole-codon gaps.

    A '---' codon becomes '-'.  A codon mixing gaps and bases raises
    ValueError, as does a sequence whose length is not a multiple of 3.
    The result has the same kind as the input: a string for strings, a
    protein Seq for Seq, a SeqRecord (same id, name and description) for
    SeqRecord, and a numpy array of single characters for anything else.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq, translate
    from Bio.Alphabet.IUPAC import protein

    L = len(seq)
    if L % 3:
        raise ValueError('The sequence length is not a multiple of 3')

    seqstr = ''.join(seq)

    def codon_to_residue(codon):
        if codon == '---':
            return '-'
        if '-' in codon:
            raise ValueError('Non-aligned gaps found')
        return ''.join(translate(codon))

    prot = ''.join(codon_to_residue(seqstr[i:i + 3]) for i in range(0, L, 3))

    # Hand the protein back in the same kind of container as the input.
    if isinstance(seq, basestring):
        return prot
    if isinstance(seq, Seq):
        return Seq(prot, protein)
    if isinstance(seq, SeqRecord):
        return SeqRecord(Seq(prot, protein), id=seq.id, name=seq.name,
                         description=seq.description)
    import numpy as np
    return np.fromstring(prot, 'S1')
 def get_translation(self, sequence=None):
     """Translate ``sequence`` (or the spliced sequence) to protein.

     When ``sequence`` is empty or None, ``self.get_spliced_seq()`` is
     used instead.  Trailing bases that do not fill a codon are ignored.

     Returns the translation, or None when there is no sequence or it is
     shorter than one codon.
     """
     seq = sequence if sequence else self.get_spliced_seq()
     if not seq:
         return None
     # Floor division keeps this an int, so it stays a valid slice bound.
     usable = len(seq) // 3 * 3
     if usable < 3:
         return None
     return translate(seq[:usable])
Ejemplo n.º 13
0
def main(filename):
    """Print the id of the first genetic code translating the DNA to the protein.

    ``filename`` must contain exactly two lines: a DNA string and a
    protein string.  Only genetic-code ids that Biopython actually defines
    are tried, in ascending order; translating with an unknown id fails.

    Raises ValueError when no known table produces the protein.
    """
    from Bio.Data import CodonTable

    with open(filename) as fin:
        dna, prot = fin.read().strip().split('\n')

    # NCBI table ids are not contiguous, so iterate the defined ones.
    for table_id in sorted(CodonTable.unambiguous_dna_by_id):
        if translate(dna, table=table_id, stop_symbol='') == prot:
            print(table_id)
            return
    raise ValueError("no known genetic code translates the DNA into the protein")
Ejemplo n.º 14
0
def check_translation(sequence, translation, table=None):
    """Verify three translation routes all give ``translation``.

    Table 1 is used when ``table`` is None.  The routes are the
    ``translate`` method of ``sequence``, the ``translate`` function on
    ``sequence`` and the ``translate`` function on ``str(sequence)``.

    Returns True when all agree.  Otherwise raises ValueError, naming the
    first codon whose own translation differs from the expected residue
    when one can be found.
    """
    t = 1 if table is None else table
    if (translation == str(sequence.translate(t))
            and translation == str(translate(sequence, t))
            and translation == translate(str(sequence), t)):
        return True

    # Something disagreed: look for the offending codon for a better message.
    for index, amino in enumerate(translation):
        codon = sequence[index * 3:index * 3 + 3]
        codon_protein = codon.translate(t)
        if amino != str(codon_protein):
            raise ValueError("%s -> %s not %s (table %s)"
                             % (codon, amino, codon_protein, t))
    # Shouldn't reach this line:
    raise ValueError("%s -> %s (table %s)"
                     % (sequence, translation, t))
Ejemplo n.º 15
0
 def translate_read(self, read, start, end):
     """Translate the part of ``read`` inside ``[start, end)``.

     The read is trimmed with ``self.trim_read(..., codon=True)``, which
     returns the trimmed bases and an offset; translation starts at that
     offset.

     Returns ``(protein, protein_start)`` where ``protein_start`` is the
     integer codon index of the first translated codon relative to
     ``start``, never below 0.
     """
     trimmed_read, offset = self.trim_read(read, start, end, codon=True)
     prot_seq = translate(trimmed_read[offset:])

     # Floor division keeps the codon index an int on Python 3 as well.
     prot_start = max(0, (read.pos + offset - start) // 3)

     return prot_seq, prot_start
Ejemplo n.º 16
0
def start_chop_and_trans(s, strict=True):
    """Return ``(offset, trimmed nucleotides, protein)`` from the first in-frame start.

    ``s`` must be a whole number of codons and, in strict mode, end with
    one of ``stops``.  The first match of ``re_starts`` that lies in frame
    is used: strict mode translates it as a complete CDS, otherwise the
    first codon is reported as "M" and the rest is translated up to the
    first stop.  Returns ``(None, None, None)`` if there is no in-frame
    start.
    """
    if strict:
        assert s[-3:] in stops, s
    assert len(s) % 3 == 0
    for match in re_starts.finditer(s):
        start = match.start()
        if start % 3:
            # Start codon is out of frame; keep looking.
            continue
        n = s[start:]
        assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
        t = (translate(n, options.table, cds=True) if strict
             # Used when the stop codon is missing.
             else "M" + translate(n[3:], options.table, to_stop=True))
        return start, n, t
    return None, None, None
Ejemplo n.º 17
0
 def frame(self, seq, frame, translation_table = 1):
     """Translate ``seq`` in frame 1 with the given translation table.

     A ``frame`` outside -3..-1 and 1..3 is treated as frame 1.  Valid
     frames other than 1 are not supported and raise NotImplementedError
     (the expected meaning is 1, 2, 3 for the forward strand and
     -1, -2, -3 for the reverse strand).
     """
     recognised = (-3 <= frame <= -1) or (1 <= frame <= 3)
     if recognised and frame != 1:
         # TODO - Support the frame argument
         raise NotImplementedError
     return translate(seq, table=translation_table)
Ejemplo n.º 18
0
    def dNdS(self, reference, start, end):
        """Print the dS/dN ratio of the reads over ``[start, end)``.

        ``reference`` holds the reference bases of the region, indexed
        from 0 at ``start``.  For every reference codon the synonymous and
        non-synonymous site counts come from ``self._dNdS_sites``; each
        read that fully covers the codon contributes its synonymous and
        non-synonymous single-base differences, normalised by those site
        counts and averaged over the covering reads.  The per-codon
        proportions are averaged over the codon count, transformed with
        ``-3/4 * log(1 - 4p/3)`` and the ratio ds/dn is printed.

        Codons with no synonymous sites, or not covered by any read, add
        nothing to the sums.
        """
        ps, pn = 0, 0
        # Integer codon count, matching the original integer division.
        n_codon = (end - start) // 3
        for i in range(start, end, 3):
            ref_codon = reference[i - start:i - start + 3]
            ref_aa = translate(ref_codon)
            s_i, n_i = self._dNdS_sites(ref_codon)
            if s_i == 0:
                continue

            m_i = 0
            inner_s, inner_n = 0, 0
            reads = self.samfile.fetch('CONSENSUS_B_GAG_POL', start, end)
            for read in reads:
                trimmed_read = self.trim_read(read, i, i + 3, codon=False)
                if len(trimmed_read) < 3:
                    continue

                m_i += 1

                sij, nij = 0, 0
                for j, nt in enumerate(trimmed_read):
                    if nt == ref_codon[j]:
                        continue
                    mut_codon = ref_codon[:j] + nt + ref_codon[j + 1:]
                    if translate(mut_codon) == ref_aa:
                        sij += 1
                    else:
                        nij += 1

                inner_s += sij / s_i
                inner_n += nij / n_i

            if m_i == 0:
                # No read covers this codon: skip it instead of dividing
                # by zero.
                continue

            ps += inner_s / m_i
            pn += inner_n / m_i

        ps /= float(n_codon)
        pn /= float(n_codon)

        ds = -.75 * np.log(1 - 4 * ps / 3)
        dn = -.75 * np.log(1 - 4 * pn / 3)
        print(ds / dn)
Ejemplo n.º 19
0
    def _dNdS_sites(self, codon):
        syn = 0
        non = 0
        alphabet = 'ACGT'
        aa = translate(codon)
        if len(codon) < 3: return syn, non

        for i in range(3):
            for mut in alphabet:
                if mut == codon[i]: continue
                mut_codon = codon[:i] + mut + codon[i+1:]

                syn_flag = (aa == translate(mut_codon))
                syn += syn_flag
                non += (not syn_flag)
        
        syn /= 3.
        non /= 3.
        assert syn + non == 3
        return syn, non
Ejemplo n.º 20
0
def transl(seq):
    """Translate ``seq`` codon by codon, preserving gap codons.

    Codons come from ``codons(seq)``.  '---' is emitted as '-', '...' as
    '.', and every other codon is passed to ``translate``.
    """
    protein = ""
    for triplet in codons(seq):
        protein += ('-' if triplet == '---'
                    else '.' if triplet == '...'
                    else translate(triplet))
    return protein
Ejemplo n.º 21
0
def six_frames(seq, genetic_code=1):
    '''
    Return the six possible protein translations of a DNA sequence.

    For each offset 0, 1 and 2 the sequence and its reverse complement
    are cut to a whole number of codons starting at that offset and then
    translated with ``genetic_code``.

    The result is a dict keyed '+0', '+1', '+2' for the forward strand and
    '-0', '-1', '-2' for the reverse complement.
    '''
    rev = reverse_complement(seq)
    total = len(seq)

    frames = {}
    for shift in range(3):
        # End of the last whole codon when reading from `shift`.
        stop = total - (total - shift) % 3
        frames['%+d' % shift] = translate(seq[shift:stop], genetic_code)
        frames['-%d' % shift] = translate(rev[shift:stop], genetic_code)

    return frames
Ejemplo n.º 22
0
def writeSTF():
    """Search all six translated frames of ``record`` for each feature variation.

    Reads the module-level names ``record``, ``feature`` and
    ``featureStatistic_container`` and publishes its working state through
    the globals declared below.  For every match found it calls
    ``addFeatureSTF()`` (forward frames) or ``addFeatureComplSTF()``
    (reverse-complement frames) with no arguments.
    NOTE(review): those helpers presumably read the globals set here
    (``m``, ``featureName``, ...) - confirm against their definitions.
    """
    global difference, seqRecordToCheck, seqRecordToCheckComplement, variation, featureName, featureSeq, seqLength, m
    # Number of trailing bases that do not fill a complete codon.
    difference = len(record.seq) % 3
    seqRecordToCheck = str(record.seq)
    # Drop the trailing partial codon, if any.
    if difference != 0:
        seqRecordToCheck = str(record.seq)[:-difference]
    else:
        seqRecordToCheck = str(record.seq)
    seqRecordToCheckComplement = str(reverse_complement(seqRecordToCheck))
    # Reading Frames
    # Frames 2 and 3 rotate the sequence: the skipped leading bases are
    # appended at the end, so the length stays a multiple of three.
    firstReadingFrame = translate(seqRecordToCheck)
    secondReadingFrame = translate(seqRecordToCheck[1::] + seqRecordToCheck[0])
    thirdReadingFrame = translate(seqRecordToCheck[2::] + seqRecordToCheck[0:2])
    # Reading Frames (reverseComplement)
    firstReadingFrameComplement = translate(seqRecordToCheckComplement)
    secondReadingFrameComplement = translate(seqRecordToCheckComplement[1::] + seqRecordToCheckComplement[0])
    thirdReadingFrameComplement = translate(seqRecordToCheckComplement[2::] + seqRecordToCheckComplement[0:2])
    for variation in featureStatistic_container[feature]:
        featureName = variation.note
        featureSeq = str(variation.seq)
        featureLength = len(variation.seq)
        seqLength = len(seqRecordToCheck)

        # Append the first featureLength - 1 residues of each frame to its
        # end so that a match may wrap around the end of the frame.
        firstReadingFrameCircular = firstReadingFrame + firstReadingFrame[0:featureLength - 1]
        secondReadingFrameCircular = secondReadingFrame + secondReadingFrame[0:featureLength - 1]
        thirdReadingFrameCircular = thirdReadingFrame + thirdReadingFrame[0:featureLength - 1]

        firstReadingFrameComplementCircular = firstReadingFrameComplement + firstReadingFrameComplement[
                                                                            0:featureLength - 1]
        secondReadingFrameComplementCircular = secondReadingFrameComplement + secondReadingFrameComplement[
                                                                         0:featureLength - 1]
        thirdReadingFrameComplementCircular = thirdReadingFrameComplement + thirdReadingFrameComplement[
                                                                            0:featureLength - 1]

        # Find Matches
        # NOTE(review): featureSeq is used as a regular-expression pattern,
        # so characters such as '*' in it act as regex operators - confirm
        # this is intended.
        firstFrameMatchesCircular = re.finditer(featureSeq, firstReadingFrameCircular)
        secondFrameMatchesCircular = re.finditer(featureSeq, secondReadingFrameCircular)
        thirdFrameMatchesCircular = re.finditer(featureSeq, thirdReadingFrameCircular)

        firstFrameComplementMatchesCircular = re.finditer(featureSeq, firstReadingFrameComplementCircular)
        secondFrameComplementMatchesCircular = re.finditer(featureSeq, secondReadingFrameComplementCircular)
        thirdFrameComplementMatchesCircular = re.finditer(featureSeq, thirdReadingFrameComplementCircular)

        # Each loop rebinds the global ``m`` to the current match object
        # before invoking the handler.
        for m in firstFrameMatchesCircular:
            addFeatureSTF()

        for m in secondFrameMatchesCircular:
            addFeatureSTF()

        for m in thirdFrameMatchesCircular:
            addFeatureSTF()

        for m in firstFrameComplementMatchesCircular:
            addFeatureComplSTF()

        for m in secondFrameComplementMatchesCircular:
            addFeatureComplSTF()

        for m in thirdFrameComplementMatchesCircular:
            addFeatureComplSTF()
Ejemplo n.º 23
0
def ptra():
    """Print the id of the first genetic code translating the DNA to the protein.

    Reads a DNA string and a protein string from the first two lines of
    "rosalind_ptra.txt", echoes both, then tries every genetic-code id
    that Biopython defines in ascending order, translating up to the
    first stop codon.

    Raises ValueError when no known table produces the protein.
    """
    from Bio.Data import CodonTable

    with open("rosalind_ptra.txt") as f:
        dna = f.readline().strip()
        prot = f.readline().strip()

    print(dna)
    print(prot)

    # NCBI table ids are not contiguous, so iterate the defined ones
    # rather than counting upwards into ids that do not exist.
    for table in sorted(CodonTable.unambiguous_dna_by_id):
        if translate(dna, table=table, to_stop=True) == prot:
            print(table)
            return
    raise ValueError("no known genetic code translates the DNA into the protein")
Ejemplo n.º 24
0
def align_codon_pairwise(seqstr, refstr, **kwargs):
    '''Pairwise alignment via codons

    Both sequences are translated, the proteins are aligned globally and
    the alignment is mapped back to nucleotides: each aligned residue
    becomes the next codon of its sequence and each protein gap becomes
    '---'.

    Parameters:
       seqstr: first nucleotide sequence, length a multiple of 3
       refstr: second nucleotide sequence, length a multiple of 3
       **kwargs: passed down to SeqAn alignment function

    Returns:
       (aliseq, aliref): the two codon-aligned nucleotide strings

    Raises:
       ValueError: if either length is not a multiple of 3
    '''
    from Bio.Seq import translate
    from seqanpy import align_global

    if len(seqstr) % 3:
        raise ValueError('The length of the first sequence is not a multiple of 3')
    elif len(refstr) % 3:
        raise ValueError('The length of the second sequence is not a multiple of 3')

    seqpr = translate(seqstr)
    refpr = translate(refstr)
    (score, alis, alir) = align_global(seqpr, refpr, **kwargs)
    aliseq = []
    aliref = []
    # Next unread nucleotide position in each input sequence.
    poss = 0
    posr = 0
    # Built-in zip works on Python 2 and 3; itertools.izip is Python 2 only.
    for aas, aar in zip(alis, alir):
        if aas == '-':
            aliseq.append('---')
        else:
            aliseq.append(seqstr[poss: poss + 3])
            poss += 3

        if aar == '-':
            aliref.append('---')
        else:
            aliref.append(refstr[posr: posr + 3])
            posr += 3

    aliseq = ''.join(aliseq)
    aliref = ''.join(aliref)

    return (aliseq, aliref)
Ejemplo n.º 25
0
def writeBothToFile(seqFileName, newSeqLength, randFileName, transFileName):
    """Write a random gene and its translate/untranslate round trip to files.

    ``randFileName`` receives a header line and the gene produced by
    ``generateRandomGene(seqFileName, newSeqLength)``.  ``transFileName``
    receives a header line and ``untranslate(translate(gene))``.  Both
    files are closed even if generation or translation raises.
    """
    with open(randFileName, "w") as randOutFile:
        randOutFile.write("> Randomly generated gene of length %d \n" % int(newSeqLength))
        with open(transFileName, "w") as transOutFile:
            transOutFile.write("> AA translation and untranslation of randomly generated gene of length %d\n" % int(newSeqLength))

            sequence = generateRandomGene(seqFileName, newSeqLength)
            randOutFile.write(sequence)
            seqPerm = untranslate(translate(sequence))
            transOutFile.write(seqPerm)
Ejemplo n.º 26
0
def get_protein(record, feature):
    """Build a dict describing the protein encoded by ``feature``.

    The result has the keys 'identifier', 'description' (the 'product'
    qualifier, '' if absent) and 'sequence'.  The annotated translation
    is preferred; without one, the feature's region of ``record`` is
    translated.  A message goes to stderr when the sequence is empty.

    The feature's qualifiers are left unmodified.
    """
    protein = {}
    protein['identifier'] = fasta_identifier(flatten(feature.qualifiers))
    protein['description'] = mygetattr(feature.qualifiers, 'product', '')

    # prefer the annotated translation versus our own one
    if 'translation' in feature.qualifiers:
        # what if more than one translation?  Read the first one without
        # popping it, so repeated calls see the same qualifiers.
        protein['sequence'] = feature.qualifiers['translation'][0]
    else:
        protein['sequence'] = translate(get_seq_0_based(record.seq.data, feature.location.nofuzzy_start, feature.location.nofuzzy_end, feature.strand))
    if not len(protein['sequence']):
        sys.stderr.write("could not translate %s\n" % (protein['identifier']))

    return protein
Ejemplo n.º 27
0
 def test_the_translation_of_stops(self):
     """Check obj.translate() method with stop codons."""
     misc_stops = "TAATAGTGAAGAAGG"

     # (expected protein, positional args, keyword args) for Seq.translate,
     # listed in the order they are checked.
     method_cases = [
         ("***RR", (), {}),
         ("***RR", (1,), {}),
         ("***RR", ("SGC0",), {}),
         ("**W**", (), {"table": 2}),
         ("**WRR", (), {"table": 'Yeast Mitochondrial'}),
         ("**WSS", (), {"table": 5}),
         ("**WSS", (), {"table": 9}),
         ("**CRR", (), {"table": 'Euplotid Nuclear'}),
         ("***RR", (), {"table": 11}),
         ("***RR", (), {"table": '11'}),
         ("***RR", (), {"table": 'Bacterial'}),
         ("**GRR", (), {"table": 25}),
         ("", (), {"to_stop": True}),
         ("O*ORR", (), {"table": special_table}),
         ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
     ]
     # These test the Bio.Seq.translate() function - move these?:
     function_cases = [
         ("*QWRR", {"table": Chilodonella_uncinata_table}),
         ("O*ORR", {"table": special_table}),
         ("", {"to_stop": True}),
         ("***RR", {"table": 'Bacterial'}),
         ("***RR", {"table": '11'}),
         ("***RR", {"table": 11}),
         ("**W**", {"table": 2}),
     ]
     sequences = [Seq(misc_stops),
                  Seq(misc_stops, generic_nucleotide),
                  Seq(misc_stops, generic_dna),
                  Seq(misc_stops, unambiguous_dna)]
     for nuc in sequences:
         for expected, args, kwargs in method_cases:
             self.assertEqual(expected, str(nuc.translate(*args, **kwargs)))
         for expected, kwargs in function_cases:
             self.assertEqual(expected, translate(str(nuc), **kwargs))

     # Single codons, including ambiguous and mixed-case spellings.
     single_codons = [
         ("TAT", "Y"), ("TAR", "*"), ("TAN", "X"), ("NNN", "X"),
         ("TAt", "Y"), ("TaR", "*"), ("TaN", "X"), ("nnN", "X"),
         ("tat", "Y"), ("tar", "*"), ("tan", "X"), ("nnn", "X"),
     ]
     for codon, expected in single_codons:
         self.assertEqual(str(Seq(codon).translate()), expected)
Ejemplo n.º 28
0
    def translations(self):
        """
        Yield all six translations of a nucleotide sequence.

        The forward strand is translated in frames 0, 1 and 2, followed by
        the reverse complement in the same three frames.

        @return: A generator that produces six L{TranslatedRead} instances.
        """
        rc = self.reverseComplement().sequence
        for reverseComplemented in (False, True):
            strand = rc if reverseComplemented else self.sequence
            for frame in range(3):
                # Skip 0, 1, or 2 initial bases, depending on the frame.
                # Slicing copies the sequence, so 'N' bases can safely be
                # appended to bring its length to zero mod 3.
                suffix = strand[frame:]
                suffix += 'N' * (-len(suffix) % 3)
                yield TranslatedRead(self, translate(suffix), frame,
                                     reverseComplemented)
Ejemplo n.º 29
0
def apply_operation():
    """Run the selected operation on the input text and show the result.

    The codon table, the input sequence (with all whitespace removed) and
    the operation name are read from the widgets and echoed to stdout.
    The result replaces the content of the output widget; an unknown
    operation gives an empty result.
    """
    codon_table = codon_list.get(codon_list.curselection())
    print('Code: {}'.format(codon_table))

    seq = ''.join(input_text.get(1.0, tk.END).split())
    print('Input sequence: {}'.format(seq))

    operation = transform_var.get()
    print('Operation: {}'.format(operation))

    # Lazy handlers: only the selected operation is actually executed.
    handlers = {
        'transcribe': lambda: transcribe(seq),
        'translate': lambda: translate(seq, table=codon_table, to_stop=True),
        'back transcribe': lambda: back_transcribe(seq),
    }
    result = handlers.get(operation, lambda: '')()

    output_text.delete(1.0, tk.END)
    output_text.insert(tk.END, result)
    print('Result: {}'.format(result))
def process_seq(header, seq):
    """Print every peptide of the six-frame translation that is long enough.

    ``header`` is cut at its first space to form the identifier.  The
    sequence and its reverse complement are translated in three frames
    each without stopping at stop codons; every translation is split at
    '*' and each piece (with '*' appended) is stored under a
    "start:stop" coordinate key.  Pieces of at least
    ``args['minlength']`` characters are printed in FASTA style.

    Returns the number of peptides printed.
    """
    hits = 0
    seq_id = header.split(" ", 1)[0]
    # "forward" is the direction we originally have, "reverse" the
    # antisense strand.
    forward = Seq(seq)
    reverse = forward.reverse_complement()

    # TRANSLATE ALL POSSIBLE ORFs, do not stop at STOP codons
    translations = {}
    for offset in range(3):
        translations['+%d' % (offset + 1)] = translate(forward[offset:])
        translations['-%d' % (offset + 1)] = translate(reverse[offset:])

    # get all polypeptides between stops
    polypeptides = {}
    for frame, translation in translations.items():
        direction = -1 if int(frame) < 0 else 1
        if direction < 0:
            # Reverse frames are counted down from the end of the sequence.
            startpos = len(forward) + 1 + int(frame)
        else:
            startpos = int(frame)
        for peptide in translation.split('*'):
            peptide += '*'
            stoppos = startpos - direction + direction * 3 * len(peptide)
            polypeptides[str(startpos) + ':' + str(stoppos)] = str(peptide)
            startpos = stoppos + direction

    # filter out those shorter than minlength
    for key, pepseq in polypeptides.items():
        if len(pepseq) < args['minlength']:
            continue
        startpos, stoppos = [int(e) for e in key.split(":")]
        hits += 1
        print(">%s[%s:%s]" % (seq_id, startpos, stoppos))
        print(pepseq)
    return hits
def translate_dna_prot(dna, prot):
    """Return the first id in ``ncbi_ids`` whose table translates dna to prot.

    Stop codons are dropped from the translation before comparing.
    Returns None when no id matches.
    """
    matching_ids = (
        table_id
        for table_id in ncbi_ids
        if translate(dna, stop_symbol="", table=table_id) == prot
    )
    return next(matching_ids, None)
Ejemplo n.º 32
0
 def frame(self, seq, frame, translation_table=1):
     """Translate ``seq`` in the given reading frame.

     A negative ``frame`` selects the reverse complement.  The first
     ``abs(frame) - 1`` bases of the chosen strand are skipped before
     translating with ``translation_table``.
     """
     strand = reverse_complement(seq) if frame < 0 else seq
     return translate(strand[abs(frame) - 1:], table=translation_table)
Ejemplo n.º 33
0
# To transcribe from the template strand (3' -> 5'), reverse-complement it
# first. `template_dna` and `transcribe` are expected to be defined earlier
# in this script.
transcribe(template_dna.reverse_complement())

# Transcribing back to DNA:
from Bio.Seq import Seq, back_transcribe
# NOTE(review): `messenger_rna` is only assigned further down in this
# fragment; it presumably also exists from an earlier part of the script.
back_transcribe(
    messenger_rna)  # just changes U -> T and gives the coding strand

# 3.8 Translation (mRNA -> Protein)
# Uses the standard genetic code when no table is given.
from Bio.Seq import Seq, translate
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
                    IUPAC.unambiguous_rna)
translate(messenger_rna)

# Direct translation (DNA -> Protein)
from Bio.Seq import Seq, translate
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
                 IUPAC.unambiguous_dna)
translate(coding_dna)

# Other translation tables can be selected by name...
translate(coding_dna, table="Vertebrate Mitochondrial")
# ...or by NCBI number.
translate(coding_dna, table=2)

# 3.9 Transcription and Translation
Ejemplo n.º 34
0
def six_frame_translations(seq, genetic_code=1):
    """Return pretty string showing the 6 frame translations and GC content.

    The layout follows xbbtools and resembles DNA Strider's six-frame
    translation: a header with base counts, a shortened copy of the
    sequence, its length and GC percentage, then one section per 60
    nucleotides with the three forward frames, the sequence, its
    complement and the three reverse frames.

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
      G  H  C  N  G  P  L
     W  P  L  *  W  A  A
    M  A  I  V  M  G  R  *
    auggccauuguaaugggccgcuga   54 %
    uaccgguaacauuacccggcgacu
    A  M  T  I  P  R  Q
     H  G  N  Y  H  A  A  S
      P  W  Q  L  P  G  S
    <BLANKLINE>
    <BLANKLINE>

    """  # noqa for pep8 W291 trailing whitespace
    from Bio.Seq import reverse_complement, reverse_complement_rna, translate

    if "u" in seq.lower():
        anti = reverse_complement_rna(seq)
    else:
        anti = reverse_complement(seq, inplace=False)  # TODO: remove inplace=False
    comp = anti[::-1]
    length = len(seq)

    # Forward frames are keyed 1..3, reverse frames -1..-3; reverse
    # proteins are stored reversed so they read along the complement.
    frames = {}
    for offset in range(3):
        whole = 3 * ((length - offset) // 3)
        frames[offset + 1] = translate(seq[offset : offset + whole], genetic_code)
        frames[-(offset + 1)] = translate(anti[offset : offset + whole], genetic_code)[::-1]

    # create header
    short = "%s ... %s" % (seq[:10], seq[-10:]) if length > 20 else seq
    header = "GC_Frame:"
    header += "".join(" %s:%d" % (nt, seq.count(nt.upper())) for nt in "atgc")
    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (
        short.lower(),
        length,
        GC(seq),
    )

    def frame_row(key, first_residue):
        # Twenty residues of one frame, two spaces apart.
        return "  ".join(frames[key][first_residue : first_residue + 20])

    res = header
    for i in range(0, length, 60):
        subseq = seq[i : i + 60]
        csubseq = comp[i : i + 60]
        p = i // 3
        res += "%d/%d\n" % (i + 1, i / 3 + 1)
        # + frames
        res += "  " + frame_row(3, p) + "\n"
        res += " " + frame_row(2, p) + "\n"
        res += frame_row(1, p) + "\n"
        # seq
        res += subseq.lower() + "%5d %%\n" % int(GC(subseq))
        res += csubseq.lower() + "\n"
        # - frames
        res += frame_row(-2, p) + "\n"
        res += " " + frame_row(-1, p) + "\n"
        res += "  " + frame_row(-3, p) + "\n\n"
    return res
Ejemplo n.º 35
0
def dna_to_amino_acid(dna_seq):
    """Translate ``dna_seq`` and return the result.

    The argument is handed to ``translate`` unchanged, so that function's
    default settings apply and its return value is passed straight back.
    """
    protein = translate(dna_seq)
    return protein
Ejemplo n.º 36
0
from Bio.Seq import translate
with open("rosalind_prot.txt") as p:
    # Remove all whitespace (notably the trailing newline): a newline that
    # ends up inside a codon is not a valid codon for translate().
    myfile = "".join(p.read().split())
# stop_symbol="" makes stop codons contribute nothing to the printed protein.
print(translate(myfile, stop_symbol=""))
Ejemplo n.º 37
0
def translation(DNA):
    """Return the result of ``translate(DNA, to_stop=True)``.

    NOTE(review): assuming ``translate`` is ``Bio.Seq.translate``,
    ``to_stop=True`` ends the protein at the first in-frame stop codon and
    leaves the stop symbol out of the result.
    """
    protein = translate(DNA, to_stop=True)
    return protein
Ejemplo n.º 38
0
import warnings

from Bio.Alphabet import IUPAC
from Bio.Seq import Seq, translate

# Suppress every warning for the remainder of the run.
warnings.filterwarnings('ignore')

with open('rosalind_orfr.txt', 'r') as handle:
    seq = Seq(handle.read().strip(), IUPAC.unambiguous_dna)

# The input strand followed by its reverse complement.
strands = [seq, seq.reverse_complement()]

# Try the three frame offsets on each strand, translating up to the first
# stop codon, and keep the first strictly longest result.
longest_protein = ''
for offset in range(3):
    for strand in strands:
        candidate = translate(strand[offset:], to_stop=True)
        if len(candidate) > len(longest_protein):
            longest_protein = candidate

print(longest_protein)
Ejemplo n.º 39
0
from Bio.Seq import translate

# Read the DNA string; the context manager closes the file handle.
with open("rosalind_orfr.txt", "r") as handle:
    DNA_seq = handle.read().rstrip()

# Reverse complement. Bases are first mapped to lowercase placeholders so a
# base that has already been substituted is not substituted again, then the
# string is reversed and upper-cased.
rc_seq = DNA_seq.replace("A", 't').replace("C", 'g').replace("G", 'c').replace("T", 'a')[::-1].upper()

# Start positions of every ATG on each strand.
forward_orf_ind = [i for i in range(len(DNA_seq) - 3 + 1) if DNA_seq[i:i + 3] == 'ATG']
reverse_orf_ind = [i for i in range(len(rc_seq) - 3 + 1) if rc_seq[i:i + 3] == 'ATG']

# Translate from each ATG up to the first stop codon.
list_of_AASeq = [translate(DNA_seq[idx:], to_stop=True) for idx in forward_orf_ind]
list_of_AASeq += [translate(rc_seq[idx:], to_stop=True) for idx in reverse_orf_ind]

# print(...) with a single argument behaves the same on Python 2 and 3.
print(max(list_of_AASeq, key=len))
Ejemplo n.º 40
0
    def check_emboss_translate(self, sequence, table=None, frame=None):
        """Run EMBOSS transeq on ``sequence`` and compare it with Biopython.

        Sequences shorter than 100 letters are passed on the command line
        (``asis:``); longer ones are written to a temporary FASTA file.
        The tool's output is parsed as FASTA and asserted equal to the
        Biopython translations of ``sequence``, both as a whole and codon
        by codon. Nothing is returned; failures surface as assertion
        errors.

        Args:
            sequence: Sequence object to translate; it must offer
                ``translate`` and slicing that yields objects with
                ``translate``.
            table: Genetic code passed to ``-table``; ``None`` omits the
                option and compares against table 1.
            frame: Value passed to ``-frame``; ``None`` omits the option.
        """
        # TODO - Support transeq in Bio.Emboss.Applications?
        # (doesn't seem worthwhile as Biopython can do translations)

        # Setup,
        cline = exes["transeq"]

        if len(sequence) < 100:
            filename = None
            cline += " -sequence asis:%s" % sequence
        else:
            # There are limits on command line string lengths...
            # use a temp file instead.
            filename = "Emboss/temp_transeq.txt"
            SeqIO.write(SeqRecord(sequence, id="Test"), filename, "fasta")
            cline += " -sequence %s" % filename

        cline += " -auto"  # no prompting
        cline += " -filter"  # use stdout
        if table is not None:
            cline += " -table %s" % str(table)
        if frame is not None:
            cline += " -frame %s" % str(frame)
        # Run the tool,
        try:
            child = subprocess.Popen(
                str(cline),
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                shell=(sys.platform != "win32"),
            )
            out, err = child.communicate()
        finally:
            # communicate() only returns once the tool has exited, so the
            # temp input is no longer needed. Removing it here means a
            # failing assertion below cannot leave it behind.
            if filename:
                os.remove(filename)

        msg = "cline='%s'" % cline
        # Check no error output:
        self.assertEqual(err, "", msg=msg)

        # Check we could read its output
        record = SeqIO.read(StringIO(out), "fasta")

        result = child.wait()
        self.assertEqual(result, 0, msg=msg)

        # The record id reflects how the sequence was supplied.
        expected_prefix = "Test" if filename else "asis"
        self.assertTrue(record.id.startswith(expected_prefix), msg=msg)

        translation = record.seq
        if table is None:
            table = 1
        self.assertEqual(translation, sequence.translate(table))
        self.assertEqual(translation, translate(sequence, table))
        self.assertEqual(translation, translate(str(sequence), table))
        # More details...
        for i, amino in enumerate(translation):
            codon = sequence[i * 3 : i * 3 + 3]
            msg = "codon %s, table %s" % (codon, table)
            self.assertEqual(amino, codon.translate(table), msg=msg)
Ejemplo n.º 41
0
def dna2prot(seq):
    """Translate DNA sequence to protein sequence.

    ``seq`` is wrapped in a ``Seq``, translated with ``translate``'s default
    settings, and the protein is returned as a plain ``str``.
    """
    # str() replaces Seq.tostring(), which was deprecated and then removed
    # from Biopython; str() works on old and new releases alike.
    return str(translate(Seq(seq)))
Ejemplo n.º 42
0
def collect_ccds_record(listObject, data_dict, rev=True):
    """Fetch, frame-pad and translate the CCDS segments of each gene.

    For every gene name in ``listObject`` that has an entry in
    ``data_dict``, each ``"start-stop"`` range in ``entry["pos"]`` is
    fetched from the Entrez ``nucleotide`` database (record
    ``entry["id"]``), optionally reverse-complemented, padded with ``N``
    to a multiple of three, and translated.

    Args:
        listObject: Iterable of gene names.
        data_dict: Mapping of gene name to a dict with keys ``"id"`` and
            ``"pos"`` (iterable of ``"start-stop"`` strings). Genes that
            are missing from the mapping are skipped.
        rev: When ``True`` the ranges are processed in descending order
            and every fetched sequence is reverse-complemented.

    Returns:
        A 4-tuple ``(record_with_frame, rejected, orderCCDS,
        record_original)``:

        * ``record_with_frame`` - gene name -> list of fetched records
          whose ``seq`` has been replaced by the translation.
        * ``rejected`` - set of gene names abandoned because a stop
          codon showed up where the checks below do not allow one.
        * ``orderCCDS`` - gene name -> list of ``(start, stop)`` tuples in
          processing order (only for genes found in ``data_dict``).
        * ``record_original`` - gene name -> list of ``[record id,
          padded nucleotide string]``.
    """
    orderCCDS = dict()
    record_with_frame = dict()
    record_original = dict()
    # Must be a list: gene names are appended below and the collection is
    # turned into a set on return (a dict has no append()).
    new_gene_list = []
    # Keep the original ``rev == True`` comparison so non-bool arguments are
    # treated exactly as before.
    reverse = rev == True  # noqa: E712

    for geneName in listObject:
        record_with_frame[geneName] = list()
        record_original[geneName] = list()

        try:
            ccds_object = data_dict[geneName]
        except KeyError:
            continue

        ccds_positions = sorted(
            (int(x.split("-")[0]), int(x.split("-")[1]))
            for x in ccds_object["pos"])
        if reverse:
            ccds_positions = ccds_positions[::-1]

        orderCCDS[geneName] = ccds_positions

        # Number of N's to prepend to the next segment (0 = in frame).
        remaining = 0
        first_flag = False

        for seq_coord in ccds_positions:

            # Retry until the download succeeds.
            # NOTE(review): there is no retry limit, so a persistent network
            # failure loops forever - confirm that this is intended.
            record = None
            while record is None:
                try:
                    handle_in = Entrez.efetch(db="nucleotide",
                                              id=ccds_object["id"],
                                              rettype="fasta",
                                              strand=+1,
                                              seq_start=seq_coord[0] + 1,
                                              seq_stop=seq_coord[1] + 1)
                    try:
                        record = SeqIO.read(handle_in, "fasta")
                    finally:
                        # Close straight after reading so the handle is
                        # released on the early ``break`` paths as well.
                        handle_in.close()
                except (IOError, httplib.HTTPException):
                    continue

            if reverse:
                oriented = record.seq.reverse_complement()
            else:
                oriented = record.seq
            # Bases beyond the last complete codon of the fetched segment.
            overhang = len(record.seq) % 3

            if remaining != 0:
                # Only reachable after the first segment: prepend the N's
                # carried over from the previous segment, then pad the end
                # up to a whole number of codons.
                sequenceObj = Seq("N" * remaining, generic_dna) + oriented
                remaining = len(sequenceObj) % 3
                if remaining != 0:
                    sequenceObj = sequenceObj + Seq(
                        "N" * (3 - remaining), generic_dna)
                record_original[geneName].append(
                    [record.id, str(sequenceObj)])
                record.seq = translate(sequenceObj)
            else:
                if overhang != 0:
                    sequenceObj = oriented + Seq(
                        "N" * (3 - overhang), generic_dna)
                else:
                    sequenceObj = oriented
                record_original[geneName].append(
                    [record.id, str(sequenceObj)])
                inseq = translate(sequenceObj)
                # Only a padded first segment is rejected for containing any
                # stop codon at all.
                if not first_flag and overhang != 0 and "*" in inseq:
                    new_gene_list.append(geneName)
                    break
                remaining = overhang
                record.seq = inseq

            first_flag = True

            # More than one stop codon in a segment rejects the gene.
            if record.seq.count("*") > 1:
                new_gene_list.append(geneName)
                break

            print(geneName, record.id, record.seq)
            record_with_frame[geneName].append(record)

    return record_with_frame, set(new_gene_list), orderCCDS, record_original
from Bio import SeqIO
from Bio.Seq import Seq, transcribe, translate
import sys

file_out = 'gene_seq_out.fasta'

if __name__ == '__main__':
    # Every command-line argument after the script name is an input FASTA file.
    file_in = sys.argv[1:]

    # Open the output file once and append the records of every input file.
    with open(file_out, 'w') as f_out:
        for filename in file_in:
            # Materialise the records so the input handle can be closed
            # before any further processing.
            with open(filename, mode='r') as f_in:
                records = list(SeqIO.parse(f_in, 'fasta'))
            # Warn on stderr when the file held no FASTA records at all.
            if not records:
                print(f'WARNING file: {filename}: Empty file', file=sys.stderr)
            for seq_record in records:
                # Copy the record unchanged into the output FASTA file ...
                SeqIO.write(seq_record, f_out, 'fasta')
                # ... and print its id and translation to stdout.
                print('>' + seq_record.id)
                print(translate(seq_record.seq))
Ejemplo n.º 44
0
#!/usr/bin/python

'''
translate nucleotide sequences into protein sequences 
'''

import sys,argparse
import rjvbio.seq
import Bio.SeqIO,Bio.SeqRecord
from Bio.Seq import translate, Seq

ap = argparse.ArgumentParser(description=__doc__,formatter_class=argparse.ArgumentDefaultsHelpFormatter)
ap.add_argument('--inp',nargs='+',required=True,type=str,help='input sequence file(s)')
ap.add_argument('--inpformat',default='fasta',type=str,help='format of input file(s), eg fasta,fastq,genbank see http://biopython.org/wiki/SeqIO#File_Formats for details')
ap.add_argument('--out',default='STDOUT',type=str,help='output FASTA file')
conf = ap.parse_args()

if conf.out == 'STDOUT':
    fout = sys.stdout
else:
    # Text mode: SeqIO.write emits str, which a binary handle rejects on
    # Python 3.
    fout = open(conf.out,'w')

try:
    # Translate every record of every input file and write it as FASTA,
    # keeping the record id and dropping the description.
    for fname in conf.inp:
        for rec in Bio.SeqIO.parse(fname,conf.inpformat):
            seq = translate(rec.seq)
            newrec = Bio.SeqRecord.SeqRecord(seq, id=rec.id,description='')
            Bio.SeqIO.write(newrec,fout,"fasta")
finally:
    # Close the output file even when parsing or translation fails;
    # stdout is left open.
    if conf.out != 'STDOUT':
        fout.close()
Ejemplo n.º 45
0
# (at your option) any later version.

# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>

from Bio.Seq import translate
import sys

with open('c:/Users/Weka/Downloads/rosalind_ptra(2).txt') as f:
    for line_number, raw_line in enumerate(f):
        text = raw_line.strip()
        print(text)
        # First line: the coding DNA.
        if line_number == 0:
            coding_dna = text
        # Second line: the expected protein. Try each listed genetic code
        # table and stop the program at the first exact match.
        if line_number == 1:
            out = text
            for table in (1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15):
                print(table)
                translated = translate(coding_dna, table=table, to_stop=True)
                if translated != out:
                    print('Not matched: ' + translated)
                    continue
                print(table, translated)
                sys.exit()
Ejemplo n.º 46
0
 def translate(self, codon_table):
     """Translate the source text box into the destination text box.

     All whitespace is removed from the source text, the cleaned sequence
     is printed, and its translation with ``codon_table`` (cut at the first
     stop codon via ``to_stop=True``) replaces the destination text.
     """
     raw_text = self.src_text.GetValue()
     seq = "".join(raw_text.split())  # remove whitespace
     print(seq)
     self.dest_text.Clear()
     # Inside the method body ``translate`` resolves to the module-level
     # function, not to this method.
     protein = translate(seq, table=codon_table, to_stop=True)
     self.dest_text.SetValue(protein)
from Bio.Seq import translate
dna = "ATGCAGACCAAACACGAGCAGTTCCGTGCGCAATCCGCGACCGCCGGATCTTCTAGTCACTGGACCACCTCGACCACCTTCGGAACCGAGTCTCTGGCGGATAAGCGTGGGCTGGATATCCCGACGTCCCGATTCGCAGTTACACCAGTATCCGCCAACGGACACGATGGTTTTGAGATCGCTCACTCCGACGCTCCTCGCCACCAAGGCGCAATATTGCCGAACACAGCCGTGAAAGCAACCTGCCTTTTTGATGAGCTAGTTACAGTCACTGTGAATCCGCCGCGCCGAAAGCTGATTTATTCCGGTTACCTAGAATGGATGAATATACTCGAACATTATTATGAAACAGTATCTGGTCCTAAGTTGGCGTTGTATATTTATCTGCCCTGTCTCGTCACTGGATCAACTTCCTGTATCGTTAACTCTCCTTGATCTCTTATCAATATTGGGGCTCTAAAGCTACGTTTTCGAGTGGGAAACGCTGCGGCTCCTTATGTGAAGCGCAAAAGTCTTTTAACGATAAGTTGCATCACGACACACCCGGTGCAGAAAGAATCTTCCCCACCCTCCCGTGGTGAATGTAAACCTGGGGCCGGTTATCCCGGTGGTGAATACTTGAATATCCATCCTGGATTAGTGGTACGCACTCCTGGTAGCGCCCGTCCCTTATACTTACCGTGAGATTCTTCGTCGACCTCTGGCGTCGCCAATAAGTACAAATGCTTCAGTGCTTCTAACATTGACGATAACGGAAAACTTGTCGGAGAGGCGGGGTTACGGACACCGAACTTAACGCCTTACCATATGTTACAATTACGATCGGCCATCCCAATCTGCACACTCGATAAACGCCAAACGAGTGCAAACCCGAGTAGCATTAGCCTTCCCGCTAAACTGCCGCGAATCACCGCCGCCATCTGCCCCTTCCGGGAGCAATCTTCCTCATGAAATAAACCACCGCCGTCGAGTCTATGCACACTATGAGTCCTCTGGAAGTACCTGATGGGCTGCCGGGAAAATCCCCCCGCGTTGAGTGCGGATCAGCGTTGAAATCCGGCTCACGGCTCACTGGTGAGTATCTTCACATGAACATGGGGTCACGCTCGTGTAGACCTGGCGGACGTCGTGGACGAGCCACCTATAGTCAATGGGCTAGCAGAAACCATACCTCAATATTCAAACAACACGACGTACGGGGATAATCTACTGCCGACGAGTATGGCGTACGATGAACCTACCTGAATCGCCACGCTCATATTACTTACGGGGTGTTGAGTCCTCCTTTACTTGAGCACTAGCCCACACGAGTTGTTGTTAGTCGATTCCCTCGTTCTCAGTGACTTAGACTTGTCCACATGCCCTGAGAAGAGCAACGACGGTGAATACCCTTGCCGCTATATAAAGCCGCGGGAAAAAGACCTGGGCACCGTTAAACAGTCGTTTGCAATTTCTTTACGCCTATTAATGTCAAACCATTGGATGGCCCCGTCGCCGGAGGGGCCGAAAAGTTTGCGGTCTTATGGCCTCTTTACTCGGAACTGGAGTATTTATTCACCCTGACTTGCGCCTGTATATGGCTATTCCACATTCTTTAATATTCATTCGGTCGAGCTTCCGTTGCCCTCTAAGTCACCACTGTCCATCGTGTCCTCAACTAGCCTCTTATCCGTCTGTCTACGTACTACAACCCGTTCGGGGTGATACTATGATTGGATACTTTCACTGCTAGGCTGGCAGGATTCGTTCAACTATTTGTTCCAGGCCACGGCGACGTCCGACGTAAACGAGCTTTGTCGAAAATGAATCTTTTGGTCTTTACTACGATGCTACCCAACAGAAGATAAGAATGGGAACAATTGAAGTCCATTTGCCATTAATCCTCCGGCTAAAAAATCGGCGAGTCCCGTACCCCCCTCATTTGTAGACTGAGACGAGGTTGTCACCATTCCACCTGATGGCATCTACTCCCAAGCCTCATTTGGGGAGGCTC
TGGGACTTAATAGCGCGCCTAACCGAATTCGCAACGCTATTCCCAATAAGGCGGAGCGAACACCTACCTCGATAGGATCCATTAGCTGTGTAGATGACCACATGGCGGAGATTACGGGACAACAGAGCCCCTTAAATCGGTATGAAGTAGCAGGTTGATACCCTGCTGAAACTTGAATCTATGTAAAGGACAAGGTTAATGCTTACATCTGTTTGGCTCACCTGTTCCATAGTCATCCTTCGGTATCTTATCTCAGCGGGCCTAACACATACCGTCTCATCCGGTCAATAAATTCGCGGTCAGAAATTCTATGGCCCGCTTATATCCCCCCGGTCGGTTGCCACCGCAGCATGATTCAAGGATCACTTCCCGATCAACACGCACATCTGCGTACGTGCTGTCAACTACTAAGTAGTGCTGCACGCGACCTGGTTTTGAGTGGCCTAGGGTTTCCGGTTAACTGATATTTACACACATGCAGTCATTCTTGGATCCCCACGCGAAAGAATCAGAGTATGTGGGATCTAGGTACGGTGTCGGGTCGACTACAAGTGGGCCTAGGGTGCTCATCAGGCCGGGGTATGGGTTCCTACAAAGGGCAGGAGGGCATGAACGCATTTTCACGTGAGGGTGTGCGCCGACAGGGAAATCCGCGAGATCATGCGCGTGAAGTTCACCGGAACGCGACACGATCAATCCACTGTAGCGAACGTGTGACGGGTACTCACGAACTCGGTTGCCCCGCGTACATCCCTATACTCCGGCCCCATGATTTGAAACATGGCAATCCACACCGCTGCGATTACGAAGGACCCCTTAGTCCTCGTGCAACCCTGAGCGACGTGCGCTGAGTAAGCTGCCGCAGCTATGAATTAATATCTGATGTGGGACCCGCGGGAGTGTTTGAGCGATCGACCACGTATATTGTGTCACTGGCCACCGGGAACCTCTGTCCACGACTTGAAAACATTCGGGTTGGGCGCCGAACTCTAGTTGCCATGGATATGTCGCGGGTTATATTAAAACGCGACACAGCTAACGGGTGCCTCACACGCGCTGCTATGACTGACTTGAGCTCTTGACAACAAGGAATCGTATTCCGGTCATCTTTCTTGACAGAGCGACACAACGACTGCTTCCTCGGTGACACCCGGCTTGAGTGGCAGGAGGTCGTCGCACTACCTGCTCATTTTTCCGCTAAAAACAATGTACCGCTGACACACTACGCTAGTCAACAGTGCTATTATTTACAGCCCTGCTTGGGGACCGATATAATTGCTGTCAGTTTCACCGATCCAAAGGGGCGTCCAGCACCGGGAACTACGAGCCTCCCAGACGACCGTCGGGCCCGCTCTGGCACTAAGTTCGATGAGGATGCCAATTCCGCCAAACCGGACGACCGCATACCCAAGGATCGTTCTTTACCTAAGCTAACCTCACTGGGGTTCTGCGTGACTCAATCACCATATGTTTTTTTTGTTGAAGTGTCAGAACATTCCTGGTGCTTGATGTCTTACCATTTGTGACCCTCGTGTTCTTTGTATAACATGGGCACGTTAGCAATTATGCTTCCCGAACGACCTCTAGCCGGAAAGCTCGCTCGACCCGGAATTAAATGTCCGGCGGTCCGGGAAATCAAGTTTCGTCCAATCGGTTGCGGGATCTTTGTAAAGGTCCCCTTCTCTCGATGTGCGCGTGGTCAATTCTCGGGGAACTTCGTCGGGTCCCGATGGCCGTTCGGATATTCGGCCCGAGGCGACATCGCGCGTAAATGATATGAGTTGGACCCGTCGGACGGCACGTCGCGCTTCCCAGTGTGACTGCACTTTGTCATAAGCAGTGGCTCTCACCCGGGAAAGTGTTTGGACCCGCAACAGCAGTTTGAAGTACACTTCGCCGGGACGTTTTCTTTATTCGTGGTTACCACAGCGAGCTACTGAGACTGGGGGCCTCAATGCAATCTATGTATCAGTGAATGTTGTGGCCAAGAGCTCTCCTTA
TGTCGTGATAACGTTATTTATTTCACGGAAACACAAGACATGTGGCTGTGTGCTGGATTACAGGGGGCTGCTCGCCTCATAACACTTTGAGAGCAGTCTGTAGCCGAGAGCAGTACAGTTCCCTGATATCGGTGAATTGTCCGATGTCCCTATAAACCAGACGGATTTTTGGGGTTTGCAGACCACGGTCCATACATACTATATTATCGCATTTCCGGGGGCGCCAGCGATACTAACTCGGAACATCGTGGAGGGGAACTCATGTCAGAGCGAGGCGTTAACGAAGACTATTCATCGATATTCGATATGAGTTCGTGTGGGAACGACCAGCGCAAGATGCGCAATCCGGGATCCTGATGGTACTATGAAGTCATACCTCGTGCAGGTGTCCGTCTTTTGCGTATCTGGTGCAGCGTAGGGCTATGTTATCCCTCGGGACATAATAGTATCCCAGCTAGTAGTGACATCTTTAGCCCCGGGACTGTGCAATTATCATGAACTCAAGTTCTACTCTCTCACTCTCCAAAGACCAATTGGGTAAAATTTATTTCCCGCCTGTCAGACTTGTTGGCTCAAAGCTTCTGCATAAAGGCAAGCAGCGCATCATGAAATGGCGGCCTATGGGCGGACAAGGCGGTTACTGCCATCTATCGTATGTTTAGCGCTGACCTGTCGCGAGGCATTGCTGGATTAACTTTAACAGGAAATCGAACCATAAATCGGTTGGATCTTTCGTTGATCCACAGCTTCGGGGCCACTTCCATGTACTGCGACTCACGTTTTATGTGATTTCCTGCTTACGTGGTATATGGCGCCGTCCGGCGATGAACTAACTACGAGACTAGCGGTTGAACATTCCACTACGCTTCGGTAGCCATTACTTGTGCTAATACCTTTGTCGGATCTCAAGGTATACTCGGTTTCTCCAATATATCATTGCTGCCGCTCGACCGGCCCGTGGCGTCCCCACCCCTATACCGGATGTATGAGCGTTCTGGGAGTAAAGCGCGCCACACACCGAGCCCATTCGACGCGCAAAATGACGGACGTCATCATCGGTCACTTACCTTCGAGTTGTTTTCGGGAGTTGGCTTATTAACTTGATTCAGTGGGTTACAGCGGGAACTAGACCAGCACGCCACTGACCTATTCATAATAGCTGTGGAACCGATGATCATACTGTTGTTTGCTTATATGGTAGAATTTCTGTCCAGTGGGCTGAATCTAAGCATTGCGCGATTTGCTGAAGAGCCCACCCGGTCTTGAATTATGCTACCCGCCTCCTGCGAGGATGATCTATGAATGCGATCGCGTAGCTCAGAATTTTGATCCCACGAAATATTGCTGTTAGATGCAAGCTCTGGCGCTTATGTGTATAAAGCTTGAACCTCCCGAGGCGCTGTGTCAAGTACGAGTTTCGGCCGCATTTGGGTGCCCACTACCTTATTGGGAATACTTCCGTGAAGTAGCGTCAATCATCAGTCACGGACCTCGCCCCCCTTTGTACTTGCGTCAATGGGACTGACCGCAATGCAAGATTGATCACTCGCCTATGGGCAATGGCACATAGAGCACTGGTTTCTAATTTCGAACCGGGTCCGGCGTTGCGACTCGAGTCCGGAGCCTGGCTGTGTATTTCGACCCGTCGTCGGGGGGTGCCCTATGTCTGCCTGTTTGGAACATTCCGATCTTAAGAGTCGCTTAATCCCTGCTGATAACAGCTCACGGGGCGGTCGTCAGTCGTATGCGACCCTCCACAACACTGGAAACTTGTGAGCAACCCGTCATACCGTTGTTGGTCGGACTATCCGCTGCAATGCCCTCTCTCAGCTATTTTGTCGCGCGTTTAACAATTATGTGATGACTGAGAGCTCCCGTCCAAAGCATTCGGATGCAAAACTGTCAAAGGGCGGCTTGATGTCCTTAAAAATCATCGAAGTTATGCGCTCCATAGCCAGTTCAGACCCCTACGCCGGGAGCATTACTCTTTTGTTTAAACA
GAACACCACCCGAGCACTGATAGTGTACATACTGGAAGAACTAAACTTTGTAGGGCAATCTCGACATGGGTCAGATCCGGCCGATCTTATTTTCTTCTGCTCGCCGTGACTCGGTCAGCAACGCAATAAGCATATTACCCCACACTTGACGGTTGTGGGATGTCGCGAATATCTCATTTACGACCTAACCTCAAACCCAGAAGTTGCTGGAATCCGATTAAACAACACCGAATCTTCAACCTGTTCTTTTTCTTCCGGCCATCACAGCGCTGGTGTTTTGACGAGCAGTTCCGGCAATCTCCCAGGCAGTGGCCATTGCGACTTGCATGGCAAAAGCGAAATGGTAGTTGCAGTAAGCCACGGGAAACTGCAGACCTGGCGAGTTGGAGTGGCGCGAGCCGTAGGTTACACCATCCCTAAGTGGACCACAATCGGGGTGCATTCACAAATCCACAAGTTCAATCGGGGTGGGATAGCATGGGATCGAAAACATCAAGGGCAAGCATTCCACTTGCTCCTAGGACTTGTTTGGTACGGGGCACTAGTACATCGCCTAAAGGACGGCACCTACGTTCATGCTAATATTGGACAACGTCTCTGCCTGGGCAATCGGACGGAGACGTTAGTTCGACCCGACAACTCCATTGGCCTCTGACCCTTTGCAATAAATATGAGCTCGGGCTCGATGTATCGGTTTTTAGAAGCCCGAGCAGCATACCATCATGATTGCTTGGGTCGCTCATTGTGTTCCATCGCCGAGCCGTGACACGTTTGCGCGGCGTCTCTTGATATGATCGCAACAACCCAAAAAACGTCGTCGAATCGGCGGGCCTGTCCGTTCACGGAGTGCTCCGGACAAGATGCTCAATTACATGTTGCTCTATTCCCATTGCTCGCCCAACCGCACTCTTGAATCAGTTATCGCTTCTGTGGGGTTCACTGGGTGTTGTATAACTCCAACCACTTCCCCACTCCCTTTATCCGTGAACAAAAAACGACGAAAAATAAAGTCAACGTAACCTTGTATTCGTGGCAGGGGTCCGTTGTCGGAGCGATACCATGGTCTGTTATCCACTTTTTCTACTTACGCGCAGCACGTTTAGCTGTTGAAACCTGGGGAACCGGAAAGTTAGCTTTTATGTATCCATTGCATTACGTGCTCTGGGATAATACACAAGCCATTTTGTACCGGCCGAAATCTTTAGCACCTACGTCAATCATGCCCAACCACTGCTTCGCTCTGACTGGAGCTATAATTGAACTCTTCTTGATATGATATACAGCACTACGTCAATCAGCTAATATAGCCCTTGCTGTACATCGCCAGTTATGACCGCGGAAAACTGCTGAGCCGAAAGCGACCCACCAGAAGGGGGCCACGGATGAACATACTTGGGAGCTTTGCCCTCCCGGTCGTCCGATCCGACTAGAAGGCTGGCTAAGCGTGCTCCGGAAGCAAAAAGGCCAGCGGGCGATAGCAATGACCCGGTGGCCGGAGTGGGATTCACGTGTTACACGTTTAAGTCGAGTTCTTGGGTTCTGTGGACGCTTGAAGAACCTACGATGCTTCCGTTTATTTCATCGTAGTCTCCCCAACTGGCGATTGCGACATCAGAGTCAGGATCGTTTGGTACTATTGTGGGGAGACATGTATCACCTGTATGACCGTTACTGAGGTGGTCACCAAATTTTTTGGGATTCGAACCATGCAGTACTCTACTCGGATGGGCCACCGACGTGTGTATCATTTTCGACTGAGAATGCATCTTCGATGCCACGGGATTCCCCGACATCCACAACCCGCCTCATTTGGTCTAGTAGCCTTTCCTGGTGTCCTACACCGTCGCAGGGGGTCGATGGCACGCCCGATAACGGGTGGCGATCGTCAGCGCCTCAAACTTTACGTGTGTCGAAGCACCTTTGATCGGACGTGCACTGACCTGCGGGCGGTAATTCTCGCGACTTTTCCGGAGGGGCAGATGGCATGGGAGTCGAACGCC
AGGGATCGGGTTCTGTGTACGTCGTCCTGGCCCCACGTAACTATCTTCGTCCCATTCTGCGGACCGGCCGGATTTCGTGCACCGCGCAGGCGCGGATACTGAAAGTGGTTAGTGAGATTTGAATGGCATTCACGTTTTCGATCTATGGGTTCGCGAGCTTATATGCACTAAACGAAATCGAGACAGCAAAGCCATTTAAGCGAAAGGATGCGTGGTGGCTTTATGCCGGAGTTAACTCCCTCCGGTGGGAACAAATAACCGTTCTCCCGCTGCAGAATTGCCTGCTTCTTGTGTGGGAGTATCCCTTCGTACCCGAAGCGGGCGTCGGTGACGTAGGCTCTATTCCGATCTACATAGCCGAGTGTTGCCCCCACTACATACAGGTAAACATCATAGCCGGGGCAAACGGAACGGCCCTTACATTCCCGCTCCTCTCTAGCGGTGTGCGCACAGCTACCGGGCGAATTCATCGAAAGTCGTATAGCCTACTTCCCCAACAAGTTATCCTCTTGTATTGAAATCACGCACTCGCACTGGCCACGTGGTGAACTGGTGCTCGTCAGTGCACCGCCGCGGAATTCCGCGTCCCTATGACACGGTTGCGGGTATTAAGCTGGGTTCAGACTCCTGCCGCTGGAGTTCTTAAGCTGCACGTACAAGCCTCATGTCAACCGCATCACGAGTCTGCCAACCGGCTCCTCCCGGGGGTAGTTCCGCCTGTTCGTGCCCAGCTACTATTCGTTGCACGAGACCGTCCTTCTCTGGGTGCCGCCCCCATAGCATTAAGTACAGAGTTAATCCCGGTCGACCTTATAACGAAATGAGAACCACTTCTAATGGATGAGGGCGCTAGCCGACATGTTACTCACGGCAACCTTCCTGACATATATACAGGTTGGGGAGAGACGTGGCGTTTGCCAACGCAGGGGTTTGAGCTCGAGAGCCAGCACTTCGTTTGATTCGTTGAACTCTCAGCACCGGCTCGTACACGGAAAGCTCATCGAAGCTGCGCTACCAAAGGTTTTCGTGGGCCGCCCACCGGCCCATCATTGCTCTACCCTATGCTCAAACCTCGCCAGGTGGAGGTGCCCTCCAGCGATACGGTTACAGTTGTACGATCGCAAATTTCCTTATCAGGGGGCCAGGTCACCGCGATTCTACGTGTACCACGGAAGCGTTTACCTTGCTTTAATTCGGCTGTGGCTTCATGATGTGGTAATTTTGCCCTCCGCCGTGGTCATTGTTTTACCTCAGGACGGTACAAGAACAAGCGACCAGCCCTGTCCACTGTCTGGGGTTCGAAAATTTCTCCTGTTATCTACAATAATAAACAATTGACGTGAACGCGCCAATTGAACTGTGAATGTATAAAATTAGTCCACGGGACAAATCCGCAAAGCCTCCATGTACGTGCCGTCGCCGAGAACCGACGAATTGCCAATGGCGCCCCGCCAACACTAGTGTTGTCACATCCGCGCTTTATAGATGGCTCCTACCTACACTATTGGCACTCCCGATCGCAGCATGTCTCAAAGTTCGCGTCTCATCTGGCACAATATCCCGTTAATTCTTGTCAGTGTTTGCATGGTGCACGATTCCTATTGATCAACAACGCCGTACGCAATTATAGTGTATATACTGCCGGTATAAATTTACCGAACTCCTGTACTATATACTTGACCCAAGGGGGCACGTATCCCATCACCCGACCTGCGGTCAATGTTACGCTTAAATGCGCTGAGACTGTGGTGGAGAATTTCCCATATCTTGGACACCGAGTCCCTCGTGACAAGCCAACCTGTTACCTATGGGGAGTCCGCCTTCGGGGCGCGGCGATTTAG"
prot = "MQTKHEQFRAQSATAGSSSHWTTSTTFGTESLADKRGLDIPTSRFAVTPVSANGHDGFEIAHSDAPRHQGAMLPNTAVKATCLFDELVTVTVNPPRRKLIYSGYLEWMNMLEHYYETVSGPKLALYIYLPCLVTGSTSCIVNSPWSLINIGALKLRFRVGNAAAPYVKRKSLLTMSCITTHPVQKESSPPSRGECKPGAGYPGGEYLNIHPGLVVRTPGSARPLYLPWDSSSTSGVANKYKCFSASNIDDNGKLVGEAGLRTPNLTPYHMLQLRSAIPICTLDKRQTSANPSSISLPAKLPRITAAICPFREQSSSWNKPPPSSLCTLWVLWKYLMGCRENPPALSADQRWNPAHGSLVSIFTWTWGHARVDLADVVDEPPMVNGLAETMPQYSNNTTYGDNLLPTSMAYDEPTWIATLMLLTGCWVLLYLSTSPHELLLVDSLVLSDLDLSTCPEKSNDGEYPCRYMKPREKDLGTVKQSFAISLRLLMSNHWMAPSPEGPKSLRSYGLFTRNWSIYSPWLAPVYGYSTFFNIHSVELPLPSKSPLSIVSSTSLLSVCLRTTTRSGWYYDWMLSLLGWQDSFNYLFQATATSDVNELCRKWIFWSLLRCYPTEDKNGNNWSPFAINPPAKKSASPVPPSFVDWDEVVTIPPDGIYSQASFGEALGLNSAPNRIRNAIPNKAERTPTSMGSISCVDDHMAEITGQQSPLNRYEVAGWYPAETWIYVKDKVNAYICLAHLFHSHPSVSYLSGPNTYRLIRSMNSRSEILWPAYIPPVGCHRSMIQGSLPDQHAHLRTCCQLLSSAARDLVLSGLGFPVNWYLHTCSHSWIPTRKNQSMWDLGTVSGRLQVGLGCSSGRGMGSYKGQEGMNAFSREGVRRQGNPRDHAREVHRNATRSIHCSERVTGTHELGCPAYIPMLRPHDLKHGNPHRCDYEGPLSPRATLSDVRWVSCRSYELMSDVGPAGVFERSTTYIVSLATGNLCPRLENIRVGRRTLVAMDMSRVMLKRDTANGCLTRAAMTDLSSWQQGIVFRSSFLTERHNDCFLGDTRLEWQEVVALPAHFSAKNNVPLTHYASQQCYYLQPCLGTDMIAVSFTDPKGRPAPGTTSLPDDRRARSGTKFDEDANSAKPDDRMPKDRSLPKLTSLGFCVTQSPYVFFVEVSEHSWCLMSYHLWPSCSLYNMGTLAIMLPERPLAGKLARPGIKCPAVREIKFRPIGCGIFVKVPFSRCARGQFSGNFVGSRWPFGYSARGDIARKWYELDPSDGTSRFPVWLHFVMSSGSHPGKCLDPQQQFEVHFAGTFSLFVVTTASYWDWGPQCNLCISECCGQELSLCRDNVIYFTETQDMWLCAGLQGAARLMTLWEQSVAESSTVPWYRWIVRCPYKPDGFLGFADHGPYMLYYRISGGASDTNSEHRGGELMSERGVNEDYSSMFDMSSCGNDQRKMRNPGSWWYYEVMPRAGVRLLRIWCSVGLCYPSGHNSIPASSDIFSPGTVQLSWTQVLLSHSPKTNWVKFISRLSDLLAQSFCMKASSASWNGGLWADKAVTAIYRMFSADLSRGIAGLTLTGNRTMNRLDLSLIHSFGATSMYCDSRFMWFPAYVVYGAVRRWTNYETSGWTFHYASVAITCANTFVGSQGMLGFSNMSLLPLDRPVASPPLYRMYERSGSKARHTPSPFDAQNDGRHHRSLTFELFSGVGLLTWFSGLQRELDQHATDLFMMAVEPMIMLLFAYMVEFLSSGLNLSIARFAEEPTRSWIMLPASCEDDLWMRSRSSEFWSHEMLLLDASSGAYVYKAWTSRGAVSSTSFGRIWVPTTLLGMLPWSSVNHQSRTSPPFVLASMGLTAMQDWSLAYGQWHMEHWFLISNRVRRCDSSPEPGCVFRPVVGGCPMSACLEHSDLKSRLIPADNSSRGGRQSYATLHNTGNLWATRHTVVGRTIRCNALSQLFCRAFNNYVMTESSRPKHSDAKLSKGGLMSLKIIEVMRSMASSDPYAGSI
TLLFKQNTTRALMVYMLEELNFVGQSRHGSDPADLIFFCSPWLGQQRNKHITPHLTVVGCREYLIYDLTSNPEVAGIRLNNTESSTCSFSSGHHSAGVLTSSSGNLPGSGHCDLHGKSEMVVAVSHGKLQTWRVGVARAVGYTIPKWTTIGVHSQIHKFNRGGMAWDRKHQGQAFHLLLGLVWYGALVHRLKDGTYVHANIGQRLCLGNRTETLVRPDNSIGLWPFAMNMSSGSMYRFLEARAAYHHDCLGRSLCSIAEPWHVCAASLDMIATTQKTSSNRRACPFTECSGQDAQLHVALFPLLAQPHSWISYRFCGVHWVLYNSNHFPTPFIREQKTTKNKVNVTLYSWQGSVVGAMPWSVIHFFYLRAARLAVETWGTGKLAFMYPLHYVLWDNTQAILYRPKSLAPTSIMPNHCFALTGAMIELFLMWYTALRQSANMALAVHRQLWPRKTAEPKATHQKGATDEHTWELCPPGRPIRLEGWLSVLRKQKGQRAMAMTRWPEWDSRVTRLSRVLGFCGRLKNLRCFRLFHRSLPNWRLRHQSQDRLVLLWGDMYHLYDRYWGGHQIFWDSNHAVLYSDGPPTCVSFSTENASSMPRDSPTSTTRLIWSSSLSWCPTPSQGVDGTPDNGWRSSAPQTLRVSKHLWSDVHWPAGGNSRDFSGGADGMGVERQGSGSVYVVLAPRNYLRPILRTGRISCTAQARMLKVVSEIWMAFTFSIYGFASLYALNEIETAKPFKRKDAWWLYAGVNSLRWEQMTVLPLQNCLLLVWEYPFVPEAGVGDVGSIPIYMAECCPHYMQVNIMAGANGTALTFPLLSSGVRTATGRIHRKSYSLLPQQVILLYWNHALALATWWTGARQCTAAEFRVPMTRLRVLSWVQTPAAGVLKLHVQASCQPHHESANRLLPGVVPPVRAQLLFVARDRPSLGAAPMALSTELIPVDLMTKWEPLLMDEGASRHVTHGNLPDMYTGWGETWRLPTQGFELESQHFVWFVELSAPARTRKAHRSCATKGFRGPPTGPSLLYPMLKPRQVEVPSSDTVTVVRSQISLSGGQVTAILRVPRKRLPCFNSAVASWCGNFALRRGHCFTSGRYKNKRPALSTVWGSKISPVIYNNKQLTWTRQLNCECMKLVHGTNPQSLHVRAVAENRRIANGAPPTLVLSHPRFMDGSYLHYWHSRSQHVSKFASHLAQYPVNSCQCLHGARFLLINNAVRNYSVYTAGMNLPNSCTMYLTQGGTYPITRPAVNVTLKCAETVVENFPYLGHRVPRDKPTCYLWGVRLRGAAI"
# Try each genetic code table and report the first whose translation of
# ``dna`` contains ``prot``. Ids 7 and 8 are skipped: they are not defined
# genetic codes, so asking translate() for them fails instead of moving on
# to the next table.
for i in (1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15):
    if prot in translate(dna, table=i):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(i)
        break
Ejemplo n.º 48
0
#!/usr/bin/env python

from __future__ import print_function
import os

from Bio.Seq import translate

if __name__ == "__main__":
    dataset_path = os.path.join('data', 'rosalind_ptra.txt')
    # The first line holds the DNA string, the second the protein string.
    with open(dataset_path) as dataset:
        dna_string, protein_string = (
            dataset.readline().rstrip() for _ in range(2)
        )

    translation = translate(dna_string)
    # 1-based position of the protein inside the translation; 0 when
    # find() reports no match (-1).
    position = translation.find(protein_string) + 1
    print(position)
Ejemplo n.º 49
0
"""
# manually taken list from CodonTable.py (Biopython source),
# the list at http://www.bioinformatics.org/JaMBW/2/3/TranslationTables.html
# is incomplete (Last update of the Genetic Codes: Sep 26, 1996)
valid_tables = [1,2,3,4,5,6,9,10,11,12,13,14,15,16,21,22,23]
"""
# Better to obtain the list of valid table ids programmatically: these are
# the keys of Biopython's registry of unambiguous DNA codon tables.
valid_tables = list(CodonTable.unambiguous_dna_by_id)

# Ids of the tables whose translation of ``dna`` equals ``master_protein``.
used_codes = []

for t in valid_tables:
    # stop_symbol="" together with to_stop=False makes stop codons vanish
    # from the result instead of ending it; cds=False skips the
    # coding-sequence checks (start codon, length a multiple of three, ...).
    protein = translate(dna, table=t, stop_symbol="", to_stop=False, cds=False)
    if protein != master_protein:
        continue
    used_codes.append(t)

# Report the first matching table id, or None when no table matched.
print(used_codes[0] if used_codes else None)
Ejemplo n.º 50
0
def count_codons(haps):
    """Tally amino-acid variation of haplotypes against the HXB2 protease.

    Each element of ``haps`` is passed to ``align_codons`` (defined
    elsewhere), which is unpacked as ``(start, residues, freq)``. The
    nucleotide residues are spliced into the wild-type sequence and written
    to ``all.dat``, translated, aligned against the wild-type protein, and
    summarised as a mutation signature such as ``"A12B, C34D"``.

    Side effects: writes ``all.dat`` and ``degeneracy.pck``, creates and
    removes a scratch file ``tmp``, prints progress and totals, and calls
    ``plot_variation`` with the per-position residue frequencies.

    Returns:
        List of ``(signature, summed frequency)`` pairs sorted by frequency,
        highest first.

    NOTE(review): written for Python 2 (print statements, integer ``/``).
    NOTE(review): ``os`` is used below but not imported here - presumably
    it is imported at module level; confirm.
    """

    import pickle
    from Bio.Seq import translate
    from operator import itemgetter
    from pythonlib import Alignment
    from pythonlib import mystats

    latex = False  # print latex table
    # count[p] maps residue -> accumulated frequency at 1-based position p.
    count = [{} for i in range(102)]
    # NOTE(review): this handle is never closed explicitly; ``oh`` is
    # rebound to the pickle file further down.
    oh = open('all.dat', 'w')
    hap_freq = {}  # mutation signature -> summed frequency
    degeneracy = {}  # mutation signature -> number of haplotypes with it
    mask_mupos = []  #[10, 11, 22, 25, 32, 46, 58, 62, 67, 74, 89]
    mupos = []  # mutated positions (repeats kept) that are not masked
    # These sequences are HXB2 proteases
    wt_protease = 'PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'
    wt_protease_nt = 'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTA\
TTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTA\
TAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT'

    ac_res = map(align_codons, haps)

    protease = wt_protease
    for ar in ac_res:
        start, residues, freq = ar  # start here is human (from 1)
        start -= 1  # start here is pythonic (from 0)
        # NOTE(review): if align_codons can return start=None, the
        # subtraction above raises TypeError before this guard is reached.
        if start == None and residues == None: continue

        # Wild-type nucleotides with this read's residues substituted in.
        oh.write('%d %s\n' % (round(freq), wt_protease_nt[:start] + residues +
                              wt_protease_nt[len(residues) + start:]))

        # Trim leading bases so that ``read`` begins on a codon boundary.
        if start % 3 == 0:
            read = residues
        elif start % 3 == 1:
            read = residues[2:]
        elif start % 3 == 2:
            read = residues[1:]
        try:
            aa = translate(read)  # Biopython
        except:
            print 'error: read', read
            continue

        # 1-based amino-acid position at which the translated read starts.
        if start % 3 == 0:
            start_a = start / 3 + 1
        if start % 3:
            start_a = start / 3 + 2

        stop_a = len(aa) + start_a + 1

        # Wild-type protein with the translated read substituted in.
        this_hap = str(protease[:start_a - 1] + aa + protease[stop_a - 2:])

        print this_hap.ljust(100), str(freq).ljust(
            8
        )  # this is used for resistance prediction, whole haplotype and reads
        for i, c in enumerate(this_hap):
            count[i + 1][c] = count[i + 1].get(c, 0) + freq
        # Align the haplotype against the wild type through the scratch
        # file 'tmp', then read the alignment back.
        Alignment.needle_align('asis:%s' % wt_protease, 'asis:%s ' % this_hap,
                               'tmp', 10.0, 0.5)
        d = Alignment.alignfile2dict(['tmp'], 'n', 10.0, 0.5,
                                     Verbose=False)['asis']['asis']
        os.remove('tmp')

        mutations = []

        # Record every ungapped alignment column whose residues differ.
        for i, c in enumerate(zip(d.seq_a, d.seq_b)):
            pos = i + 1
            if '-' in c:
                continue
            if c[0] != c[1]:
                mutations.append(c[0] + str(pos) + c[1])
                if pos not in mask_mupos: mupos.append(pos)
        signature = ', '.join(mutations)
        hap_freq[signature] = hap_freq.get(signature, 0.0) + freq
        degeneracy[signature] = degeneracy.get(signature, 0) + 1
    print ''
    for k, v in hap_freq.items():
        print str(v).ljust(15), ' ', k
    mupos = sorted(mupos)
    spos = {}
    for i, j in enumerate(mupos):
        spos[j] = i

    hf_sorted = sorted(hap_freq.items(), key=itemgetter(1), reverse=True)
    tot_reads = sum([h[1] for h in haps])
    tot_hap = sum(hap_freq.values())

    print 'Tot reads after', tot_reads
    print 'Tot', tot_hap
    # NOTE(review): mystats.Simpson presumably returns an (index, error)
    # pair, since its result fills two %f fields - confirm.
    print 'Simpson\'s index on amino acid sequences = %f +/- %f' % mystats.Simpson(
        hap_freq.values())
    oh = open('degeneracy.pck', 'w')
    pickle.dump(degeneracy, oh)
    oh.close()

    # Normalise every position's counts by that position's total; empty
    # positions have no keys, so no division happens for them.
    for c in count:
        ts = sum(c.values())
        for k in c.keys():
            c[k] /= ts
    plot_variation(count)
    if not latex:
        return hf_sorted
    # Only reached when ``latex`` is switched on above.
    print ''
    print '|c' * (1 + len(spos))
    for i in mupos:
        print '%s%d & ' % (wt_protease[i - 1], i),
    print ''

    return hf_sorted
Ejemplo n.º 51
0
from Bio.Seq import translate

def read_strings(fname):
	"""Return all lines of the text file *fname* as a list of strings.

	Each element keeps its trailing newline, exactly as ``readlines()``
	produces it; callers strip it themselves.  The file is opened inside a
	``with`` block so the handle is closed before the function returns,
	including when reading raises.
	"""
	with open(fname, 'r') as f:
		return f.readlines()

	
if __name__ == '__main__':
	# First line of the input file is the DNA, second line the protein.
	lines = read_strings('ptra.txt')
	coding_dna = lines[0].replace('\n', '')
	result_protein = lines[1].replace('\n', '')

	protein = translate(coding_dna)
	# NOTE(review): str.find returns -1 when the protein is absent, which
	# prints 3 here (-1 % 3 + 1) -- confirm that is intended.
	hit = protein.find(result_protein)
	print(hit % 3 + 1)
	print(result_protein)
	print(protein)
Ejemplo n.º 52
0
from Bio.Seq import translate
# Input file: line 1 is the coding DNA, line 2 the protein it should encode.
with open("rosalind_ptra.txt", "r") as f:
    seq = f.readline().replace("\n", "")
    res = f.readline().replace("\n", "")

# Report the first genetic-code table whose translation of the DNA equals the
# protein followed by a stop symbol.
for i in range(1, 16):
    # Table ids 7 and 8 are not defined, and requesting them raises instead
    # of returning a translation, which would abort the search before ids
    # 9-15 are ever tried.
    if i in (7, 8):
        continue
    if translate(seq, table=i) == res + "*":
        print(i)
        break
Ejemplo n.º 53
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem from the Armory problem area, 
which focuses on using prebuilt bioinformatics packages, in this case BioPython.

Problem Title: Finding Genes with ORFs
Rosalind Armory ID: ORFR
Rosalind Armory #: 015
URL: http://rosalind.info/problems/orfr/
'''

from Bio.Alphabet import IUPAC
from Bio.Seq import Seq, translate
from re import finditer

with open('data/armory/rosalind_orfr.txt') as input_data:
	dna = Seq(input_data.read().strip(),IUPAC.unambiguous_dna)

# Both strands are searched.  The reverse complement is computed once here
# rather than once per ATG hit inside the comprehension below.
strands = (dna, dna.reverse_complement())

# Translate from every ATG on the forward strand, then from every ATG on the
# reverse complement; each translation stops at the first stop codon.
ORFs = [
	translate(strand[hit.start():], table = 1, stop_symbol = '', to_stop= True)
	for strand in strands
	for hit in finditer('ATG', str(strand))
]

# Find the longest ORF (the earliest one wins on ties).
longest_orf = max(map(str, ORFs), key=len)

# Print and save the answer.
print(longest_orf)
with open('output/armory/Armory_015_ORFR.txt', 'w') as output_data:
	output_data.write(longest_orf)
 def frame1(self, seq, translation_table=1):
     """Return *seq* translated from its first base with the given table."""
     protein = translate(seq, table=translation_table)
     return protein
Ejemplo n.º 55
0
# Accumulators, all starting empty.
codonStatus = []
codonStatusMN = []
codonStatusMS = []
snpStatus = []
nsList = []
majorCount = []
snpPosition = []

# Walk the alignment three columns (one codon) at a time.
for i in range(0, aln.get_alignment_length(), 3):
    # Progress report on stderr.  i advances in steps of 3, so this fires
    # whenever i is a multiple of 3000.
    if i % 1000 == 0:
        print >> sys.stderr, i

    # Three-column slice across every row of the alignment.
    codons = aln[:, i:i + 3]
    # Distinct codon strings seen in this slice (order is whatever set() yields).
    codonSet = list(set([str(codon.seq) for codon in codons]))
    codonList.append(codonSet)
    aaList.append([translate(codon) for codon in codonSet])
    codonCount.append(len(codonSet))
    # One NSDict lookup per row, keyed by that row's codon string.
    nsList.append(np.array([NSDict[str(codon.seq)] for codon in codons]))

    # Conserved codons: every row carries the same codon, so the codon and
    # each of its three positions are marked "C".
    if len(codonSet) == 1:
        codonStatus.append("C")
        snpStatus.extend("CCC")

    # Multiple codons have NS status determined by all pairs within a single mutation of each other
    # NOTE(review): this tests > 2, so a slice with exactly two distinct
    # codons is handled by neither branch visible here -- confirm against the
    # rest of the loop, which is cut off in this snippet.
    if len(codonSet) > 2:
        # For each codon in the set, calculate the distance to each other codon and determine NS status if 1 snp
        changeSet = list()
        for j in range(0, len(codonSet)):
            aa = translate(codonSet[j])
            for k in range(j, len(codonSet)):
Ejemplo n.º 56
0
def which_table(inp, out):
    """Print every genetic-code table id under which *inp* translates to *out*.

    *inp* is translated up to its first stop codon with each candidate
    table; a line ``Match: <id>`` is printed for every table whose
    translation equals *out*.  Nothing is returned.
    """
    # Candidate ids are 1-6, 9-16 and 21-23.  The ranges are materialised as
    # lists because range objects cannot be concatenated with + on Python 3.
    table_ids = list(range(1, 7)) + list(range(9, 17)) + list(range(21, 24))
    for i in table_ids:
        trans = translate(inp, table=i, to_stop=True)
        if out == trans:
            print('Match: {0:d}'.format(i))
Ejemplo n.º 57
0
def codonChange(codon):
    """Label each codon from ``nearCodons(codon)`` relative to *codon*.

    Returns a list with one entry per neighbouring codon, in the order
    ``nearCodons`` supplies them: "S" when the neighbour translates to the
    same amino acid as *codon*, "N" when it does not.
    """
    reference_aa = translate(codon)
    labels = []
    for neighbour in nearCodons(codon):
        labels.append("S" if translate(neighbour) == reference_aa else "N")
    return labels
Ejemplo n.º 58
0
def six_frame_translations(seq, genetic_code=1):
    """Formatted string showing the 6 frame translations and GC content.

    nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    The report starts with a header (letter counts, an abbreviated copy of
    the sequence, its length and GC percentage) followed by one section per
    60 nucleotides.  Each section lists the three forward-frame
    translations, the sequence with its GC percentage, the complementary
    strand, and the three reverse-frame translations.

    Arguments:
     - seq - the nucleotide sequence to display.
     - genetic_code - translation table passed to ``translate`` (default 1).

    Returns the assembled report.

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5 
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
      G  H  C  N  G  P  L
     W  P  L  *  W  A  A
    M  A  I  V  M  G  R  *
    auggccauuguaaugggccgcuga   54 %
    uaccgguaacauuacccggcgacu
    A  M  T  I  P  R  Q 
     H  G  N  Y  H  A  A  S
      P  W  Q  L  P  G  S
    <BLANKLINE>
    <BLANKLINE>

    """
    from Bio.Seq import reverse_complement, translate
    anti = reverse_complement(seq)
    # Reversing the reverse complement gives the complementary strand in the
    # same left-to-right orientation as seq.
    comp = anti[::-1]
    length = len(seq)
    # Keys 1..3 hold the forward frames, -1..-3 the reverse-strand frames.
    frames = {}
    for i in range(0, 3):
        # Longest whole-codon stretch available when starting at offset i.
        fragment_length = 3 * ((length - i) // 3)
        frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code)
        # Reverse-strand translations are flipped so they read left to right
        # alongside the forward sequence.
        frames[-(i + 1)] = translate(anti[i:i + fragment_length],
                                     genetic_code)[::-1]

    # create header
    if length > 20:
        short = '%s ... %s' % (seq[:10], seq[-10:])
    else:
        short = seq
    header = 'GC_Frame: '
    # Only upper-case A/T/G/C are counted; the labels are printed lower-case.
    for nt in ['a', 't', 'g', 'c']:
        header += '%s:%d ' % (nt, seq.count(nt.upper()))

    header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),
                                                           length, GC(seq))
    res = header

    # One section per 60 nucleotides, i.e. 20 amino acids per frame line.
    for i in range(0, length, 60):
        subseq = seq[i:i + 60]
        csubseq = comp[i:i + 60]
        # Index of the first amino acid shown in this section.
        p = i // 3
        # Section label: 1-based nucleotide / amino-acid position.  The
        # integer index p is reused so no float reaches the %d format.
        res += '%d/%d\n' % (i + 1, p + 1)
        res += '  ' + '  '.join(frames[3][p:p + 20]) + '\n'
        res += ' ' + '  '.join(frames[2][p:p + 20]) + '\n'
        res += '  '.join(frames[1][p:p + 20]) + '\n'
        # seq
        res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
        res += csubseq.lower() + '\n'
        # - frames
        res += '  '.join(frames[-2][p:p + 20]) + ' \n'
        res += ' ' + '  '.join(frames[-1][p:p + 20]) + '\n'
        res += '  ' + '  '.join(frames[-3][p:p + 20]) + '\n\n'
    return res
 def frame(self, seq, frame, translation_table=1):
     """Translate DNA sequence in a chosen frame.

     A negative *frame* selects the reverse complement of *seq*; the
     magnitude gives the 1-based position of the first base translated
     (so 1 or -1 starts at the first base of the chosen strand).

     Raises ValueError when *frame* is 0: slicing would then start at
     index -1 and only the final base would be translated.
     """
     if frame == 0:
         raise ValueError("frame must be a non-zero integer, e.g. 1, 2, 3, -1, -2 or -3")
     if frame < 0:
         seq = reverse_complement(seq)
     # Drop the leading bases that precede the requested frame.
     seq = seq[(abs(frame) - 1) :]
     return translate(seq, table=translation_table)
Ejemplo n.º 60
0
 def frame1(self, seq, translation_table=1):
     """Translate *seq* starting at its first base using *translation_table*."""
     translated = translate(seq, table=translation_table)
     return translated