Esempio n. 1
0
def ReadingFrameFinder(DNASTRING):
    """Translate every AUG-initiated reading frame found in a DNA string.

    The input has trailing newlines removed and every "T" replaced by "U".
    For each position holding "AUG", the sequence is scanned codon by codon
    (step 3) until the first "UAA", "UAG" or "UGA"; the span from the start
    codon through that stop codon is kept as a candidate.  Starts with no
    in-frame stop codon are dropped.

    Each candidate is translated and the protein string is appended to
    ``FinalizedProt``, a name this function does not define itself.
    The visible code returns nothing.
    """
    rna = DNASTRING.rstrip("\n").replace("T", 'U')
    rna_length = len(rna)
    stop_codons = ("UAA", "UAG", "UGA")

    candidates = []
    for start in xrange(0, rna_length):
        if rna[start:start + 3] != "AUG":
            continue
        # Walk forward in-frame from the start codon to the first stop.
        for pos in xrange(start, rna_length, 3):
            if rna[pos:pos + 3] in stop_codons:
                candidates.append(rna[start:pos + 3])
                break

    for candidate in candidates:
        # Trim any trailing partial codon before translating.
        leftover = len(candidate) % 3
        if leftover:
            candidate = candidate[:-leftover]
        FinalizedProt.append(str(Seq(candidate, generic_rna).translate()))
Esempio n. 2
0
def itercodon(seq, frame, offset, table, reverse=False):
    """Yield ``(index, amino_acid)`` pairs for successive codons of ``seq``.

    :param seq: object exposing the nucleotides as ``seq.seq`` and supporting
        ``len()`` and slicing (presumably a SeqRecord -- TODO confirm).
    :param frame: index of the first codon in the forward direction.
    :param offset: number of bases excluded from the main codon walk; they
        are translated separately after padding with ``3 - offset`` "N"s
        (assumes 0 < offset < 3 whenever padding happens -- TODO confirm).
    :param table: codon table passed straight to ``Seq.translate``.
    :param reverse: when true, walk from the end of the sequence and translate
        the reverse complement of each codon.

    NOTE(review): both trailing ``yield`` statements reuse the last value of
    ``i`` from the preceding loop, so the padded codon is reported at the
    same index as the last full codon; if a loop runs zero times, ``i`` is
    unbound when it is read afterwards.
    """
    # NOTE(review): `stop` is assigned but never read in this function.
    stop = 0
    if not reverse:
        # Forward strand: full codons starting at `frame`.
        for i in xrange(frame, len(seq) - offset, 3):
            subseq = str(seq.seq)[i:i + 3]
            assert (len(subseq) % 3 == 0), (str(seq))
            aa = Seq.translate(subseq, table)
            yield i, aa
        # Bases remain after the last full codon: pad them with N and translate.
        if i + 3 != len(seq):
            # NOTE(review): this slices `seq` itself, not `str(seq.seq)` as
            # the loop above does -- confirm that the result is what
            # Seq.translate expects.
            subseq = seq[i + 3:] + "N" * (3 - offset)
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
    else:
        # Reverse strand: step backwards from the end, 3 bases at a time,
        # stopping before index `offset`.
        for i in xrange(len(seq), offset, -3):
            # Translate the reverse complement of the codon ending at i.
            subseq = Seq.reverse_complement(str(seq.seq)[i - 3:i])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
        # The first `offset` bases are left-padded with N, then
        # reverse-complemented and translated.
        if offset:
            subseq = Seq.reverse_complement("N" * (3 - offset) +
                                            str(seq.seq)[:offset])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
Esempio n. 3
0
def aa_table(codonfile, outfile):
    """Tabulate amino-acid counts per region and write them to an Excel file.

    ``codonfile`` is read as a tab-separated table indexed by its ``hxb2``
    column; its ``codon`` and ``count`` columns are used.  For every region in
    the module-level ``ranges`` mapping, a zero-filled table is built with
    columns ``region``, ``position``, ``coverage`` plus the module-level
    ``aa_header`` columns.  Each codon count is added to:

    * ``del`` when the codon is empty,
    * ``ins`` and the amino acid of its first three bases when the codon is
      longer than three bases,
    * the amino acid of the codon otherwise.

    ``coverage`` is the row sum over the ``aa_header`` columns.  All region
    tables are concatenated and written to ``outfile`` with index label
    ``hxb2``.
    """
    codons = pd.read_csv(codonfile, sep="\t", index_col="hxb2").fillna("")
    subtables = []

    for region in ranges:
        columns = ["region", "position", "coverage"] + aa_header
        subtable = pd.DataFrame(columns=columns, index=ranges[region]).fillna(0)
        subtable["region"] = region
        subtable["position"] = np.arange(1, len(subtable) + 1)

        for hxb2 in subtable.index:
            if hxb2 not in codons.index:
                continue
            rows = codons.loc[[hxb2]]
            for codon, count in zip(rows["codon"], rows["count"]):
                if codon == "":
                    target = "del"
                else:
                    if len(codon) > 3:
                        subtable.loc[hxb2, "ins"] += count
                    # Only the first three bases decide the amino acid.
                    target = str(Seq.translate(codon[:3]))
                subtable.loc[hxb2, target] += count

        subtable["coverage"] = subtable[aa_header].sum(axis=1)
        subtables.append(subtable)

    pd.concat(subtables).to_excel(outfile, index_label="hxb2")
Esempio n. 4
0
def translate(filename, out=sys.stdout, log=sys.stderr):
    """
    Translate nucleotide sequences in FASTA file `filename` to all six possible
    frames.

    Write amino acid sequences to FASTA file `out`, with the frame number
    appended to the sequence header (reverse-complement frames carry a
    trailing apostrophe).

    Sequences containing 'N' are skipped and counted.

    Log summary statistics to file `log`: "nreads" is the number of records
    read (including skipped ones) and "nskipped (N)" the number skipped.
    """

    # Counted explicitly: the enumerate index is one less than the number of
    # records and is undefined when the file holds no records at all.
    nreads = 0
    nskipped = 0

    for record in SeqIO.parse(filename, "fasta"):
        nreads += 1

        if 'N' in str(record.seq):
            nskipped += 1
            continue

        strands = (
            ("", str(record.seq)),
            ("'", str(record.seq.reverse_complement())),
        )
        for suffix, seq in strands:
            for i in range(3):
                # Trim the frame to a whole number of codons.
                j = 3 * ((len(seq) - i) // 3) + i
                print(">%s-%d%s" % (record.id, i, suffix), file=out)
                print(Seq.translate(seq[i:j]), file=out)

    print("nreads", nreads, file=log)
    print("nskipped (N)", nskipped, file=log)
Esempio n. 5
0
 def test_stops(self):
     """Check how ``self.misc_stops`` translates under several codon tables.

     The same expectations are asserted for the plain string and for Seq
     objects built with no alphabet and with three different alphabets.
     """
     expected_by_table = [
         (1, "***RR"),
         ("SGC0", "***RR"),
         (2, "**W**"),
         ("Yeast Mitochondrial", "**WRR"),
         (5, "**WSS"),
         (9, "**WSS"),
         ("Euplotid Nuclear", "**CRR"),
         (11, "***RR"),
         ("Bacterial", "***RR"),
     ]
     inputs = [
         self.misc_stops,
         Seq.Seq(self.misc_stops),
         Seq.Seq(self.misc_stops, Alphabet.generic_nucleotide),
         Seq.Seq(self.misc_stops, Alphabet.DNAAlphabet()),
         Seq.Seq(self.misc_stops, IUPAC.unambiguous_dna),
     ]
     for nucleotide_seq in inputs:
         # Default table first, then every explicitly named table.
         self.assertEqual("***RR", str(Seq.translate(nucleotide_seq)))
         for table, expected in expected_by_table:
             self.assertEqual(
                 expected, str(Seq.translate(nucleotide_seq, table=table)))
Esempio n. 6
0
 def add_translations(self):
     '''
     Translate each node's nucleotide sequence into the proteins listed in
     self.proteins (expected to be SeqFeatures) and record amino-acid
     differences relative to the parent node.

     Nodes are visited in preorder.  Gap characters ('-') are replaced by
     'N' before translation.  The root (node.up is None) gets empty mutation
     lists; every other node gets (parent_aa, position, node_aa) tuples for
     the positions where its translation differs from its parent's.
     Finally 'translations' is appended to self.dump_attr.
     '''
     from Bio import Seq
     for node in self.tree.find_clades(order='preorder'):
         if not hasattr(node, "translations"):
             node.translations = {}
             node.aa_mutations = {}
         parent = node.up
         for prot in self.proteins:
             nucleotides = str(self.proteins[prot].extract(
                 Seq.Seq("".join(node.sequence))))
             node.translations[prot] = Seq.translate(
                 nucleotides.replace('-', 'N'))
             if parent is None:
                 node.aa_mutations[prot] = []
                 continue
             paired = zip(parent.translations[prot], node.translations[prot])
             node.aa_mutations[prot] = [
                 (a, pos, d) for pos, (a, d) in enumerate(paired) if a != d
             ]
     self.dump_attr.append('translations')
Esempio n. 7
0
def assign_fitness(nodes):
	'''
	Translate the sequence of every node and store its tolerance-based fitness.

	nodes is either a list of dict-like items (sequence under 'seq', result
	stored under 'tol') or a dendropy.Tree (sequence in node.seq, result
	stored in node.tol, nodes visited in postorder).  Any other type is
	silently ignored.

	The amino-acid probabilities from load_mutational_tolerance() are reduced
	to the columns of the 'source-data/H1_H3.fasta' alignment in which no
	sequence has a gap, and an extra column of 1e-5 is appended for
	non-canonical amino acids.
	'''
	# sites and wt_aa are returned by the loader but not used here
	aa, sites, wt_aa, aa_prob = load_mutational_tolerance()
	aln = AlignIO.read('source-data/H1_H3.fasta', 'fasta')
	# True only for alignment columns in which no sequence has a gap
	aligned = (np.array(aln)!='-').min(axis=0)
	# per sequence: ungapped sequence position of every gap-free alignment column
	indices = {}
	for seq in aln:
		residue_mask = np.fromstring(str(seq.seq), dtype='S1')!='-'
		indices[seq.name] = (np.cumsum(residue_mask)-1)[aligned]

	# keep only the probabilities at aligned H1 positions
	aa_prob = aa_prob[indices['H1'],:]
	# extra column for non-canonical amino acids
	pseudo_column = 1e-5*np.ones((aa_prob.shape[0],1))
	aa_prob = np.hstack((aa_prob, pseudo_column))

	def tolerance_of(nucleotides):
		return calc_fitness_tolerance(Seq.translate(nucleotides),
									aa_prob, aa, indices['H3'])

	if isinstance(nodes, list):
		for node in nodes:
			node['tol'] = tolerance_of(node['seq'])
	elif isinstance(nodes, dendropy.Tree):
		for node in nodes.postorder_node_iter():
			node.tol = tolerance_of(node.seq)
Esempio n. 8
0
    def test_translation_on_proteins(self):
        """Translating a protein must raise TranslationError (function and method)."""
        for protein in protein_seqs:
            # Module-level function.
            self.assertRaises(TranslationError, Seq.translate, protein)
            # Method on the sequence object itself.
            self.assertRaises(TranslationError, protein.translate)
Esempio n. 9
0
    def test_translation_on_proteins(self):
        """Translating a protein must raise ValueError.

        The method form is only checked for Seq.Seq instances.
        """
        for protein in protein_seqs:
            self.assertRaises(ValueError, Seq.translate, protein)

            if not isinstance(protein, Seq.Seq):
                continue
            self.assertRaises(ValueError, protein.translate)
Esempio n. 10
0
    def test_translation_to_stop(self):
        """Check that to_stop=True equals the full translation cut at the first '*'.

        Each test sequence is trimmed to whole codons; sequences containing
        "X" are skipped.  A fixed sequence is also checked against table 2.
        """
        for nucleotide_seq in self.test_seqs:
            whole_codons = 3 * (len(nucleotide_seq) // 3)
            nucleotide_seq = nucleotide_seq[:whole_codons]
            if "X" in nucleotide_seq:
                continue
            short = Seq.translate(nucleotide_seq, to_stop=True)
            full = Seq.translate(nucleotide_seq)
            self.assertEqual(short, full.split("*")[0])

        seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
        up_to_stop = Seq.translate(seq, table=2, to_stop=True)
        self.assertEqual("VAIVMGRWKGAR", up_to_stop)
Esempio n. 11
0
    def test_translation_to_stop(self):
        """Check that to_stop=True equals the full translation cut at the first '*'.

        Only Seq.Seq inputs without 'X' are compared, after trimming to whole
        codons.  A fixed sequence is also checked against table 2.
        """
        for nucleotide_seq in self.test_seqs:
            whole_codons = 3 * (len(nucleotide_seq) // 3)
            nucleotide_seq = nucleotide_seq[:whole_codons]
            if not isinstance(nucleotide_seq, Seq.Seq) or 'X' in str(nucleotide_seq):
                continue
            short = Seq.translate(nucleotide_seq, to_stop=True)
            full = Seq.translate(nucleotide_seq)
            self.assertEqual(str(short), str(full.split('*')[0]))

        seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
        up_to_stop = Seq.translate(seq, table=2, to_stop=True)
        self.assertEqual("VAIVMGRWKGAR", up_to_stop)
Esempio n. 12
0
    def test_translation_on_proteins(self):
        """Translating a protein must raise ValueError.

        The method form is only checked for Seq.Seq instances.
        """
        for protein in protein_seqs:
            self.assertRaises(ValueError, Seq.translate, protein)

            if not isinstance(protein, Seq.Seq):
                continue
            self.assertRaises(ValueError, protein.translate)
Esempio n. 13
0
def calc_total_subst(start_codon, end_codon):
    """
    Returns the synonymous and nonsynonymous share of the single-base steps
    needed to turn start_codon into end_codon.

    Every ordering of the differing positions is one pathway; each pathway
    starts from start_codon and mutates one position per step.  Steps whose
    amino acid is unchanged count as synonymous, the others as nonsynonymous.
    Counts are summed over all pathways and divided by the total number of
    steps taken.

    One line per pathway (both codons and the position order) is printed to
    standard output.

    :param Bio.Seq.Seq start_codon:  3bp codon
    :param Bio.Seq.Seq end_codon:  3bp codon
    :return tuple (float, float):  (synonymous steps / total steps, nonsynonymous steps / total steps); (0.0, 0.0) when the codons are identical
    :raises ValueError: if a pathway does not end at end_codon
    """
    total_syn = 0.0
    total_nonsyn = 0.0
    total_subs = 0.0

    upper_start_codon = start_codon.upper()
    upper_end_codon = end_codon.upper()

    # find positions where the codons differ
    diff_pos = [pos for pos, nucstr1 in enumerate(str(upper_start_codon))
                if nucstr1 != str(upper_end_codon[pos])]

    # Traverse all possible pathways from start_codon to end_codon where
    # each stage of a pathway mutates by 1 base.
    start_aa = Seq.translate(upper_start_codon)
    for pathway in itertools.permutations(diff_pos):
        print(str(upper_start_codon) + " " + str(upper_end_codon) + " " + ",".join([str(x) for x in pathway]))

        # Every pathway must restart from the start codon; carrying over the
        # state of the previous pathway would begin at the end codon and
        # count all of its steps as synonymous.
        last_codon = upper_start_codon
        last_aa = start_aa
        for mut_pos in pathway:
            mut_nuc = upper_end_codon[mut_pos]
            mut_codon = last_codon[:mut_pos] + mut_nuc + last_codon[mut_pos+1:]
            mut_aa = Seq.translate(mut_codon)

            total_subs += 1
            if str(last_aa) == str(mut_aa):
                total_syn += 1
            else:
                total_nonsyn += 1

            last_codon = mut_codon
            last_aa = mut_aa

        if str(last_codon) != str(upper_end_codon):
            raise ValueError("Pathway does not yield end codon " + str(last_codon))

    if total_subs:
        ave_syn = total_syn/total_subs
        ave_nonsyn = total_nonsyn/total_subs
    else:
        ave_syn = 0.0
        ave_nonsyn = 0.0
    return ave_syn, ave_nonsyn
def ex4():
    """Translate the histone and bZIP sequences and run all pairwise comparisons.

    Both sequence sets from read_sequences() are translated up to the first
    stop codon, then compute_all_with_all is called with compute_pair_ex4
    for histones alone, bZIPs alone, and histones against bZIPs.
    """
    def to_proteins(dna_seqs):
        return [Seq.translate(dna, to_stop=True) for dna in dna_seqs]

    histone_dna, bzip_dna = read_sequences()
    histones = to_proteins(histone_dna)
    bzips = to_proteins(bzip_dna)

    print("histones:")
    compute_all_with_all(histones, function=compute_pair_ex4)
    print("bzips:")
    compute_all_with_all(bzips, function=compute_pair_ex4)
    print("bzips x histones")
    compute_all_with_all(histones, bzips, function=compute_pair_ex4)
Esempio n. 15
0
    def get_syn_mutations(self, region, mask_constrained=True):
        """Return a boolean array marking synonymous single-nucleotide changes.

        For a region annotated as 'gene' or 'protein', the result has shape
        ``aft.shape[1:]`` where ``aft`` is the region's allele frequency
        trajectories.  Entry ``[ni, pos]`` is True when replacing the
        nucleotide at ``pos`` of the initial sequence with ``alpha[ni]``
        leaves the codon's translation unchanged AND the consensus amino acid
        of that codon is the same across the samples considered valid.

        :param region: key into ``self.annotation``.
        :param mask_constrained: when true, positions returned by
            ``self.get_constrained(region)`` are forced to False.
        :return: the boolean array, or None when the region is not a gene or
            protein (a message is printed).  If anything in the computation
            raises, an ipdb debugger session is opened instead and the
            function then returns None implicitly.
        """

        if region in self.annotation and self.annotation[region].type in [
                'gene', 'protein'
        ]:
            try:
                aft = self.get_allele_frequency_trajectories(region)
                # A 0-dimensional mask means no per-element mask: treat every
                # (sample, position) as valid.  Otherwise a position is valid
                # for a sample only when none of its entries is masked.
                if len(aft.mask.shape) == 0:
                    aft_valid = np.ones((aft.shape[0], aft.shape[-1]),
                                        dtype=bool)
                else:
                    aft_valid = ~np.array([af.mask.sum(axis=0) for af in aft],
                                          dtype=bool)
                gaps = self.get_gaps_by_codon(region)
                initial_seq = self.get_initial_sequence(region)
                # Consensus sequence per sample, with gap positions set to 'N'.
                consensi = []
                for af in aft:
                    tmp = consensus(af)
                    tmp[gaps] = 'N'
                    consensi.append(tmp)

                # Translated consensus per sample, one byte per amino acid.
                cons_aa = np.array([
                    np.fromstring(Seq.translate(''.join(cons.astype('U'))),
                                  dtype='S1') for cons in consensi
                ])
                # Per codon: True when all valid samples share one consensus
                # amino acid; validity is read at the first position of each
                # codon.  Repeated 3x to index by nucleotide position.
                no_substitution = np.repeat(
                    np.array([
                        len(np.unique(col[ind])) == 1
                        for ind, col in zip(aft_valid.T[::3], cons_aa.T)
                    ],
                             dtype=bool), 3)

                syn_muts = np.zeros(aft.shape[1:], dtype=bool)
                for pos in range(aft.shape[-1]):
                    ci = pos // 3  # codon index
                    rf = pos % 3  # position within the codon
                    codon = ''.join(initial_seq[ci * 3:(ci + 1) *
                                                3].astype("U"))
                    # Try each of the first four symbols of `alpha` at this
                    # position (presumably A, C, G, T -- TODO confirm).
                    for ni, nuc in enumerate(alpha[:4].astype("U")):
                        mod_codon = codon[:rf] + nuc + codon[rf + 1:]
                        try:
                            syn_muts[ni,pos] = (Seq.translate(codon)==Seq.translate(mod_codon))\
                                                *no_substitution[pos]
                        except:
                            # Any failure for this substitution marks it as
                            # non-synonymous.
                            syn_muts[ni, pos] = False
                if mask_constrained:
                    syn_muts[:, self.get_constrained(region)] = False
                return syn_muts
            except:
                # NOTE(review): debugging hook -- any exception opens an
                # interactive ipdb session instead of propagating.
                import ipdb
                ipdb.set_trace()
        else:
            print(region, "is not a valid protein or gene")
            return None
Esempio n. 16
0
    def test_translation_to_stop(self):
        """Check that to_stop=True equals the full translation cut at the first '*'.

        Only Seq.Seq inputs without 'X' are compared, after trimming to whole
        codons.  A fixed sequence is also checked against table 2.
        """
        for nucleotide_seq in self.test_seqs:
            whole_codons = 3 * (len(nucleotide_seq) // 3)
            nucleotide_seq = nucleotide_seq[:whole_codons]
            if not isinstance(nucleotide_seq, Seq.Seq):
                continue
            if 'X' in str(nucleotide_seq):
                continue
            short = Seq.translate(nucleotide_seq, to_stop=True)
            full = Seq.translate(nucleotide_seq)
            self.assertEqual(str(short), str(full.split('*')[0]))

        seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
        up_to_stop = Seq.translate(seq, table=2, to_stop=True)
        self.assertEqual("VAIVMGRWKGAR", up_to_stop)
def main() -> None:
    """Collect the open reading frames of the first FASTA record.

    The first sequence in ``args.file`` has every 'T' replaced by 'U'.  It and
    its reverse complement are each translated in three frames (truncated via
    ``truncate(..., 3)``), and every ORF reported by ``find_orfs`` for a
    non-empty translation is added to a set.  Nothing happens when the file
    yields no records.  The visible code does not use the set further.
    """

    args = get_args()
    seqs = [str(rec.seq) for rec in SeqIO.parse(args.file, 'fasta')]
    if not seqs:
        return

    rna = seqs[0].replace('T', 'U')
    orfs = set()

    for strand in (rna, Seq.reverse_complement(rna)):
        for frame in range(3):
            prot = Seq.translate(truncate(strand[frame:], 3), to_stop=False)
            if prot:
                orfs.update(find_orfs(prot))
Esempio n. 18
0
def translateSeq(cds):
    """Translate a coding sequence, falling back to its reverse complement.

    The sequence is first translated as a complete CDS (``cds=True``).  If
    that raises TranslationError, the reverse complement is tried the same
    way; if that fails too, a message is printed.

    The exception and print syntax used here is valid on Python 2.6+ and
    Python 3.

    NOTE(review): the visible code returns nothing -- ``translated``,
    ``finalCDS`` and ``senseOrAnti`` are computed but not handed back.
    """
    senseOrAnti = 'sense'
    finalCDS = cds
    try:
        translated = Seq.translate(cds, cds=True)
    except TranslationError:
        try:
            reverseCDS = Seq.reverse_complement(cds)
            translated = Seq.translate(reverseCDS, cds=True)
            finalCDS = reverseCDS
            senseOrAnti = 'anti'
        except TranslationError:
            print('Translation failed in %s' % cds)
Esempio n. 19
0
def parseFeatureBed(bedFile,regionSeqs):
    """Parse a BED file into Feature objects and check that each CDS translates.

    Every non-blank line becomes a ``Feature``; its CDS is taken from the
    matching entry of ``regionSeqs`` (keyed by ``feature.region``) and stored
    in ``feature.cds``.  The CDS is translated as a complete coding sequence;
    when that raises TranslationError it is translated without the CDS checks
    and a message is appended to the module-level ``WARNINGS`` list.

    The file is opened in a ``with`` block so it is closed even when parsing
    fails.

    NOTE(review): the visible code builds ``features`` (keyed by feature
    name) but does not return it.
    """
    print('VIVAN: Parsing %s' % bedFile)
    features = {}
    with open(bedFile, 'r') as bedHandle:
        for line in bedHandle:
            if not line.strip():
                continue
            feature = Feature(line)
            regionSeq = regionSeqs[feature.region]
            feature.cds = feature.getCDS(regionSeq)
            try:
                translated = Seq.translate(feature.cds, cds=True)
            except TranslationError as e:
                # Not a valid complete CDS: translate leniently and record why.
                translated = Seq.translate(feature.cds)
                WARNINGS.append('Translation error in feature : %s\nCDS : %s\nProtein : %s\n%s\n'%(feature.featureLine,feature.cds,translated,e))
            features[feature.name] = feature
Esempio n. 20
0
def group_by_protein(fasta_file):
    """ Groups DNA sequences based on the protein they code for.

        Sequences that cannot be translated as a complete CDS with
        translation table 11 are skipped.

        Args:
            fasta_file (str): path to the FASTA file with DNA
            sequences for a gene.

        Returns:
            protein_diversity (dict): dictionary with the file's
            basename as key and another dictionary as value.
            The nested dictionary has protein sequences as keys;
            each value is a list holding one inner list of
            (allele identifier, DNA sequence) tuples, where the
            allele identifier is the part of the record id after
            the last underscore.
    """

    basename = os.path.basename(fasta_file)
    by_protein = {}
    protein_diversity = {basename: by_protein}

    for record in SeqIO.parse(fasta_file, 'fasta'):
        allele_id = record.id.split('_')[-1]
        sequence = str(record.seq)
        try:
            protein = Seq.translate(sequence, table=11, cds=True)
        except Exception:
            continue

        # First sighting of a protein creates its single inner list.
        by_protein.setdefault(protein, [[]])[0].append((allele_id, sequence))

    return protein_diversity
Esempio n. 21
0
def read_reference(fname, genemap):
    """Read a reference sequence and translate every feature of a gene map.

    ``fname`` is read as a single-record FASTA file; if that fails for any
    reason the file is read as plain text and its stripped lines are
    concatenated.

    ``genemap`` is a tab-separated annotation file.  Comment lines (starting
    with '#') and blank lines are skipped.  Columns 4 and 5 give the 1-based
    inclusive start and end, column 7 the strand, and column 9 a
    ';'-separated list of ``key value`` attributes; empty attribute entries
    (such as the one produced by a trailing ';') are ignored.  The feature
    is extracted from the reference and translated.

    Returns:
        dict: ``{"nuc": reference string, "translations": {name: protein}}``
        where ``name`` is the ``gene_name`` attribute without double quotes,
        or None when the attribute is missing.
    """
    try:
        ref = str(SeqIO.read(fname, 'fasta').seq)
    except Exception:
        # Deliberate best-effort fallback: treat the file as raw sequence text.
        with open(fname, 'r') as fh:
            ref = "".join([x.strip() for x in fh])

    translations = {}
    with open(genemap, 'r') as fh:
        for line in fh:
            if not line.strip() or line[0] == '#':
                continue
            entries = [x.strip() for x in line.strip().split('\t')]
            start = int(entries[3])
            end = int(entries[4])
            strand = entries[6]

            attributes = {}
            for attribute in entries[8].split(';'):
                fields = attribute.split()
                if fields:
                    attributes[fields[0]] = ' '.join(fields[1:])

            if 'gene_name' in attributes:
                name = attributes['gene_name'].strip('"')
            else:
                name = None

            # Convert 1-based inclusive coordinates to a 0-based half-open location.
            location = SeqFeature.FeatureLocation(
                start - 1, end, strand=-1 if strand == '-' else 1)
            translation = Seq.translate(
                SeqFeature.SeqFeature(location).extract(ref))
            translations[name] = str(translation)

    return {"nuc": ref, "translations": translations}
Esempio n. 22
0
def translationBio(data):
    '''Translate every item of data with Biopython and print the joined protein.

    Each item is translated with the 'Standard' table, stop codons produce
    no symbol (stop_symbol='') and translation continues past them.
    '''
    pieces = [
        str(Seq.translate(line, table='Standard', stop_symbol='', to_stop=False))
        for line in data
    ]
    print("".join(pieces))
Esempio n. 23
0
    def export(self, path = '', extra_attr = ['aa_muts']):
        """Write the tree and the sequences to JSON files.

        Two files are written, named by prefixing ``path``:

        * ``tree.json`` -- ``tree_to_json`` of the tree root, with
          ``extra_attr`` passed through.
        * ``sequences.json`` -- a dict whose 'root' entry holds the full root
          nucleotide sequence ('nuc') and the translation of each protein in
          ``self.proteins`` (gaps translated as 'N', and 'X' in the result
          turned back into '-').  Every node with ``clade`` and ``sequence``
          attributes gets an entry keyed by ``node.clade`` with only the
          positions that differ from the root; nodes that also have
          ``translations`` get the same per protein.
        """
        from Bio import Seq
        from itertools import izip

        def differing_states(states, reference):
            # position -> state, only where the state differs from the reference
            return {pos:state for pos, (state, ancstate) in
                    enumerate(izip(states, reference)) if state!=ancstate}

        timetree_fname = path+'tree.json'
        sequence_fname = path+'sequences.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        root_nuc = "".join(self.tree.root.sequence)
        elems = {'root': {'nuc': root_nuc}}
        for prot in self.proteins:
            extracted = str(self.proteins[prot].extract(Seq.Seq(root_nuc)))
            protein = str(Seq.translate(extracted.replace('-', 'N')))
            elems['root'][prot] = protein.replace('X','-')

        for node in self.tree.find_clades():
            if hasattr(node, "clade") and hasattr(node, "sequence"):
                elems[node.clade] = {}
                elems[node.clade]['nuc'] = differing_states(
                    node.sequence, self.tree.root.sequence)
        for node in self.tree.find_clades():
            if hasattr(node, "clade") and hasattr(node, "translations"):
                for prot in self.proteins:
                    elems[node.clade][prot] = differing_states(
                        node.translations[prot], elems['root'][prot])

        write_json(elems, sequence_fname, indent=None)
Esempio n. 24
0
    def add_translations(self):
        '''
        Translate each node's nucleotide sequence into the proteins listed in
        self.proteins (expected to be SeqFeatures) and record amino-acid
        differences relative to the parent node.

        Proteins are processed in order of their feature start position and
        nodes in preorder.  Gap characters ('-') are replaced by 'N' before
        translation.  The root gets empty mutation lists; other nodes get
        (parent_aa, position, node_aa) tuples where the translations differ.
        Finally 'translations' is appended to self.dump_attr.
        '''
        from Bio import Seq

        # Genomic order: sort (name, feature) pairs by feature start.
        proteins_by_start = sorted(self.proteins.items(),
                                   key=lambda item: item[1].start)

        for node in self.tree.find_clades(order='preorder'):
            if not hasattr(node, "translations"):
                # Ordered so downstream code can assemble proteins in
                # genomic order.
                node.translations = OrderedDict()
                node.aa_mutations = {}

            for prot, feature in proteins_by_start:
                nucleotides = str(feature.extract(Seq.Seq("".join(node.sequence))))
                protein = Seq.translate(nucleotides.replace('-', 'N'))
                node.translations[prot] = protein

                if node.up is None:
                    node.aa_mutations[prot] = []
                    continue

                paired = zip(node.up.translations[prot], protein)
                node.aa_mutations[prot] = [(a, pos, d)
                                           for pos, (a, d) in enumerate(paired)
                                           if a != d]

        self.dump_attr.append('translations')
Esempio n. 25
0
    def RFLP_digests(self, fasta_infile):
        """Run ``self.RFLP_digest`` on every record of a FASTA file.

        Each nucleotide sequence is printed, digested, and the resulting
        metadata is extended with the record id, description, nucleotide
        sequence and its length, and the peptide sequence (Standard table,
        stops shown as '*', translated past stop codons) and its length.

        Returns a dict mapping record id to that metadata.
        """
        digests = {}
        for fasta_record in SeqIO.parse(fasta_infile, "fasta"):
            record_id = str(fasta_record.id)
            sequence = str(fasta_record.seq)
            description = str(fasta_record.description)
            print(sequence)

            digest_metadata = self.RFLP_digest(sequence)
            digest_metadata['ID'] = record_id
            digest_metadata['Description'] = description
            digest_metadata['Nucleotide UT Sequence'] = sequence
            digest_metadata['Nucleotide UT Sequence Length'] = len(sequence)

            translation_options = dict(table='Standard',
                                       stop_symbol='*',
                                       to_stop=False,
                                       cds=False,
                                       gap=None)
            peptide = Seq.translate(fasta_record.seq, **translation_options)
            digest_metadata['Peptide UT Sequence'] = str(peptide)
            digest_metadata['Peptide UT Sequence Length'] = len(
                digest_metadata['Peptide UT Sequence'])

            digests[record_id] = digest_metadata
        return digests
Esempio n. 26
0
 def add_translations(self):
     """Attach protein translations to every node of the tree.

     For each node and each key of self.proteins, the feature is extracted
     from the node's joined sequence, gaps ('-') are replaced by 'N', and
     the translation is stored in node.translations under that key.
     """
     from Bio import Seq
     for node in self.tree.find_clades():
         if not hasattr(node, "translations"):
             node.translations = {}
         for prot in self.proteins:
             feature = self.proteins[prot]
             nucleotides = str(feature.extract(Seq.Seq("".join(node.sequence))))
             node.translations[prot] = Seq.translate(nucleotides.replace('-', 'N'))
Esempio n. 27
0
 def test_stops(self):
     """Check how ``self.misc_stops`` translates under several codon tables.

     The same expectations are asserted for the plain string and for a Seq
     built from it.
     """
     expected_by_table = [
         (1, "***RR"),
         ("SGC0", "***RR"),
         (2, "**W**"),
         ("Yeast Mitochondrial", "**WRR"),
         (5, "**WSS"),
         (9, "**WSS"),
         ("Euplotid Nuclear", "**CRR"),
         (11, "***RR"),
         ("Bacterial", "***RR"),
     ]
     for nucleotide_seq in (self.misc_stops, Seq.Seq(self.misc_stops)):
         # Default table first, then every explicitly named table.
         self.assertEqual("***RR", Seq.translate(nucleotide_seq))
         for table, expected in expected_by_table:
             self.assertEqual(expected, Seq.translate(nucleotide_seq, table=table))
def translate(config, rc=False):
    """Translate a base range of the memory-mapped DNA file for ``config``.

    The file named by ``ddna(config)`` is mapped read-only and the bases from
    ``config['startBase']`` to ``config['endBase']`` (1-indexed, inclusive)
    are translated with NCBI table 1, or table 4 when ``mycoplasma(config)``
    is true.

    :param config: mapping with at least 'startBase' and 'endBase'.
    :param rc: when true, the reverse complement of the range is translated.
    :return: dict with 'seq' (the nucleotides used) and 'trans' (the protein).

    The mapping and the file descriptor are always released, also on error.
    """
    table = 1
    if mycoplasma(config):
        # table 4 is for mycoplasma ala:
        # http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
        table = 4
    fd, fmap = None, None
    try:
        log.debug("Doing translation with table %d, rc: %s", table, rc)
        fd = os.open(ddna(config), os.O_RDONLY)
        fmap = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
        # By convention (e.g. from the C or NCBI) the DNA is 1 indexed;
        # our DDNA is a C-style array that is 0 indexed.
        startIdx = config['startBase'] - 1
        # The end index here is inclusive but slicing is not, so there is
        # no need to subtract 1.
        endIdx = config['endBase']
        seq = Seq.Seq(fmap[startIdx:endIdx])

        if rc:
            seq = seq.reverse_complement()
        return {
            'seq': str(seq),
            'trans': str(Seq.translate(seq, table))
        }
    finally:
        # Compare against None: descriptor 0 is valid, and close must
        # actually be called to release the mapping.
        if fmap is not None:
            fmap.close()
        if fd is not None:
            os.close(fd)
Esempio n. 29
0
    def add_translations(self):
        '''
        Translate each node's nucleotide sequence into the proteins listed in
        self.proteins (expected to be SeqFeatures) and record amino-acid
        differences relative to the parent node.

        Proteins are processed in order of their feature start position and
        nodes in preorder.  Gap characters ('-') are replaced by 'N' before
        translation.  The root gets empty mutation lists; other nodes get
        (parent_aa, position, node_aa) tuples where the translations differ.
        Finally 'translations' is appended to self.dump_attr.
        '''
        from Bio import Seq

        # Genomic order: sort (name, feature) pairs by feature start.
        proteins_by_start = sorted(self.proteins.items(),
                                   key=lambda item: item[1].start)

        for node in self.tree.find_clades(order='preorder'):
            if not hasattr(node, "translations"):
                # Ordered so downstream code can assemble proteins in
                # genomic order.
                node.translations = OrderedDict()
                node.aa_mutations = {}

            for prot, feature in proteins_by_start:
                nucleotides = str(feature.extract(Seq.Seq("".join(node.sequence))))
                protein = Seq.translate(nucleotides.replace('-', 'N'))
                node.translations[prot] = protein

                if node.up is None:
                    node.aa_mutations[prot] = []
                    continue

                paired = zip(node.up.translations[prot], protein)
                node.aa_mutations[prot] = [(a, pos, d)
                                           for pos, (a, d) in enumerate(paired)
                                           if a != d]

        self.dump_attr.append('translations')
Esempio n. 30
0
def getCodonTableInfo(codon_table_dict,ref_cds_coordinates_dict,proteins_pos_list,strain,nucleotide_pos,nucleotide,segment):
  """Add one nucleotide to the codon table of every protein position given.

  Each codon is stored as ``[bases, pos1, pos2, pos3, amino_acid]`` under
  ``codon_table_dict[strain][protein_name][codon_number]``.  The first call
  for a codon creates the entry, the second and third calls append their
  base and position, and the third call also stores the translated amino
  acid ('*' for a stop codon).

  :param codon_table_dict: nested dict, modified in place; ``strain`` must
    already be a key when ``proteins_pos_list`` is not empty.
  :param ref_cds_coordinates_dict: not used by this function.
  :param proteins_pos_list: strings of the form "protein_name:codon_number".
  :param strain: key of the strain being filled.
  :param nucleotide_pos: position of the nucleotide, stored as a string.
  :param nucleotide: the base to add.
  :param segment: not used by this function.
  :return: the same ``codon_table_dict`` object that was passed in.

  If a codon already holds three bases, an error is written to stderr and
  the process exits with status 1.
  """
  # Same object, not a copy: callers see the updates in their own dict.
  codon_table_dict_copied = codon_table_dict
  for protein_pos in proteins_pos_list:
    protein_fields = protein_pos.split(":")
    protein_name = protein_fields[0]
    protein_codon_number = int(protein_fields[1])

    protein_table = codon_table_dict_copied[strain].setdefault(protein_name, {})
    codon_entry = protein_table.get(protein_codon_number)

    if codon_entry is None:  # first base of this codon
      protein_table[protein_codon_number] = [nucleotide, str(nucleotide_pos), None, None, None]
    elif codon_entry[2] is None:  # second base
      codon_entry[2] = str(nucleotide_pos)
      codon_entry[0] = codon_entry[0] + nucleotide
    elif codon_entry[3] is None:  # third base: the codon is complete
      codon_entry[3] = str(nucleotide_pos)
      codon_entry[0] = codon_entry[0] + nucleotide
      codon_entry[4] = Seq.translate(codon_entry[0], to_stop=False, stop_symbol='*')
    else:
      # The entry is a list, so it must be converted before concatenation.
      sys.stderr.write("\n[ERROR]: The codon \""+str(protein_codon_number)+"\" is already set in the codon table as, "+str(codon_entry)+". Contact the author because this is a major issue.\n\n")
      sys.exit(1)
  return codon_table_dict_copied
Esempio n. 31
0
def getProtein(dna, protein):
    """Return the first codon table number under which dna translates to protein.

    Tables 1-6 and 9-15 are tried in ascending order (7 and 8 are skipped),
    with stop codons contributing no symbol.  Returns None when no table
    matches.
    """
    candidate_tables = list(range(1, 7)) + list(range(9, 16))
    for table_id in candidate_tables:
        if Seq.translate(dna, stop_symbol='', table=table_id) == protein:
            return table_id
    return None
def translateDNAtoAA(input_fasta, output_fasta, remove_lower_case = False):
    """Translate a one-line-per-sequence DNA FASTA file into a protein FASTA file.

    Header lines (starting with '>') are copied unchanged.  Every other line
    is treated as one complete sequence: its line ending is removed, it is
    translated up to the first stop codon, and the protein is written
    followed by a newline.

    :param input_fasta: path of the DNA FASTA file to read.
    :param output_fasta: path of the protein FASTA file to write.
    :param remove_lower_case: when true, lower-case letters are removed from
        each sequence before translation.
    :raises AssertionError: if a newline-terminated sequence line is not a
        whole number of codons long (checked before lower-case removal).

    The whole line ending ('\\n' or '\\r\\n') is stripped, so no '\\r' reaches
    the translation, and a final line without a newline keeps its last base.
    """
    with open(input_fasta, 'r') as f, open(output_fasta, 'w+') as g:
        for line in f:
            if line[0] == '>':
                g.write(line)
                continue

            seq = line.rstrip('\r\n')
            if seq != line:
                # Same length check as before, applied to terminated lines.
                assert(len(seq) % 3 == 0)
            if remove_lower_case:
                seq = ''.join(base for base in seq
                              if base not in string.ascii_lowercase)
            g.write(Seq.translate(seq, to_stop = True) + '\n')
 def insertion_is_synonymous_match(self, insertion, mtn):
     """Assert that a recovered insertion translates like the original one.

     An ``indels.Insertion`` is rebuilt from ``mtn`` (inserted text from
     "InsertedCodonsText", position "NAPosition" + 2) with the gene and
     genotype of ``insertion``.  The inserted text must be non-empty, and
     the mutated genes of both insertions must give the same translation.
     """
     recovered_pos = mtn["NAPosition"] + 3 - 1
     inserted_text = mtn["InsertedCodonsText"]
     recovered_insertion = indels.Insertion(
         nt_ins=inserted_text,
         nt_pos=recovered_pos,
         gene=insertion.gene,
         genotype=insertion.genotype,
     )
     self.assertGreater(len(inserted_text), 0)
     original_gene = insertion.mutated_gene
     recovered_gene = recovered_insertion.mutated_gene
     original_protein = bioseq.translate(original_gene)
     recovered_protein = bioseq.translate(recovered_gene)
     self.assertEqual(original_protein, recovered_protein)
Esempio n. 34
0
def extract_proteome():
    """Write the proteome FASTA file for the configured genome(s).

    Reads the module-level ``Arguments``, ``ProteomeFastaPath`` and
    ``record`` objects. Every CDS feature of every GenBank record found in
    the files listed by ``Arguments.genome`` is translated and written as
    one FASTA entry, followed by the fixed contaminant set and, when
    ``record[8]`` is truthy, the job-specific contaminants. Entries are
    numbered consecutively across all three sources.

    All input handles and the output handle are closed even when parsing
    fails, and the entries are collected in a list and joined once instead
    of growing one string.

    Returns:
        True once the file has been written.
    """
    print('EXTRAINDO PROTEOMA ...')
    fasta_entries = []
    ProteomeCDSIndex = 0
    # The output is opened first (as before) so the file is truncated even
    # if a later step raises.
    with open(ProteomeFastaPath, 'w') as ProteomeFastaHandle:
        for FilePath in SplitFilePathString(Arguments.genome):
            with open(FilePath) as FileHandle:
                for Scaffold in SeqIO.parse(FileHandle, 'genbank'):
                    print(Scaffold.id)
                    for Feature in Scaffold.features:
                        if Feature.type != 'CDS':
                            continue
                        ProteomeCDSIndex += 1
                        CDSSeq = Feature.location.extract(Scaffold)
                        CDSProtSeq = Seq.translate(CDSSeq)
                        # Qualifier values are lists; use the first entry or
                        # a placeholder when the qualifier is absent.
                        LocusTag = Feature.qualifiers.get(
                            'locus_tag', ['MISSING_LOCUS_TAG'])[0]
                        Product = Feature.qualifiers.get(
                            'product', ['MISSING_PRODUCT'])[0]
                        fasta_entries.append(
                            '>{0}|{0} {1} {2} {3}\n{4}\n'.format(
                                str(ProteomeCDSIndex), Scaffold.id,
                                LocusTag, Product.replace("'", ""),
                                str(CDSProtSeq)))

        # add crap contaminant proteins
        with open('/home/cdtec/Frederico/ms6/bin/crap.fasta') as crapProteinsHandler:
            crapProteinsParser = SeqIO.parse(crapProteinsHandler, 'fasta')
            for crapProteinIndex, crapProtein in enumerate(crapProteinsParser):
                ProteomeCDSIndex += 1
                fasta_entries.append(
                    '>CONTAMINANT_CRAP_{0}|{0} {1} {2}\n{3}\n'.format(
                        str(ProteomeCDSIndex), crapProteinIndex,
                        crapProtein.description.replace("'", ""),
                        str(crapProtein.seq)))

        # add custom contaminant proteins
        if record[8]:
            custom_path = '/home/cdtec/Frederico/ms6/jobs_data/%s/contaminants.fasta'%Arguments.job_id
            with open(custom_path) as customContaminantProteinsHandler:
                customContaminantProteinsParser = SeqIO.parse(
                    customContaminantProteinsHandler, 'fasta')
                for customContaminantIndex, customContaminant in enumerate(
                        customContaminantProteinsParser):
                    ProteomeCDSIndex += 1
                    fasta_entries.append(
                        '>CONTAMINANT_CUSTOM_{0}|{0} {1} {2}\n{3}\n'.format(
                            str(ProteomeCDSIndex), customContaminantIndex,
                            customContaminant.description.replace("'", ""),
                            str(customContaminant.seq)))

        ProteomeFastaHandle.write(''.join(fasta_entries))
    return True
Esempio n. 35
0
 def test_stops(self):
     """Check how ``self.misc_stops`` translates under several codon tables.

     The same expectations are applied to the raw string and to ``Seq``
     objects built from it with different alphabets.
     """
     # (expected protein, keyword arguments passed to Seq.translate)
     expectations = [
         ("***RR", {}),
         ("***RR", {"table": 1}),
         ("***RR", {"table": "SGC0"}),
         ("**W**", {"table": 2}),
         ("**WRR", {"table": 'Yeast Mitochondrial'}),
         ("**WSS", {"table": 5}),
         ("**WSS", {"table": 9}),
         ("**CRR", {"table": 'Euplotid Nuclear'}),
         ("***RR", {"table": 11}),
         ("***RR", {"table": 'Bacterial'}),
     ]
     variants = [
         self.misc_stops,
         Seq.Seq(self.misc_stops),
         Seq.Seq(self.misc_stops, Alphabet.generic_nucleotide),
         Seq.Seq(self.misc_stops, Alphabet.DNAAlphabet()),
         Seq.Seq(self.misc_stops, IUPAC.unambiguous_dna),
     ]
     for nucleotide_seq in variants:
         for expected, options in expectations:
             translated = Seq.translate(nucleotide_seq, **options)
             self.assertEqual(expected, str(translated))
Esempio n. 36
0
 def test_translation(self):
     """Check that ``Seq.translate`` and the ``Seq.translate`` method agree.

     Each test sequence is first trimmed to a whole number of codons. Only
     ``Seq`` objects that contain no "X" are compared.
     """
     for candidate in self.test_seqs:
         whole_codon_length = 3 * (len(candidate) // 3)
         trimmed = candidate[:whole_codon_length]
         if not isinstance(trimmed, Seq.Seq):
             continue
         if "X" in str(trimmed):
             continue
         via_function = Seq.translate(trimmed)
         self.assertEqual(repr(via_function), repr(trimmed.translate()))
Esempio n. 37
0
def mutationType(single_mutations):
    """Label each single mutation as 'silent', 'replacement' or 'unknown'.

    Each entry is a list whose element 0 is the germline sequence and whose
    element 2 is the mutated sequence. The label is appended to the entry
    in place: 'unknown' when either sequence contains '-' or 'N', 'silent'
    when both translate to the same protein, 'replacement' otherwise.
    The number of entries is printed first. Returns the same list object.
    """
    from Bio import Seq

    print(len(single_mutations))
    for entry in single_mutations:
        germline, mutated = entry[0], entry[2]
        untranslatable = ('-' in germline or 'N' in germline
                          or '-' in mutated or 'N' in mutated)
        if untranslatable:
            label = 'unknown'
        elif Seq.translate(germline) == Seq.translate(mutated):
            label = 'silent'
        else:
            label = 'replacement'
        entry.append(label)

    return single_mutations
Esempio n. 38
0
    def test_translation_using_tables_with_ambiguous_stop_codons(self):
        """Check the error and warning raised for table 28.

        'Ambiguous stop codons' are codons with an unambiguous sequence
        whose meaning depends on context: they are listed in the forward
        table as an amino acid and also in the list of stop codons.
        With ``to_stop=True`` translation must raise ``ValueError``;
        without it a warning with a known prefix and suffix is expected.
        """
        coding = "ATGGGCTGA"
        self.assertRaises(ValueError, Seq.translate, coding,
                          table=28, to_stop=True)
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            Seq.translate(coding, table=28)
            last_message = str(caught[-1].message)
            self.assertTrue(last_message.startswith("This table contains"))
            self.assertTrue(
                last_message.endswith("be translated as amino acid."))
Esempio n. 39
0
    def test_translation_on_proteins(self):
        """Check translation fails on a protein.

        Both the module-level function and the method must raise
        ``TranslationError``. When the length is not a multiple of three a
        ``BiopythonWarning`` is expected as well.
        """
        for s in protein_seqs:
            expect_warning = len(s) % 3 != 0
            attempts = (
                lambda: Seq.translate(s),
                lambda: s.translate(),
            )
            # Each callable is invoked within the same iteration, so it
            # always sees the current ``s``.
            for attempt in attempts:
                if expect_warning:
                    with self.assertWarns(BiopythonWarning):
                        with self.assertRaises(TranslationError):
                            attempt()
                else:
                    with self.assertRaises(TranslationError):
                        attempt()
Esempio n. 40
0
 def __init__(self, string):
     """Build the object from a nucleotide or protein string.

     ``string`` is lower-cased. When ``is_nucleotide`` accepts it, the
     lower-cased nucleotides are kept in ``self.nucleotide`` and the
     string is replaced by its lower-cased translation. The resulting
     protein string is split on '*' into ``self.primary``.
     ``self.secondary`` and ``self.structures`` start out empty.

     NOTE(review): ``self.nucleotide`` is only set for nucleotide input;
     confirm that callers never read it for protein input.
     """
     string = string.lower()
     if is_nucleotide(string):
         self.nucleotide = string
         # Silence Biopython warnings for this translation only; a bare
         # simplefilter() call would change the filter for the rest of the
         # process.
         with warnings.catch_warnings():
             warnings.simplefilter('ignore', BiopythonWarning)
             string = Seq.translate(string).lower()
     self.primary = string.split('*')
     self.secondary = []
     self.structures = []
 def deletion_is_synonymous_match(self, deletion, mtn):
     """Assert that ``deletion`` and the one rebuilt from ``mtn`` encode the same protein.

     A second ``indels.Deletion`` is built from ``mtn``: its position is
     ``NAPosition - 1`` and its length is the number of '-' characters in
     the ``Control`` entry. Gene and genotype are taken from ``deletion``.
     """
     rebuilt_pos = mtn["NAPosition"] - 1
     gap_count = mtn["Control"].count('-')
     rebuilt = indels.Deletion(
         nt_pos=rebuilt_pos,
         gene=deletion.gene,
         genotype=deletion.genotype,
         nt_count=gap_count,
         orig_nt=None,
     )
     self.assertGreater(gap_count, 0)
     gene_pair = (deletion.mutated_gene, rebuilt.mutated_gene)
     # Compare at the amino-acid level so synonymous differences pass.
     proteins = [bioseq.translate(gene) for gene in gene_pair]
     self.assertEqual(proteins[0], proteins[1])
Esempio n. 42
0
def calc_total_poss_subst(codon):
    """Count the possible synonymous and non-synonymous point substitutions of a codon.

    Every position below ``Utility.NUC_PER_CODON`` is replaced in turn by
    each of A, C, T and G that differs (case-insensitively) from the
    current base. The mutated codon is translated and compared
    (case-insensitively) with the translation of the original codon.

    Returns:
        A ``(synonymous, non_synonymous)`` tuple of float counts.
    """
    syn_count = 0.0
    nonsyn_count = 0.0
    reference_aa = str(Seq.translate(codon)).upper()
    for pos in range(0, Utility.NUC_PER_CODON):
        current_base = str(codon[pos]).upper()
        for base in ("A", "C", "T", "G"):
            if base.upper() == current_base:
                continue
            replacement = Seq.Seq(base)
            variant = codon[:pos] + replacement + codon[pos+1:]
            variant_aa = str(Seq.translate(variant)).upper()
            if variant_aa == reference_aa:
                syn_count += 1
            else:
                nonsyn_count += 1

    return syn_count, nonsyn_count
Esempio n. 43
0
def translateDNAtoAA(input_fasta, output_fasta):
    """Translate every sequence line of a FASTA file into protein.

    Header lines (first character '>') are copied to ``output_fasta``
    unchanged. Every other line is treated as one complete coding
    sequence: its line ending is removed, it is translated up to the first
    stop codon (``to_stop=True``) and written followed by a newline.

    Parameters:
        input_fasta: path of the nucleotide FASTA file to read.
        output_fasta: path of the protein FASTA file to (over)write.

    Raises:
        AssertionError: if a sequence line (without its line ending) is not
            a whole number of codons.
    """
    with open(input_fasta, 'r') as f:
        with open(output_fasta, 'w+') as g:
            for line in f:
                if line[0] == '>':
                    g.write(line)
                    continue
                # Strip the line ending explicitly: the previous
                # "len % 3 == 1" check plus line[:-1] rejected CRLF files
                # and a final line that lacks a trailing newline.
                seq = line.rstrip('\r\n')
                assert len(seq) % 3 == 0
                g.write(Seq.translate(seq, to_stop = True) + '\n')
# Recognised (scaffold, block) name pairs, in the order they are tested.
_FRAGMENT_KEYS = (
    ('4AC0', 'B0'),
    ('4AC0', 'B1'),
    ('2uxo', 'B0'),
    ('2uxo', 'B1'),
)

# (scaffold, block) -> (start, stop) slice of the design sequence line.
_DESIGN_BLOCK_RANGES = {
    ('4AC0', 'B0'): (77, 117),
    ('4AC0', 'B1'): (99, 138),
    ('2uxo', 'B0'): (62, 100),
    ('2uxo', 'B1'): (136, 176),
}

# (scaffold, block) -> (5' flank, 3' flank) surrounding the coding fragment
# in the lower-cased oligo sequence.
_OLIGO_FLANKS = {
    ('4AC0', 'B0'): ('gtgacccgtccctgggtctcaagat', 'gccttgagaccgggcagaggtcgac'),
    ('4AC0', 'B1'): ('tgcccgctgtcttcaggtctcaagta', 'catttgagacctgtagcccggcagtg'),
    ('2uxo', 'B0'): ('cgatcgtgcccacctggtctccactg', 'gttctgagaccagttggagcccgcac'),
    ('2uxo', 'B1'): ('ctggtgcgtcgtctggtctctggat', 'cgttggagaccggcgaacacttccc'),
}


def _fragment_key(header):
    """Return the (scaffold, block) pair named in a header line, or None."""
    for scaffold, block in _FRAGMENT_KEYS:
        if scaffold in header and block in header:
            return scaffold, block
    return None


def check_fragments(oligo_file, design_fasta):
    """Check that every oligo encodes one design block and vice versa.

    Both files are read as alternating header/sequence line pairs. For each
    design the block slice selected by its header is taken from the
    sequence line. For each oligo the part between the flanking sequences
    selected by its header is extracted and translated. Translated
    fragments are matched one-to-one against the design blocks; anything
    left unmatched on either side is reported on stderr, and 'done' is
    written to stdout at the end.

    The header must contain BOTH the scaffold name and the block name. The
    previous test (``'4AC0' and 'B0' in pdb``) only looked at the block
    name, so every 2uxo entry was processed with the 4AC0 settings.

    Parameters:
        oligo_file: path of the oligo FASTA file (nucleotides).
        design_fasta: path of the design FASTA file (amino acids).

    Raises:
        Exception: if a header names no recognised scaffold/block pair.
        IndexError: if an oligo does not contain its expected 5' flank.
    """
    design_aa_list = []
    with open(design_fasta, 'r') as f:
        for pdb, seq in izip_longest(f, f, fillvalue=None):
            key = _fragment_key(pdb)
            if key is None:
                raise Exception('Unrecognized design name')
            start, stop = _DESIGN_BLOCK_RANGES[key]
            design_aa_list.append(seq[start:stop])

    fragment_list = []
    with open(oligo_file, 'r') as o:
        for pdb, seq in izip_longest(o, o, fillvalue=None):
            key = _fragment_key(pdb)
            if key is None:
                raise Exception('Unrecognized oligo name')
            five_prime, three_prime = _OLIGO_FLANKS[key]
            seq_no_5p = seq.lower().split(five_prime)[1]
            fragment_list.append(seq_no_5p.split(three_prime)[0])

    missing_list = []
    for item in fragment_list:
        aa_fragment = Seq.translate(item)
        if aa_fragment in design_aa_list:
            # Remove the match so duplicated designs need duplicated oligos.
            design_aa_list.remove(aa_fragment)
        else:
            missing_list.append(aa_fragment)
    if missing_list:
        sys.stderr.write('Error: The following oligo sequences do not match a design amino acid sequence\n')
        for miss in missing_list:
            sys.stderr.write('{0}\n'.format(miss))
    if design_aa_list:
        sys.stderr.write('Error: The following design sequences do not match an oligo sequence\n')
        for design in design_aa_list:
            sys.stderr.write('{0}\n'.format(design))
    sys.stdout.write('done\n')
Esempio n. 45
0
    def get_syn_mutations(self, region, mask_constrained = True):
        from itertools import izip
        if region in self.annotation and self.annotation[region].type in ['gene', 'protein']:
            try:
                aft = self.get_allele_frequency_trajectories(region)
                if len(aft.mask.shape) == 0:
                    aft_valid = np.ones((aft.shape[0], aft.shape[-1]), dtype=bool)
                else:
                    aft_valid = -np.array([af.mask.sum(axis=0) for af in aft], dtype=bool)
                gaps = self.get_gaps_by_codon(region)
                initial_seq = self.get_initial_sequence(region)
                consensi = []
                for af in aft:
                    tmp = consensus(af)
                    tmp[gaps]='N'
                    consensi.append(tmp)

                cons_aa = np.array([np.fromstring(Seq.translate(''.join(cons)), 
                                   dtype='|S1') for cons in consensi])
                no_substitution = np.repeat(np.array([len(np.unique(col[ind]))==1 
                                for ind, col in izip(aft_valid.T[::3], cons_aa.T)], dtype=bool), 3)

                syn_muts = np.zeros(aft.shape[1:], dtype=bool)
                for pos in xrange(aft.shape[-1]):
                    ci = pos//3
                    rf = pos%3
                    codon = ''.join(initial_seq[ci*3:(ci+1)*3])
                    for ni,nuc in enumerate(alpha[:4]):
                        mod_codon = codon[:rf] + nuc + codon[rf+1:]
                        try:
                            syn_muts[ni,pos] = (Seq.translate(codon)==Seq.translate(mod_codon))\
                                                *no_substitution[pos]
                        except:
                            syn_muts[ni,pos] = False
                if mask_constrained:
                    syn_muts[:,self.get_constrained(region)] = False
                return syn_muts
            except:
                import pdb; pdb.set_trace()
        else:
            print region,"is not a valid protein or gene"
            return None
Esempio n. 46
0
    def get_protein_seq(self, transcript_id):
        """Return the protein encoded by the CDS of ``transcript_id``.

        The CDS is the slice ``cds_start - 1 : cds_stop`` of the transcript
        sequence, using the coordinates stored in the transcript's GAF
        record. A single trailing stop symbol ('*') is removed.

        Returns:
            The translated sequence, or None when the transcript record or
            sequence is missing/empty or the record has no usable
            ``cds_start``.
        """
        gaf_record = self.get_transcript(transcript_id)
        tx_seq = self.get_transcript_seq(transcript_id)
        if not gaf_record or not tx_seq:
            return None

        if "cds_start" not in gaf_record or not gaf_record["cds_start"]:
            return None

        cds_seq = tx_seq[gaf_record["cds_start"] - 1 : gaf_record["cds_stop"]]
        prot_seq = Seq.translate(cds_seq)
        # Test for emptiness first: an empty translation (e.g. a CDS slice
        # shorter than one codon) has no last element to inspect.
        if prot_seq and prot_seq[-1] == "*":
            prot_seq = prot_seq[:-1]

        return prot_seq
def create_sequence_dbs_for_GAF(gaf, transcripts_file, output_dir):
    """Build the transcript and protein sequence stores for a GAF database.

    Every FASTA record of ``transcripts_file`` whose name is a transcript
    id in ``gaf`` is stored in the transcript store. When its GAF record
    has a usable ``cds_start``, the CDS slice is translated and stored in
    the protein store without its trailing stop symbol. Transcripts whose
    translation contains a stop but does not end with one are collected
    and deleted from the transcript store afterwards.

    Returns:
        ``(transcripts_to_remove, protein_seqs_url)``: the removed
        transcript names and the URL of the protein store.
    """
    from Bio import SeqIO
    from Bio import Seq
    import os

    print("Indexing GAF db by transcript id...\n")
    # transcript id -> (position in list, outer key, inner key)
    gaf_transcript_idx = dict()
    for outer_key in gaf:
        for inner_key in gaf[outer_key].keys():
            for position, entry in enumerate(gaf[outer_key][inner_key]):
                gaf_transcript_idx[entry['transcript_id']] = (position, outer_key, inner_key)

    fh_transcripts = SeqIO.parse(transcripts_file, 'fasta')
    transcripts_shlv = Shove("file://" + os.path.join(output_dir, 'GAF_transcript_seqs.fa.shove'))
    protein_seqs_url = "file://" + os.path.join(output_dir, 'GAF_protein_seqs.fa.shove')
    proteins_shlv = Shove(protein_seqs_url)

    print("Writing transcript and protein shove dbs...")
    seen = 0
    transcripts_to_remove = list()
    for transcript in fh_transcripts:
        # Progress indicator: one line per 1000 FASTA records read.
        if seen % 1000 == 0:
            print(seen)
        seen += 1
        name = transcript.name
        if name not in gaf_transcript_idx:
            continue
        position, outer_key, inner_key = gaf_transcript_idx[name]
        gaf_record = gaf[outer_key][inner_key][position]
        raw_seq = str(transcript.seq)
        transcripts_shlv[name] = raw_seq
        if 'cds_start' not in gaf_record or not gaf_record['cds_start']:
            continue
        cds_seq = raw_seq[gaf_record['cds_start']-1:gaf_record['cds_stop']]
        prot_seq = Seq.translate(cds_seq)
        if prot_seq[-1] == '*':
            prot_seq = prot_seq[:-1]
        elif prot_seq.find('*') != -1:
            # skip small number (n=12) transcripts with incorrect CDS coordinates
            transcripts_to_remove.append(name)
            continue

        proteins_shlv[name] = prot_seq

    for removed_name in transcripts_to_remove:
        del transcripts_shlv[removed_name]

    transcripts_shlv.close()
    proteins_shlv.close()

    return transcripts_to_remove, protein_seqs_url
Esempio n. 48
0
def applyBias( multiple_mutations,multiple_group,bias ):
    """Determine types for multi mutations using pre-set bias.

    Each entry of ``multiple_mutations`` is a list whose element 0 is the
    germline sequence and element 2 the mutated sequence.
    ``multiple_group`` holds one group value per entry; entries with equal
    values are labelled together. ``bias`` is the type label to favour
    (the labels produced here are 'silent', 'replacement' and 'unkown').

    For an entry that has not been labelled yet, every ordering of its
    mismatching positions is replayed one substitution at a time and each
    step is classified. The ordering with the most steps of type ``bias``
    is kept (the first one on ties) and its step types are appended, one
    each, to the entries of the same group.

    Returns the same ``multiple_mutations`` list, modified in place.
    """
    from itertools import permutations
    from Bio import Seq
    from collections import Counter

    # counted[k] becomes 1 once entry k has received its type label.
    counted = [0]*len(multiple_mutations)
    for mutation in multiple_mutations:
        # list.index returns the position of the first entry equal to this one.
        if counted[multiple_mutations.index(mutation)] != 1:
            # Positions where germline and mutated sequence differ.
            mismatch_positions = [i for i in range(len(mutation[0])) if mutation[0][i]!=mutation[2][i]]
            # Every possible order in which those substitutions could have occurred.
            p = list(permutations(mismatch_positions))
            type_list = []
            type_count = []
            for i in range(len(p)):
                types = []
                germline = mutation[0]
                for j in range(len(p[i])):
                    # Apply only the j-th substitution of this ordering to the current sequence.
                    mutated = germline[:p[i][j]] + mutation[2][p[i][j]] + germline[p[i][j]+1:]
                    if '-' not in germline and 'N' not in germline and '-' not in mutated and 'N' not in mutated:
                        if Seq.translate(germline) == Seq.translate(mutated):
                            types.append('silent')
                        else:
                            types.append('replacement')
                    else:
                        # Runtime label is spelled 'unkown' (sic); mutationType uses 'unknown'.
                        types.append('unkown')
                    # The next substitution is applied on top of this one.
                    germline = mutated
                type_list.append(types)
                type_frequency = Counter(types)
                # Number of steps in this ordering whose type equals the bias.
                type_count.append(type_frequency[bias])
            # Keep the step types of the ordering with the highest bias count.
            type_list = type_list[type_count.index(max(type_count))]

            # All entries that share this entry's group value.
            indices = [i for i, x in enumerate(multiple_group) if x == multiple_group[multiple_mutations.index(mutation)]]
            for idx in indices:
                counted[idx] = 1
                # The k-th entry of the group receives the k-th step type of the chosen ordering.
                multiple_mutations[idx].append(type_list[indices.index(idx)])
    return multiple_mutations
Esempio n. 49
0
def translate(nuc):
    """Translate nucleotide sequence to amino acid.

    Gap characters ('-') are replaced by 'N' before translation. In the
    result, every codon that was exactly '---' in ``nuc`` is reported as
    '-' instead of the translated symbol. When translation fails, a
    message is printed and every codon is reported as 'X' (or '-' for
    '---' codons).

    Returns:
        The amino-acid sequence as a string.
    """
    from Bio import Seq
    try:
        tmp_aa = Seq.translate(nuc.replace('-', 'N'))  # returns string when argument is a string, Bio.Seq otherwise
    except Exception:
        print("translation failed",nuc)
        # Parenthesised on purpose: 'X'*len(nuc)//3 is parsed as
        # ('X'*len(nuc))//3 and raises TypeError.
        tmp_aa = 'X' * (len(nuc) // 3)
    return ''.join(
        '-' if nuc[i*3:(i+1)*3] == '---' else aa
        for i, aa in enumerate(tmp_aa)
    )
Esempio n. 50
0
def translate(seq):
    """Translate ``seq`` in all six reading frames.

    Returns a dict with the keys 'First Frame', 'Second Frame' and
    'Third Frame' for the given strand, and the same three names prefixed
    with 'Complement ' for its reverse complement.
    """
    frame_names = ('First Frame', 'Second Frame', 'Third Frame')
    reverse_strand = Seq.reverse_complement(seq)
    frames = {}
    for prefix, strand in (('', seq), ('Complement ', reverse_strand)):
        for offset, frame_name in enumerate(frame_names):
            frames[prefix + frame_name] = Seq.translate(strand[offset:])
    return frames
Esempio n. 51
0
 def translate_sequence(input_seq):
     """Wrapper for Biopython translate function.

     Bio.Seq.translate complains when the input length is not a multiple
     of 3. This wrapper pads such input with 'N' up to the next whole
     codon, translates it, and then removes the residue that came from
     the padded codon.
     """
     leftover = len(input_seq) % 3
     if not leftover:
         return Seq.translate(input_seq)

     # One leftover base needs two pad characters, two leftover bases one.
     padding = 'NN' if leftover == 1 else 'N'
     padded_seq = ''.join([input_seq, padding])
     translated = Seq.translate(padded_seq)
     # Drop the last residue: it was produced by the padded codon.
     return translated[:-1]
Esempio n. 52
0
    def test_translation_using_cds(self):
        """Check ``cds=True`` translation with table 2.

        A complete CDS must translate to the expected protein. Three
        defective variants must each raise ``TranslationError``.
        """
        complete_cds = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
        self.assertEqual(
            "MAIVMGRWKGAR", Seq.translate(complete_cds, table=2, cds=True)
        )

        defective_cds = (
            "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCG",  # not multiple of three
            "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",  # no stop codon
            "GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",  # no start codon
        )
        for seq in defective_cds:
            with self.assertRaises(TranslationError):
                Seq.translate(seq, table=2, cds=True)
def translate_en_orf6():
    """Write six-frame translations for every FASTA file in ``tgt_list``.

    For each file name ``tgt`` (relative to the module-level
    ``path_base``), the records are read and translated in three frames
    of the given strand (ORF1-3) and three frames of its reverse
    complement (ORF4-6). The result is written to
    ``<name without last extension>_translate.fasta`` next to the input.
    The output file name is printed for each input.

    Both handles are closed when processing ends or fails. The reverse
    complement is computed once per record.
    """
    for tgt in tgt_list:
        with open(path_base+tgt) as input_handle:
            output_fname = path_base+'.'.join(tgt.split('.')[:-1])+'_translate.fasta'
            print(output_fname)
            with open(output_fname, 'w') as output_handle:
                for record in SeqIO.parse(input_handle, 'fasta'):
                    forward = record.seq
                    reverse = record.reverse_complement().seq
                    # (strand, frame offset) in ORF1..ORF6 order
                    frame_sources = [
                        (forward, 0), (forward, 1), (forward, 2),
                        (reverse, 0), (reverse, 1), (reverse, 2),
                    ]
                    frames = []
                    for orf_number, (strand, offset) in enumerate(frame_sources, 1):
                        tag = 'ORF%d|' % orf_number
                        frames.append(SeqRecord(
                            Seq.translate(strand[offset:]),
                            id=record.id+tag,
                            name=record.name+tag,
                            description=record.description))
                    SeqIO.write(frames, output_handle, "fasta")
Esempio n. 54
0
        print s.complement()
        assert False, "Complement shouldn't work on a protein!"
    except ValueError :
        pass
    try :
        print s.reverse_complement()
        assert False, "Reverse complement shouldn't work on a protein!"
    except ValueError :
        pass
   
print
print "Translating"
print "==========="
for nucleotide_seq in test_seqs:
    try :
        expected = Seq.translate(nucleotide_seq)
        print "%s\n-> %s" \
        % (repr(nucleotide_seq) , repr(expected))
    except (ValueError, TranslationError), e :
        expected = None
        print "%s\n-> %s" \
        % (repr(nucleotide_seq) , str(e))
    #Now test the Seq object's method
    if isinstance(nucleotide_seq, Seq.Seq) :
        try :
            assert repr(expected) == repr(nucleotide_seq.translate())
        except (ValueError, TranslationError) :
            assert expected is None
    #Now check translate(..., to_stop=True)
    try :
        short = Seq.translate(nucleotide_seq, to_stop=True)
Esempio n. 55
0
def translate(nuc, to_stop=False):
    """Return the amino-acid translation of ``nuc``.

    ``to_stop`` is passed through to ``Bio.Seq.translate``.
    """
    from Bio import Seq

    # Seq.translate returns a string when the argument is a string,
    # Bio.Seq otherwise.
    protein = Seq.translate(nuc, to_stop=to_stop)
    return protein
Esempio n. 56
0
def access_mixed_aa(file_name):
    """(str) -> (list, list, list, list).

    Read a multi-FASTA nucleotide file and translate it codon by codon,
    resolving codons that contain ambiguous nucleotides into alternative
    amino acids.

    Returns four lists:
        aa: one entry per processed codon; an amino-acid letter, or
            several letters joined with "/" (e.g. 'I/L') for an ambiguous
            codon.
        nucleotide_idx: 1-based position of the ambiguous nucleotide for
            every codon that resolved to more than one amino acid.
        nucl_codon: the codon text for each entry of nucleotide_idx.
        seqids: the FASTA record id for each entry of nucleotide_idx.

    Relies on names defined elsewhere in the module: Seq, IUPAC,
    AMBICODON, list_overlap, nearbyPermutations and getaalist.
    """
    from Bio import SeqIO
    aa = []
    nucleotide_idx = []
    nucl_codon = []
    seqids = []
    for seq_record in SeqIO.parse(file_name, 'fasta'):
        seq_id = seq_record.id
        seq_len = len(seq_record)  # not used below
        header, seqline = seq_record.id, str(seq_record.seq)

        my_seq = Seq(str(seqline), IUPAC.ambiguous_dna)  # not used below
        # Gaps are treated like the ambiguity code N.
        seqline = seqline.replace("-", "N")
        n = 3
        # Map the 1-based position of each codon's last nucleotide to the
        # codon text (the final codon may be shorter than three letters).
        codon_list = {i + n: seqline[i:i + n] for i in range(0, len(seqline), n)}
        ambi_nucl = AMBICODON.keys()
        # Walk the codons in sequence order.
        for key, codon in sorted(codon_list.iteritems()):
            # presumably true when the codon contains at least one ambiguity
            # code; verify against list_overlap
            if list_overlap(codon, ambi_nucl):
                # Unpacking requires exactly three letters.
                d, e, f = codon
                m = [d, e, f]
                items = set(m).intersection(ambi_nucl)
                # Offset (0-2) within the codon of one of the ambiguity codes
                # found; which one is picked depends on set ordering.
                indexm = m.index(list(items)[0])
                items = list(items)      # e.g. ['R']
                # Runs once per distinct ambiguity code in the codon; the loop
                # values themselves are overwritten below.
                for idx, val in enumerate(items):
                    # presumably expands the codon into its unambiguous
                    # variants and translates each; verify against
                    # nearbyPermutations / getaalist
                    codonlist = list(nearbyPermutations(codon))
                    val = getaalist(codonlist)
                    # Collapse identical amino acids, e.g. ['D', 'D'] -> ['D'].
                    val = list(set(val))
                    val = "/".join(val)   # yields e.g. 'I/L'
                    val = str(val)
                    # Only codons that resolve to more than one amino acid are
                    # recorded. key is shifted from the codon's last
                    # nucleotide to the position selected by indexm.
                    # NOTE(review): key is reassigned inside this loop, so a
                    # codon with two distinct ambiguity codes shifts it twice
                    # -- confirm this is intended.
                    if "/" in val and indexm == 2:
                        key = key
                        nucleotide_idx.append(key)
                        nucl_codon.append(codon)
                        seqids.append(seq_id)
                    elif "/" in val and indexm == 1:
                        key = key - 1
                        nucleotide_idx.append(key)
                        nucl_codon.append(codon)
                        seqids.append(seq_id)
                    elif "/" in val and indexm == 0:
                        key = key - 2
                        nucleotide_idx.append(key)
                        nucl_codon.append(codon)
                        seqids.append(seq_id)
                    else:
                        pass
                    aa.append(val)

            else:
                # Unambiguous codon: translate it directly.
                aa1 = Seq(codon, IUPAC.unambiguous_dna)
                aa1 = aa1.translate()
                aa1 = str(aa1)
                aa.append(aa1)
    return aa, nucleotide_idx, nucl_codon, seqids
Esempio n. 57
0
from pileup_user import read_fa_file
from Bio import Seq

reference = read_fa_file("data/reference.fa")

# Amino acid at index 274 of the translated reference.
print("Original sequence: " + Seq.translate(reference)[274])

# Copy of the reference with the element at index 822 (the first position of
# codon 274, since 822 == 274 * 3) replaced by "T".
mutated_reference = "".join(
    "T" if position == 822 else base
    for position, base in enumerate(reference)
)

print("Read sequence: " + Seq.translate(mutated_reference)[274])