Ejemplo n.º 1
0
def disambiguateaa(input, output):
    """Replace IUPAC ambiguous amino acids with unambiguous ones

    Specifically, make the following replacements:
    B => DN
    X => ACDEFGHIKLMNPQRSTVWY
    Z => EQ
    J => LI
    U => C (selenocysteine)
    O => K (pyrrolysine)

    If there are multiple possible replacements, this operation will output a
    sequence for each possible option. Use caution with sequences that are
    highly ambiguous (e.g., with many Xs), as in this case a single sequence
    could lead to an explosion in the output.

    When a record expands to more than one sequence, each output record is
    named "<name>|disambig_<k>", where <k> is a 1-based counter zero-padded to
    the width of the total number of expansions. A record with a single
    expansion keeps its original name.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for record_name, ambiguous_seq, _qual in readfq(input):
        total = num_disambiguated_iupac_aa(ambiguous_seq)
        # Number of decimal digits in `total`, so every counter in this
        # record is padded to the same width.
        width = floor(log10(total)) + 1
        template = f"{record_name}|disambig_{{:0{width}d}}"
        needs_suffix = total > 1
        expansions = disambiguate_iupac_aa(ambiguous_seq)
        for counter, resolved in enumerate(expansions, start=1):
            header = template.format(counter) if needs_suffix else record_name
            print(f">{header}\n{resolved}", file=output)
Ejemplo n.º 2
0
 def test_O(self):
     """O (pyrrolysine) resolves to the single residue K."""
     resolved = set(map(str, disambiguate_iupac_aa(Seq("AAOAA"))))
     expected = {"AAKAA"}
     assert resolved == expected
Ejemplo n.º 3
0
 def test_adjacent_ambig(self):
     """Two adjacent J residues expand to every L/I combination."""
     source = Seq("AAJJAA")
     # Convert to str before building the set: hashing a Seq triggers an
     # annoying Biopython warning.
     resolved = {str(candidate) for candidate in disambiguate_iupac_aa(source)}
     expected = {"AALLAA", "AALIAA", "AAILAA", "AAIIAA"}
     assert len(resolved) == 4
     assert resolved == expected
Ejemplo n.º 4
0
 def test_Z(self):
     """Z resolves to either E or Q."""
     resolved = set(map(str, disambiguate_iupac_aa(Seq("AAZAA"))))
     expected = {"AAEAA", "AAQAA"}
     assert resolved == expected
Ejemplo n.º 5
0
 def test_J(self):
     """J resolves to either L or I."""
     resolved = set(map(str, disambiguate_iupac_aa(Seq("AAJAA"))))
     expected = {"AALAA", "AAIAA"}
     assert resolved == expected
Ejemplo n.º 6
0
 def test_X(self):
     """X resolves to one sequence per residue in all_aa_protein_seq."""
     resolved = set(map(str, disambiguate_iupac_aa(Seq("AAXAA"))))
     # One expected sequence for each residue, substituted at the X position.
     expected = set(map("AA{}AA".format, all_aa_protein_seq))
     assert resolved == expected
Ejemplo n.º 7
0
 def test_B(self):
     """B resolves to either D or N."""
     resolved = set(map(str, disambiguate_iupac_aa(Seq("AABAA"))))
     expected = {"AADAA", "AANAA"}
     assert resolved == expected
Ejemplo n.º 8
0
 def test_unambig(self):
     """A sequence with no ambiguous residues yields exactly itself."""
     resolved = [*disambiguate_iupac_aa(all_aa_protein_seq)]
     assert len(resolved) == 1
     only_result = resolved[0]
     assert only_result == all_aa_protein_seq
Ejemplo n.º 9
0
 def test_adjacent_ambig(self):
     """Two adjacent J residues expand to every L/I combination."""
     ambig = Seq("AAJJAA", protein)
     # Compare plain strings rather than the raw results: this avoids hashing
     # Seq objects (which can trigger a Biopython warning) and keeps the
     # set comparison independent of Seq hash/equality semantics.
     proteins = {str(p) for p in disambiguate_iupac_aa(ambig)}
     assert len(proteins) == 4
     assert proteins == {"AALLAA", "AALIAA", "AAILAA", "AAIIAA"}
Ejemplo n.º 10
0
 def test_U(self):
     """U (selenocysteine) resolves to the single residue C."""
     source = Seq("AAUAA", protein)
     resolved = set(map(str, disambiguate_iupac_aa(source)))
     expected = {"AACAA"}
     assert resolved == expected