Ejemplo n.º 1
0
# Tutorial-style excerpt: translation, codon tables, MutableSeq and UnknownSeq.
# NOTE(review): `coding_dna` is not defined in this excerpt; presumably a Seq
# created earlier in the original tutorial.
print(coding_dna.translate(to_stop=True))

# Look up two codon tables by name and print details of the mitochondrial one.
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
print(mito_table)
print(mito_table.stop_codons)
print(mito_table.start_codons)
print(mito_table.forward_table["ACG"])

# NOTE(review): `my_seq` is not defined in this excerpt. If it is an immutable
# Seq, this item assignment presumably raises -- confirm against the Bio.Seq docs.
my_seq[1] = "N"

# Two ways of obtaining an editable sequence; the second assignment replaces
# the first.
mutable_seq = my_seq.tomutable()
# or
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

# Edit in place: overwrite index 5, delete index 4, then call remove('A')
# (presumably removes the first 'A', list-style -- confirm).
mutable_seq[5] = "A"
print(mutable_seq)
del mutable_seq[4]
mutable_seq.remove('A')
print(mutable_seq)
# Convert the edited sequence back with toseq().
new_seq = mutable_seq.toseq()
print(new_seq)

# UnknownSeq is built from a length, optionally with an explicit character.
from Bio.Seq import UnknownSeq
unk = UnknownSeq(10)
print(unk)
unk = UnknownSeq(10, character="A")
print(unk)
# The translation result is stored but never printed in this excerpt.
unk_protein = unk.translate()
Ejemplo n.º 2
0
def _record_matches(pattern, record):
    """Return True if the compiled regex matches the record's id or description."""
    return bool(pattern.search(record.id) or pattern.search(record.description))


def _dates_within_range(record, iso_date_re, start_date, end_date):
    """Return False if an ISO date in the record's description or id is out of range."""
    if not (start_date or end_date):
        # No date filtering requested, so do not even try to parse dates.
        return True
    in_range = True
    for field in (record.description, record.id):
        match = iso_date_re.search(field)
        if not match:
            continue
        seq_date = datetime.datetime.strptime(match.group(0), "%Y-%m-%d")
        if start_date and seq_date < start_date:
            in_range = False
        if end_date and seq_date > end_date:
            in_range = False
    return in_range


def clean_seqs(fasta_in,fasta_out=None,filter_include_expression=None,filter_exclude_expression=None,bp_ranges=None,start_date=None,end_date=None,ungap=None):
    """Filter, trim and upper-case the records of a FASTA file into a new file.

    Args:
        fasta_in: object whose ``name`` attribute is the input FASTA path.
        fasta_out: output path. When falsy, a path next to the input is built
            from the input basename, "_cleaned" and a summary of the options.
        filter_include_expression: regex; when given, only records whose id
            or description matches are kept.
        filter_exclude_expression: regex; records whose id or description
            matches are dropped.
        bp_ranges: iterable of (start, end) pairs, 1-based with inclusive
            end. Each is clipped to the sequence bounds and the slices are
            concatenated in the order given. Empty/None keeps the whole
            sequence.
        start_date: records carrying a YYYY-MM-DD date earlier than this are
            dropped. Must support ``strftime`` and comparison with
            ``datetime.datetime``.
        end_date: records carrying a YYYY-MM-DD date later than this are
            dropped. Same requirements as ``start_date``.
        ungap: when not None, passed to ``Seq.ungap`` to strip that gap
            character.

    Raises:
        IOError: if the output path already exists.
    """
    iso_date_re = re.compile(r'(\d{4}-\d{2}-\d{2})')

    bp_ranges = bp_ranges or []

    # Describe the active options; used only to build the default output name.
    summary_parts = [
        "_".join("%s-%sbp" % (t[0], t[1]) for t in bp_ranges),
        "starting_" + start_date.strftime("%Y-%m-%d") if start_date else "",
        "ending_" + end_date.strftime("%Y-%m-%d") if end_date else "",
        "only_subset_by_filter" if filter_include_expression else "",
        "excluding_some_by_filter" if filter_exclude_expression else "",
    ]
    output_summary_string = "_".join(part for part in summary_parts if part)
    if output_summary_string:
        output_summary_string = "_" + output_summary_string

    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in.name))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in.name))

    out_filepath = fasta_out or os.path.join(
        out_basedir, in_fasta_basename + "_cleaned" + output_summary_string + ".fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    include_re = re.compile(filter_include_expression) if filter_include_expression else None
    exclude_re = re.compile(filter_exclude_expression) if filter_exclude_expression else None

    with open(out_filepath, "w") as handle:
        # Separate local name: the original rebound the `fasta_out` parameter.
        writer = FastaIO.FastaWriter(handle, wrap=80)
        writer.write_header()
        for record in SeqIO.parse(fasta_in.name, "fasta"):
            should_output = True
            if include_re is not None and not _record_matches(include_re, record):
                should_output = False
            if exclude_re is not None and _record_matches(exclude_re, record):
                should_output = False
            # Evaluated even for already-rejected records so that a malformed
            # date still raises exactly as it did before.
            if not _dates_within_range(record, iso_date_re, start_date, end_date):
                should_output = False
            if not should_output:
                continue

            if not bp_ranges:
                record.seq = MutableSeq(str(record.seq).upper(), DNAAlphabet())
            else:
                output_seq = MutableSeq("", DNAAlphabet())
                for start, end in bp_ranges:
                    # Ranges are 1-based; the slice end is exclusive, so only
                    # the start needs shifting. Both are clipped to the sequence.
                    start = max(start - 1, 0)
                    end = min(end, len(record))
                    output_seq += record.seq[start:end]
                record.seq = Seq(str(output_seq).upper(), DNAAlphabet())

            if ungap is not None:
                record.seq = Seq(str(record.seq).upper(), DNAAlphabet()).ungap(ungap)

            # Set the id to the description, which is the ID in the case of
            # GISAID, and remove the description. Strings are immutable, so
            # no copy is needed before replace().
            record.id = record.description.replace(" ", CHARACTER_TO_USE)
            record.description = ""
            writer.write_record(record)
Ejemplo n.º 3
0
def genome_generator():
    """Build the fixed genome used by the tests.

    Returns a MutableSeq holding "1234" with a fresh TestAlphabet instance.
    """
    alphabet = TestAlphabet()
    return MutableSeq("1234", alphabet)
Ejemplo n.º 4
0
 def test_reverse_complement_mutable_seq(self):
     """Reverse-complement a SeqRecord wrapping MutableSeq("ACTG"); expect "CAGT"."""
     record = SeqRecord(MutableSeq("ACTG"))
     flipped = record.reverse_complement()
     self.assertEqual("CAGT", str(flipped.seq))
Ejemplo n.º 5
0
 def setUp(self):
     """Create the Organism fixture from a "1111" genome and test_fitness."""
     self.organism = Organism(MutableSeq("1111", TestAlphabet()), test_fitness)
Ejemplo n.º 6
0
# Excerpt: update coverage limits, load a FASTA into editable sequences and
# start reading a bedgraph file.
# NOTE(review): covInfo, lowerCov, maskingInfo, strainName and
# fastaToChangeName are all defined outside this excerpt.
lowerCheck = int(covInfo[1])
upperCov = int(float(covInfo[2]))
# Keep the smaller of the existing lower limit and this strain's value.
if lowerCheck < lowerCov:
    #print('Lower < 10')
    lowerCov = lowerCheck
# Append one tab-separated summary line for this strain.
maskingInfo = maskingInfo + strainName + ":\tlowLimit=" + str(
    lowerCov) + "\tupperLimit=" + str(upperCov) + "\n"
strainDict = {}
genomeDict = {}
#fastaToChangeName = strainName+".fasta"
# NOTE(review): opened without a `with` block and not closed in this excerpt.
fastaToChange = open(fastaToChangeName, 'r')
# strainDict keeps the parsed sequences as-is; genomeDict keeps MutableSeq
# copies keyed by the record id as a string.
for seq_record in SeqIO.parse(fastaToChange, "fasta"):
    strainDict[seq_record.id] = seq_record.seq
    idStr = str(seq_record.id)
    seqStr = str(seq_record.seq)
    genomeDict[idStr] = MutableSeq(seqStr, IUPAC.IUPACAmbiguousDNA())
# Total the "N" characters already present across all sequences.
Ncount = 0
for key in genomeDict:
    Ncount = Ncount + genomeDict[key].count("N")
# Counters initialised here; none of them is updated within this excerpt.
lenToMaskUpper = 0
lenToMaskLower = 0
existingN = 0
mito = 0
strainBed = strainName + ".bedgraph"
# NOTE(review): this handle is also opened without `with` and not closed here.
bed = open(strainBed, 'r')
lines = bed.readlines()
for line in lines:
    currentLine = line.strip('\n')
    # Whitespace-separated columns; the first is stored as chrom.
    info = currentLine.split()
    chrom = info[0]
    # Second column minus one -- presumably converting to a 0-based start;
    # TODO confirm against the bedgraph producer.
    chromstart = int(info[1]) - 1
Ejemplo n.º 7
0
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" +
    "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" +
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA", generic_dna)

# NOTE(review): `gene` is built by a statement that starts before this excerpt.
# Translate with the "Bacterial" table, first plainly and then with cds=True.
print(gene.translate(table="Bacterial"))
print(gene.translate(table="Bacterial", cds=True))

## View the codon tables
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
# Looked up by numeric id rather than by name.
mito_table = CodonTable.unambiguous_dna_by_id[2]

print(standard_table)
print(mito_table.start_codons)
print(mito_table.stop_codons)
print(mito_table.forward_table["ACG"])

## Mutable sequence objects
from Bio.Seq import MutableSeq
# NOTE(review): `IUPAC` is not imported within this excerpt.
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",
                         IUPAC.unambiguous_dna)
print(mutable_seq)
# Edit in place: overwrite index 5, call remove("T"), then reverse.
mutable_seq[5] = "C"
print(mutable_seq)
mutable_seq.remove("T")
print(mutable_seq)
mutable_seq.reverse()
print(mutable_seq)
# Convert the edited sequence back with toseq().
new_seq = mutable_seq.toseq()
print(new_seq)
Ejemplo n.º 8
0
 def test_tomutable(self):
     """Check a MutableSeq can be built from every example and compares equal to it."""
     for original in self._examples:
         converted = MutableSeq(original)
         self.assertIsInstance(converted, MutableSeq)
         self.assertEqual(converted, original)
Ejemplo n.º 9
0
    def create_clusters_from_bowtie(self):
        """
        Fill self.otu_info and self.cluster_by_otu from the OTU file and bowtie reads.

        Each line of self.otu_txt is an OTU id followed by whitespace-separated
        member ids; every member is mapped to its OTU id and the OTU gets an
        empty cluster dict. Every record from self.input_bowtie then becomes a
        single-member cluster stored under its OTU.

        In the bowtie records the 'offset' field is actually 'abundance'
        and the 'ref' field is actually the 'cycle' offset.
        """
        with open(self.otu_txt) as otu_file:
            for raw_line in otu_file:
                otuid, members = raw_line.strip().split(None, 1)
                for member in members.split():
                    self.otu_info[member] = otuid
                self.cluster_by_otu[otuid] = {}

        for rec in BowTieReader(self.input_bowtie, False):
            cid = rec['ID']
            otuid = self.otu_info[cid]
            read_len = len(rec['seq'])
            sequence = MutableSeq(rec['seq'])
            abundance = int(rec['offset'])
            # Quality characters are converted with an offset of 33.
            qualities = [ord(ch) - 33 for ch in rec['qual']]
            first_cycle = int(rec['ref'])
            cluster = {
                'dirty': True,
                'cids': [cid],
                'len': read_len,
                'seq': sequence,
                'size': abundance,
                'qual': qualities,
                'cycle': range(first_cycle, first_cycle + read_len),
            }
            self.cluster_by_otu[otuid][cid] = cluster
Ejemplo n.º 10
0
def parse_vcf(varfile):
    """Yield (position, alternatives) for each data row of a tab-separated VCF.

    Rows whose first field starts with "#" are treated as header lines and
    skipped, as are blank rows.

    Args:
        varfile: path of the VCF file to read.

    Yields:
        (pos, var): ``pos`` is the second column minus one (0-based index);
        ``var`` is the fifth column split on commas.
    """
    # `with` closes the file once the generator is exhausted or closed; the
    # original left the handle open.
    with open(varfile) as handle:
        for line in csv.reader(handle, "excel-tab"):
            # Blank lines parse to an empty row; indexing them used to raise
            # IndexError.
            if not line or line[0].startswith("#"):
                continue

            pos = int(line[1]) - 1
            var = line[4].split(',')

            yield pos, var


# Driver: for each FASTA record (argv[1]) apply the single-base substitutions
# from the VCF (argv[2]) and write the edited record to stdout as FASTA.
# Progress messages go to stderr using Python 2 `print >>` syntax.
# NOTE(review): parse_vcf yields only (pos, var), so every VCF row is applied
# to every record regardless of its chromosome column.
for seq_record in SeqIO.parse(sys.argv[1], 'fasta'):
    print >> sys.stderr, "Seq ID = %s, Length = %d" % \
                                        (seq_record.id, len(seq_record))
    # Editable copy of the sequence.
    seq = MutableSeq(str(seq_record.seq))

    # Number of substitutions applied to this record.
    n = 0
    for pos, var in parse_vcf(sys.argv[2]):
        # if (len(var) > 2) or (len(var[0]) > 1):
        # continue
        # Skip rows with more than one alternative or an alternative longer
        # than one character; only simple substitutions are applied.
        if (len(var) > 1) or (len(var[0]) > 1):
            continue
        else:
            seq[pos] = var[0]
            n += 1

    SeqIO.write(SeqRecord(Seq(str(seq)), id=seq_record.id), sys.stdout,
                'fasta')

    print >> sys.stderr, "Total variants = %d" % n
Ejemplo n.º 11
0
class StringMethodTests(unittest.TestCase):
    """Compare string-style methods of Seq-like objects with plain str behaviour."""

    # Shared fixtures. The order matters: the count_overlap tests pair this
    # list positionally with their own lists of expected values.
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT"),
        Seq("ACGUGGGGU"),
        Seq("GG"),
        Seq("A"),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, character="N"),
        UnknownSeq(12, character="N"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Append a MutableSeq version of every non-UnknownSeq example. The loop
    # iterates over a copy ([:]) because the list is extended while looping.
    for seq in _examples[:]:
        if not isinstance(seq, UnknownSeq):
            _examples.append(MutableSeq(seq))
    # Start/end arguments tried by the tests, including out-of-range values
    # and None.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self, method_name, start_end=False):
        """Check the named method gives the same answers as the str method.

        Every pair of examples that both provide ``method_name`` is tried,
        passing the second example both as a plain string and as the object
        itself. A ValueError raised on either side is treated as the result,
        so both sides must raise or both must return the same value. With
        ``start_end=True`` the calls are repeated with every start value and
        every start/end pair from ``_start_end_values``.
        """
        self.assertIsInstance(method_name, str)

        def outcome(target, *args):
            # The call's return value, or the ValueError class if it raised.
            try:
                return getattr(target, method_name)(*args)
            except ValueError:
                return ValueError

        # e.g. MutableSeq does not support transcribe
        candidates = [ex for ex in self._examples if hasattr(ex, method_name)]

        for seq_a in candidates:
            text_a = str(seq_a)
            for seq_b in candidates:
                text_b = str(seq_b)

                # Argument given as a plain string ...
                self.assertEqual(
                    outcome(seq_a, text_b),
                    outcome(text_a, text_b),
                    "%r.%s(%r)" % (seq_a, method_name, text_b))
                # ... and as the sequence object itself.
                self.assertEqual(
                    outcome(seq_a, seq_b),
                    outcome(text_a, text_b),
                    "%r.%s(%r)" % (seq_a, method_name, seq_b))

                if not start_end:
                    continue

                for start in self._start_end_values:
                    self.assertEqual(
                        outcome(seq_a, text_b, start),
                        outcome(text_a, text_b, start),
                        "%r.%s(%r, %s)" % (seq_a, method_name, text_b, start))

                    for end in self._start_end_values:
                        self.assertEqual(
                            outcome(seq_a, text_b, start, end),
                            outcome(text_a, text_b, start, end),
                            "%r.%s(%r, %s, %s)" %
                            (seq_a, method_name, text_b, start, end),
                        )

    def test_str_count(self):
        """Check count() agrees with str.count and rejects non-string arguments."""
        self._test_method("count", start_end=True)
        self.assertEqual(Seq("AC777GT").count("7"), 3)
        for bad_arg in (7, None):
            self.assertRaises(TypeError, Seq("AC777GT").count, bad_arg)

    def test_count_overlap(self):
        """Check count_overlap counts overlapping hits and rejects non-strings."""
        text = "AC777GT"
        # Plain count() does not count overlapping matches.
        self.assertEqual(Seq(text).count("77"), 1)
        for pattern, hits in (("77", 2), ("7", 3)):
            self.assertEqual(Seq(text).count_overlap(pattern), hits)
        for bad_arg in (7, None):
            self.assertRaises(TypeError, Seq(text).count_overlap, bad_arg)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG.

        Each entry of ``expected`` is the number of (overlapping) "GG" hits in
        the example at the same position in ``self._examples``.
        """
        # Testing with self._examples
        expected = [
            3,
            3,
            1,
            0,  # Seq() Tests
            0,
            0,
            0,
            0,
            0,
            0,  # UnknownSeq() Tests
            3,
            3,
            1,
            0,  # MutableSeq() Tests
        ]

        # A unittest assertion rather than a bare assert: it is not stripped
        # under -O, and without it zip() below would silently truncate.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check count_overlap("GG", ...) over a range of start and end arguments."""
        # Seq() and MutableSeq(): (start, end, expected hits)
        testing_seq = "GTAGGGGAG"
        for start, end, exp in (
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ):
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(testing_seq).count_overlap("GG", start, end), exp)

        # Seq() and MutableSeq() with a more heterogeneous sequence:
        # (extra positional arguments, expected hits)
        mixed = "GGGTGGTAGGG"
        for bounds, exp in (((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)):
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("GG", *bounds), exp)
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq() of length 12: (character, start, end, expected hits)
        for char, start, end, exp in (
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ):
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("GG", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq() of length 7, including unusual edge cases:
        # (search term, start, end, expected hits)
        for substr, start, end, exp in (
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ):
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN.

        Each entry of ``expected`` is the number of (overlapping) "NN" hits in
        the example at the same position in ``self._examples``.
        """
        # Testing with self._examples
        expected = [
            0,
            0,
            0,
            0,  # Seq() Tests
            0,
            0,
            0,
            11,
            0,
            0,  # UnknownSeq() Tests
            0,
            0,
            0,
            0,  # MutableSeq() Tests
        ]

        # A unittest assertion rather than a bare assert: it is not stripped
        # under -O, and without it zip() below would silently truncate.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check count_overlap("NN", ...) over a range of start and end arguments."""
        # Seq() and MutableSeq(): (start, end, expected hits)
        testing_seq = "GTAGGGGAG"
        for start, end, exp in (
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ):
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(testing_seq).count_overlap("NN", start, end), exp)

        # Seq() and MutableSeq() with a more heterogeneous sequence:
        # (extra positional arguments, expected hits)
        mixed = "GGGTGGTAGGG"
        for bounds, exp in (((), 0), ((2, 8), 0), ((-11, 6), 0), ((7, 2), 0)):
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("NN", *bounds), exp)
        self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

        # UnknownSeq() of length 12: (character, start, end, expected hits)
        for char, start, end, exp in (
            ("N", 1, 7, 5),
            ("N", 1, 7, 5),
            ("N", -4, None, 3),
            ("N", -4, None, 3),
            ("X", 1, 7, 0),
        ):
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("NN", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq() of length 7, including unusual edge cases:
        # (search term, start, end, expected hits)
        for substr, start, end, exp in (
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ):
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Check find() agrees with str.find and rejects non-string arguments."""
        self._test_method("find", start_end=True)
        self.assertEqual(Seq("AC7GT").find("7"), 2)
        for text, bad_arg in (("AC7GT", 7), ("ACGT", None)):
            self.assertRaises(TypeError, Seq(text).find, bad_arg)

    def test_str_rfind(self):
        """Check rfind() agrees with str.rfind and rejects non-string arguments."""
        self._test_method("rfind", start_end=True)
        self.assertEqual(Seq("AC7GT").rfind("7"), 2)
        for text, bad_arg in (("AC7GT", 7), ("ACGT", None)):
            self.assertRaises(TypeError, Seq(text).rfind, bad_arg)

    def test_str_index(self):
        """Check index() agrees with str.index for both Seq and MutableSeq."""
        self._test_method("index", start_end=True)
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").index("7"), 2)
            self.assertRaises(TypeError, seq_type("AC7GT").index, 7)
            self.assertRaises(TypeError, seq_type("ACGT").index, None)

    def test_str_rindex(self):
        """Check rindex() agrees with str.rindex for both Seq and MutableSeq."""
        self._test_method("rindex", start_end=True)
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").rindex("7"), 2)
            self.assertRaises(TypeError, seq_type("AC7GT").rindex, 7)
            self.assertRaises(TypeError, seq_type("ACGT").rindex, None)

    def test_str_startswith(self):
        """Check startswith() agrees with str.startswith, including tuple arguments."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        for seq_type in (Seq, MutableSeq):
            self.assertRaises(TypeError, seq_type("ACGT").startswith, None)

        # Now check with a tuple of sub sequences
        for seq in self._examples:
            text = str(seq)
            # Two-letter slices taken at every third position.
            prefixes = tuple(
                seq[pos:pos + 2] for pos in range(0, len(seq) - 2, 3))
            prefix_strs = tuple(str(prefix) for prefix in prefixes)

            self.assertEqual(
                text.startswith(prefix_strs), seq.startswith(prefixes))
            self.assertEqual(
                text.startswith(prefix_strs),
                seq.startswith(prefix_strs))  # strings!
            self.assertEqual(
                text.startswith(prefix_strs, 3), seq.startswith(prefixes, 3))
            self.assertEqual(
                text.startswith(prefix_strs, 2, 6),
                seq.startswith(prefixes, 2, 6),
            )

    def test_str_endswith(self):
        """Check endswith() agrees with str.endswith, including tuple arguments.

        The tuple checks pass the candidate suffixes both as sequence slices
        and as plain strings, with and without start/end arguments.
        """
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        self.assertRaises(TypeError, Seq("ACGT").endswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            subs = tuple(example1[start:start + 2]
                         for start in range(0,
                                            len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # Previously this assertion called startswith (copy-paste from the
            # startswith test), so endswith with plain strings was untested.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Check strip() agrees with str.strip, plus MutableSeq's inplace option."""
        self._test_method("strip")
        padded = " ACGT "
        immutable = Seq(padded)
        mutable = MutableSeq(padded)
        # Without inplace, neither object is modified.
        for obj in (immutable, mutable):
            self.assertEqual(obj.strip(), "ACGT")
            self.assertRaises(TypeError, obj.strip, 7)
            self.assertEqual(obj, padded)
        # With inplace=True the MutableSeq itself is changed.
        self.assertEqual(mutable.strip(inplace=True), "ACGT")
        self.assertEqual(mutable, "ACGT")

    def test_str_lstrip(self):
        """Check lstrip() agrees with str.lstrip, plus MutableSeq's inplace option."""
        self._test_method("lstrip")
        padded = " ACGT "
        immutable = Seq(padded)
        mutable = MutableSeq(padded)
        # Without inplace, neither object is modified.
        for obj in (immutable, mutable):
            self.assertEqual(obj.lstrip(), "ACGT ")
            self.assertRaises(TypeError, obj.lstrip, 7)
            self.assertEqual(obj, padded)
        # With inplace=True the MutableSeq itself is changed.
        self.assertEqual(mutable.lstrip(inplace=True), "ACGT ")
        self.assertEqual(mutable, "ACGT ")

    def test_str_rstrip(self):
        """Check rstrip() agrees with str.rstrip, plus MutableSeq's inplace option."""
        self._test_method("rstrip")
        padded = " ACGT "
        immutable = Seq(padded)
        mutable = MutableSeq(padded)
        # Without inplace, neither object is modified.
        for obj in (immutable, mutable):
            self.assertEqual(obj.rstrip(), " ACGT")
            self.assertRaises(TypeError, obj.rstrip, 7)
            self.assertEqual(obj, padded)
        # With inplace=True the MutableSeq itself is changed.
        self.assertEqual(mutable.rstrip(inplace=True), " ACGT")
        self.assertEqual(mutable, " ACGT")

    def test_str_split(self):
        """Check split() agrees with str.split for both Seq and MutableSeq."""
        self._test_method("split")
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").split("7"), "AC7GT".split("7"))
            self.assertRaises(TypeError, seq_type("AC7GT").split, 7)

    def test_str_rsplit(self):
        """Check rsplit() agrees with str.rsplit for both Seq and MutableSeq."""
        self._test_method("rsplit")
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").rsplit("7"), "AC7GT".rsplit("7"))
            self.assertRaises(TypeError, seq_type("AC7GT").rsplit, 7)

    def test_str_length(self):
        """Check len() of each example equals the length of its string form."""
        for seq in self._examples:
            self.assertEqual(len(seq), len(str(seq)))

    def test_str_upper(self):
        """Check upper() of each example equals str.upper of its string form."""
        for seq in self._examples:
            self.assertEqual(seq.upper(), str(seq).upper())

    def test_str_lower(self):
        """Check lower() of each example equals str.lower of its string form."""
        for seq in self._examples:
            self.assertEqual(seq.lower(), str(seq).lower())

    def test_str_encode(self):
        """Check bytes() of each example equals its string encoded as ASCII."""
        for seq in self._examples:
            ascii_bytes = str(seq).encode("ascii")
            self.assertEqual(bytes(seq), ascii_bytes)

    def test_str_hash(self):
        """Check hash() of each example equals the hash of its string form.

        MutableSeq examples are skipped.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter("ignore", BiopythonWarning)
                str_hash = hash(str(example1))
                obj_hash = hash(example1)
                # The message pairs each hash with the value that was hashed;
                # the original showed id(example1) next to the string's hash.
                self.assertEqual(
                    str_hash,
                    obj_hash,
                    "Hash mismatch, %r for %r vs %r for %r" %
                    (str_hash, str(example1), obj_hash, example1),
                )

    def test_str_comparison(self):
        """Check the six rich comparisons agree with comparing the plain strings."""
        # (symbol used in the failure message, comparison to apply)
        comparisons = (
            ("==", lambda a, b: a == b),
            ("!=", lambda a, b: a != b),
            ("<", lambda a, b: a < b),
            ("<=", lambda a, b: a <= b),
            (">", lambda a, b: a > b),
            (">=", lambda a, b: a >= b),
        )
        for left in self._examples:
            for right in self._examples:
                with warnings.catch_warnings():
                    for symbol, compare in comparisons:
                        self.assertEqual(
                            compare(str(left), str(right)),
                            compare(left, right),
                            ("Checking %r " + symbol + " %r") % (left, right),
                        )

    def test_str_getitem(self):
        """Check indexing and slicing give the same results as on a plain string."""
        bounds = self._start_end_values
        for seq in self._examples:
            text = str(seq)
            for lo in bounds:
                # Single-item access only for indices checked against the length.
                if lo is not None and abs(lo) < len(seq):
                    self.assertEqual(seq[lo], text[lo])
                self.assertEqual(seq[:lo], text[:lo])
                self.assertEqual(seq[lo:], text[lo:])
                for hi in bounds:
                    self.assertEqual(seq[lo:hi], text[lo:hi])
                    for step in range(-3, 4):
                        if step != 0:
                            self.assertEqual(seq[lo:hi:step],
                                             text[lo:hi:step])
                            continue
                        # A zero step must fail with the same message str uses.
                        with self.assertRaises(ValueError) as caught:
                            seq[lo:hi:step]
                        self.assertEqual(str(caught.exception),
                                         "slice step cannot be zero")

    def test_tomutable(self):
        """Check each example can be turned into an equal MutableSeq."""
        for source in self._examples:
            mutable = MutableSeq(source)
            self.assertIsInstance(mutable, MutableSeq)
            self.assertEqual(mutable, source)

    def test_toseq(self):
        """Check each example can be turned into an equal Seq."""
        for source in self._examples:
            immutable = Seq(source)
            self.assertIsInstance(immutable, Seq)
            self.assertEqual(immutable, source)

    def test_the_complement(self):
        """Check obj.complement() against a string translation table.

        MutableSeq examples are skipped. Sequences containing "U" or "u" are
        compared using the RNA table; everything else uses the DNA table.
        """
        rna_table = str.maketrans("ACGUacgu", "UGCAugca")
        # Default to DNA, e.g. complement("A") -> "T" not "U"
        dna_table = str.maketrans("ACGTacgt", "TGCAtgca")
        for seq in self._examples:
            if isinstance(seq, MutableSeq):
                continue
            complemented = seq.complement()
            text = str(seq)
            looks_like_rna = "U" in text or "u" in text
            table = rna_table if looks_like_rna else dna_table
            self.assertEqual(text.translate(table), complemented)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() against a reversed string translation.

        MutableSeq examples are skipped. Sequences containing "U" or "u" are
        compared using the RNA table; everything else uses the DNA table.
        """
        rna_table = str.maketrans("ACGUacgu", "UGCAugca")
        # Defaults to DNA, so reverse_complement("A") --> "T" not "U"
        dna_table = str.maketrans("ACGTacgt", "TGCAtgca")
        for seq in self._examples:
            if isinstance(seq, MutableSeq):
                continue
            rev_comp = seq.reverse_complement()
            text = str(seq)
            looks_like_rna = "U" in text or "u" in text
            table = rna_table if looks_like_rna else dna_table
            self.assertEqual(text.translate(table)[::-1], rev_comp)

    def test_the_transcription(self):
        """Check obj.transcribe() replaces T/t with U/u.

        MutableSeq examples are skipped. transcribe() is called on every other
        example, but the result is only compared when the sequence length is a
        multiple of three.
        """
        for seq in self._examples:
            if isinstance(seq, MutableSeq):
                continue
            rna = seq.transcribe()
            text = str(seq)
            # TODO - Check for or silence the expected warning?
            if len(text) % 3 == 0:
                self.assertEqual(text.replace("T", "U").replace("t", "u"), rna)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() replaces U/u with T/t.

        MutableSeq examples are skipped.
        """
        for seq in self._examples:
            if isinstance(seq, MutableSeq):
                continue
            dna = seq.back_transcribe()
            text = str(seq)
            self.assertEqual(text.replace("U", "T").replace("u", "t"), dna)

    def test_the_translate(self):
        """Check obj.translate() accepts the table positionally or by keyword.

        Only examples whose length is a multiple of three are exercised.
        """
        for seq in self._examples:
            # TODO - Check for or silence the expected warning on other lengths?
            if len(seq) % 3 == 0:
                # The default-argument call only has to complete without raising.
                seq.translate()
                # Try with positional vs named argument:
                self.assertEqual(seq.translate(11), seq.translate(table=11))

            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons.

        The same expectations are applied to a Seq and to a MutableSeq built
        from one nucleotide string, then to the module-level translate()
        function, and finally to single codons containing ambiguous and
        mixed-case letters.
        """
        misc_stops = "TAATAGTGAAGAAGG"
        # (expected protein, value passed as table=...)
        keyword_tables = [
            ("**W**", 2),
            ("**WRR", "Yeast Mitochondrial"),
            ("**WSS", 5),
            ("**WSS", 9),
            ("**CRR", "Euplotid Nuclear"),
            ("***RR", 11),
            ("***RR", "11"),
            ("***RR", "Bacterial"),
            ("**GRR", 25),
        ]
        custom_tables = [
            ("O*ORR", special_table),
            ("*QWRR", Chilodonella_uncinata_table),
        ]
        for seq_class in (Seq, MutableSeq):
            nuc = seq_class(misc_stops)
            self.assertEqual("***RR", nuc.translate())
            # Table passed positionally.
            for table in (1, "SGC0"):
                self.assertEqual("***RR", nuc.translate(table))
            for expected, table in keyword_tables:
                self.assertEqual(expected, nuc.translate(table=table))
            self.assertEqual("", nuc.translate(to_stop=True))
            for expected, table in custom_tables:
                self.assertEqual(expected, nuc.translate(table=table))

        # These test the Bio.Seq.translate() function - move these?:
        # (nuc is the MutableSeq left over from the last pass of the loop.)
        text = str(nuc)
        function_cases = [
            ("*QWRR", {"table": Chilodonella_uncinata_table}),
            ("O*ORR", {"table": special_table}),
            ("", {"to_stop": True}),
            ("***RR", {"table": "Bacterial"}),
            ("***RR", {"table": "11"}),
            ("***RR", {"table": 11}),
            ("**W**", {"table": 2}),
        ]
        for expected, kwargs in function_cases:
            self.assertEqual(expected, translate(text, **kwargs))

        # Single codons: upper, mixed and lower case.
        codon_cases = [
            ("TAT", "Y"), ("TAR", "*"), ("TAN", "X"), ("NNN", "X"),
            ("TAt", "Y"), ("TaR", "*"), ("TaN", "X"), ("nnN", "X"),
            ("tat", "Y"), ("tar", "*"), ("tan", "X"), ("nnn", "X"),
        ]
        for codon, amino in codon_cases:
            self.assertEqual(Seq(codon).translate(), amino)

    def test_the_translation_of_invalid_codons(self):
        """Check translate() raises TranslationError for invalid codons.

        Each codon is tried as a Seq and then as a MutableSeq.
        """
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            msg = "Translating %s should fail" % codon
            for seq_class in (Seq, MutableSeq):
                nuc = seq_class(codon)
                with self.assertRaises(TranslationError, msg=msg):
                    nuc.translate()

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons.

        For every three-letter combination of ambiguity codes (excluding "X"),
        the translation of the ambiguous codon is compared with the set of
        translations of all the concrete codons it expands to.
        """
        # Ambiguous protein letters and the residues each may stand for.
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        protein_ambiguity = {
            "Z": set("EQ"),
            "B": set("DN"),
            "J": set("LI"),
        }
        for ambig_values in (ambiguous_dna_values, ambiguous_rna_values):
            letters = set(ambig_values.keys())
            letters.remove("X")
            for c1 in letters:
                for c2 in letters:
                    for c3 in letters:
                        codon = c1 + c2 + c3
                        values = {
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3]
                        }
                        t = Seq(codon).translate()
                        if t == "X":
                            # "X" is only acceptable when the expansions
                            # genuinely disagree.
                            self.assertGreater(
                                len(values),
                                1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, t, ",".join(values)),
                            )
                            continue
                        expected = protein_ambiguity.get(str(t))
                        if expected is None:
                            # Any other letter (including "*") must be the
                            # single shared translation.
                            expected = set(t)
                        self.assertEqual(values, expected)

    def test_init_typeerror(self):
        """Check Seq() rejects tuple, list, int and float input with TypeError."""
        rejected = (
            ("A", "C", "G", "T"),
            ["A", "C", "G", "T"],
            1,
            1.0,
        )
        for bad_value in rejected:
            with self.assertRaises(TypeError):
                Seq(bad_value)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq() rejects tuple, list, int and float input with TypeError."""
        rejected = (
            ("A", "C", "G", "T"),
            ["A", "C", "G", "T"],
            1,
            1.0,
        )
        for bad_value in rejected:
            with self.assertRaises(TypeError):
                MutableSeq(bad_value)

    def test_join_Seq_TypeError(self):
        """Check Seq.join() raises TypeError for unsupported arguments.

        Covers a non-iterable argument and an iterable that contains an item
        of a non-accepted type.
        """
        spacer = Seq("NNNNN")
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            with self.assertRaises(TypeError):
                spacer.join(bad_argument)

    def test_join_UnknownSeq_TypeError_iter(self):
        """Check UnknownSeq.join() raises TypeError for unsupported arguments.

        Covers a non-iterable argument and an iterable that contains an item
        of a non-accepted type.
        """
        spacer = UnknownSeq(5, character="-")
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            with self.assertRaises(TypeError):
                spacer.join(bad_argument)

    def test_join_MutableSeq_TypeError_iter(self):
        """Check MutableSeq.join() raises TypeError for unsupported arguments.

        Covers a non-iterable argument and an iterable that contains an item
        of a non-accepted type.
        """
        spacer = MutableSeq("MMMMM")
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            with self.assertRaises(TypeError):
                spacer.join(bad_argument)

    def test_join_Seq(self):
        """Check Seq.join() inserts the spacer between the joined items."""
        five_n = Seq("NNNNN")
        self.assertEqual("N" * 15, five_n.join([Seq("NNNNN"), Seq("NNNNN")]))

        empty = Seq("")
        plain = ["ATG", "ATG", "ATG", "ATG"]
        mixed = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings with an empty spacer behave like "".join().
        self.assertEqual(empty.join(plain), "".join(plain))

        for spacer in (empty, Seq("NNNNN"), Seq("GGG")):
            glue = str(spacer)
            self.assertEqual(spacer.join(mixed), glue.join(plain))
            # A single sequence argument is joined letter by letter.
            for target in plain + mixed:
                self.assertEqual(glue.join(str(target)),
                                 str(spacer.join(target)))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join() inserts the spacer between the joined items."""
        dashes = UnknownSeq(5, character="-")
        nothing = UnknownSeq(0, character="-")

        two_unknowns = [
            UnknownSeq(5, character="-"),
            UnknownSeq(5, character="-"),
        ]
        self.assertEqual("-" * 15, dashes.join(two_unknowns))
        seq_then_unknown = [Seq("NNNNN"), UnknownSeq(5, character="-")]
        self.assertEqual("N" * 5 + "-" * 10, dashes.join(seq_then_unknown))

        plain = ["ATG", "ATG", "ATG", "ATG"]
        mixed = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings with the zero-length spacer behave like "".join().
        self.assertEqual(nothing.join(plain), "".join(plain))

        for spacer in (dashes, nothing):
            glue = str(spacer)
            self.assertEqual(spacer.join(mixed), glue.join(plain))
            # A single sequence argument is joined letter by letter.
            for target in plain + mixed:
                self.assertEqual(glue.join(str(target)),
                                 str(spacer.join(target)))

    def test_join_MutableSeq_mixed(self):
        """Check MutableSeq objects can be joined."""
        spacer = MutableSeq("NNNNN")
        # Joining two MutableSeq objects with a MutableSeq spacer.
        self.assertEqual(
            "N" * 15,
            spacer.join([MutableSeq("NNNNN"),
                         MutableSeq("NNNNN")]),
        )
        # NOTE(review): the second argument below is the *result* of join(),
        # evaluated before assertRaises runs, rather than a callable followed
        # by its arguments. If the intent is to assert that join() itself
        # raises for a mixed Seq/MutableSeq list, this would need the form
        # assertRaises(TypeError, spacer.join, [...]). Confirm the intended
        # library behaviour before changing it.
        self.assertRaises(
            TypeError,
            spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]),
        )

    def test_join_Seq_with_file(self):
        """Check Seq.join() on sequences parsed from a FASTA file.

        Joining the parsed sequences must match str.join() on their string
        forms, for a five-letter spacer and for an empty spacer. Passing the
        record iterator itself (records rather than sequences) must raise
        TypeError.
        """
        path = "Fasta/f003"
        seqs = [rec.seq for rec in SeqIO.parse(path, "fasta")]
        texts = [str(item) for item in seqs]

        # Non-empty spacer first, then the empty one.
        spacers = (Seq("NNNNN"), Seq(""))
        joined = [sp.join(seqs) for sp in spacers]
        expected = [str(sp).join(texts) for sp in spacers]

        for got, want in zip(joined, expected):
            self.assertEqual(got, want)
        with self.assertRaises(TypeError):
            spacers[0].join(SeqIO.parse(path, "fasta"))

    def test_join_UnknownSeq_with_file(self):
        """Check UnknownSeq.join() on sequences parsed from a FASTA file.

        Joining the parsed sequences must match str.join() on their string
        forms, for a zero-length spacer and for a five-character spacer.
        Passing the record iterator itself (records rather than sequences)
        must raise TypeError.
        """
        path = "Fasta/f003"
        seqs = [rec.seq for rec in SeqIO.parse(path, "fasta")]
        texts = [str(item) for item in seqs]

        # Zero-length spacer first, then the five-character one.
        spacers = (UnknownSeq(0, character="-"), UnknownSeq(5, character="-"))
        joined = [sp.join(seqs) for sp in spacers]
        expected = [str(sp).join(texts) for sp in spacers]

        for got, want in zip(joined, expected):
            self.assertEqual(got, want)
        with self.assertRaises(TypeError):
            spacers[0].join(SeqIO.parse(path, "fasta"))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join() inserts the spacer between the joined items."""
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty = MutableSeq("")
        plain = ["ATG", "ATG", "ATG", "ATG"]
        mixed = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings with an empty spacer behave like "".join().
        self.assertEqual(empty.join(plain), "".join(plain))

        for spacer in (empty, MutableSeq("NNNNN"), MutableSeq("GGG")):
            self.assertEqual(spacer.join(mixed), str(spacer).join(plain))

    def test_join_MutableSeq_with_file(self):
        """Check MutableSeq.join() on sequences parsed from a FASTA file.

        Joining the parsed sequences must match str.join() on their string
        forms, for a five-letter spacer and for an empty spacer. Passing the
        record iterator itself (records rather than sequences) must raise
        TypeError.
        """
        path = "Fasta/f003"
        seqs = [rec.seq for rec in SeqIO.parse(path, "fasta")]
        texts = [str(item) for item in seqs]

        # Non-empty spacer first, then the empty one.
        spacers = (MutableSeq("NNNNN"), MutableSeq(""))
        joined = [sp.join(seqs) for sp in spacers]
        expected = [str(sp).join(texts) for sp in spacers]

        for got, want in zip(joined, expected):
            self.assertEqual(got, want)
        with self.assertRaises(TypeError):
            spacers[0].join(SeqIO.parse(path, "fasta"))

    def test_equality(self):
        """Test equality between sequence objects and str, int and None."""
        # Seq and MutableSeq share the same expectations: equal to a string
        # with the same letters, never equal to an int or to None.
        for seq_class in (Seq, MutableSeq):
            self.assertEqual(seq_class("6"), "6")
            self.assertNotEqual(seq_class("6"), 6)
            self.assertEqual(seq_class(""), "")
            self.assertNotEqual(seq_class(""), None)
            self.assertEqual(seq_class("None"), "None")
            self.assertNotEqual(seq_class("None"), None)

        one_six = UnknownSeq(1, character="6")
        self.assertEqual(one_six, "6")
        self.assertNotEqual(one_six, 6)
        zero_length = UnknownSeq(0)
        self.assertEqual(zero_length, "")
        self.assertNotEqual(zero_length, None)
Ejemplo n.º 12
0
 def setUp(self):
     """Create the organism and schema fixtures shared by the tests."""
     test_alphabet = TestAlphabet()
     genome = MutableSeq("11*22*33*", test_alphabet)
     self.organism = Organism(genome, test_fitness)
     self.ambig_info = Schema(test_alphabet.alphabet_matches)
Ejemplo n.º 13
0
# Load the regions to mask from a tab-separated file. The first three columns
# of each row are read as: sequence id, start, stop.
with open(args.rc_regions) as infile1:
	RCseqs = csv.reader(infile1, delimiter='\t')
	for row in RCseqs:
		seq = row[0]
		RCstart = row[1]
		RCstop = row[2]
		# Start/stop are kept as strings here and converted with int() later.
		seq_list.append([seq, RCstart, RCstop])

# Read fasta file:
fasta_seqs = list(SeqIO.parse(args.fasta, "fasta"))

# Mask recombinant regions:
masked_seq = []

for i in fasta_seqs:
    # Work on a mutable copy so slices can be overwritten in place.
    seq = MutableSeq(str(i.seq))
    for j in seq_list:
        if i.id == j[0]: # j[0] is the sequence id in recombinant regions list
            start_mask = int(j[1]) - 1 # convert the 1-based start to a 0-based index
            end_mask = int(j[2]) # slice end is exclusive, so the 1-based stop is used as-is
            len_mask = end_mask - start_mask
            # NOTE(review): presumably a stop beyond the sequence end would
            # make the replacement longer than the slice it replaces - confirm
            # whether inputs are guaranteed to be in range.
            seq[start_mask:end_mask] = args.maskchar * len_mask

    # Store the masked sequence under the original id with an empty description.
    masked_seq.append(SeqRecord(Seq(str(seq)), i.id, description=""))

# Report how many mask characters each output sequence contains (this counts
# every occurrence of the character, not only the ones written above).
for i in masked_seq:
    print("Number of characters masked in sequence " + i.id + ": " + str(str(i.seq).count(args.maskchar)))

# Write masked sequences to file:
SeqIO.write(masked_seq, args.out, "fasta")
Ejemplo n.º 14
0
def MutableSeqFromFile(filename, alphabet):
    """Return a MutableSeq built from the contents of a text file.

    The whole file is read, leading and trailing whitespace is stripped and
    the text is lower-cased before being wrapped in a MutableSeq.

    filename -- path of the file to read.
    alphabet -- passed through unchanged as the MutableSeq alphabet.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename) as handle:
        sequence_str = handle.read().strip()
    return MutableSeq(sequence_str.lower(), alphabet)
Ejemplo n.º 15
0
def count_one_fraction(alignment, refname, debug, start_offset, end_trail):
    """Count the mutations found in a file of pairwise reference/read alignments.

    Expected/allowed mutations are not considered here: everything is counted
    and filtering is left to the caller.

    Steps:
    1. Parse the alignment two sequences at a time (reference, then read).
    2. Trim each read to the reference.
    3. Call the DNA difference, its HGVS-style description and the protein
       difference using the helper functions.
    4. Add them to the count table and print a short summary.

    alignment    -- FASTA alignment source handed to AlignIO.parse.
    refname      -- reference name, forwarded to find_DNA_hgvs.
    debug        -- forwarded to the helper functions.
    start_offset -- forwarded to the helper functions.
    end_trail    -- forwarded to the helper functions.

    Returns a dict keyed by the protein difference. Each value is a dict with:
        'dna'      -- defaultdict(int) counting each DNA difference
        'dna_hgvs' -- defaultdict(int) counting each HGVS-style description
        'total'    -- number of reads with this protein difference

    Any exception raised by the helper functions is re-raised after a diff of
    the offending read has been printed.
    """
    # A regular dictionary: an entry is created the first time a protein
    # mutation is encountered.
    one_lane_counts = {}

    # Loop over read/reference pairs in the multiple sequence alignment.
    # Only the sequences are kept, as mutable objects (important for gap shifts).
    for pair in AlignIO.parse(alignment,
                              "fasta",
                              alphabet=IUPAC.ambiguous_dna,
                              seq_count=2):
        # both read and ref are MutableSeq
        ref = pair[0].seq.tomutable()
        read = pair[1].seq.tomutable()
        read = MutableSeq(str(read).replace('N', '.'), read.alphabet)
        readname = pair[1].id

        # trim sequencing read to reference
        ref, read = trim_read(ref, read)

        dna_errors, dna_hgvs, prot_errors = None, None, None

        try:
            dna_errors = find_DNA_diff(read, ref, debug, start_offset,
                                       end_trail)  # errors = a tuple
            dna_hgvs = find_DNA_hgvs(
                read, ref, refname, debug, start_offset,
                end_trail)  # string according to HGVS format (ish)
            prot_errors = find_protein_diff(read, ref, debug, start_offset,
                                            end_trail)
        except Exception:
            # Show what was being processed, then let the error propagate.
            if not dna_errors:
                print(dna_errors)
            print_coloured_diff(readname, read, ref, debug)
            raise

        entry = one_lane_counts.get(prot_errors)
        if entry is None:
            # First time this protein mutation is seen.
            entry = {
                'dna': defaultdict(int),
                'dna_hgvs': defaultdict(int),
                'total': 0
            }
            one_lane_counts[prot_errors] = entry
        entry['total'] += 1
        entry['dna'][dna_errors] += 1
        entry['dna_hgvs'][dna_hgvs] += 1

    # Count the protein mutations seen more often than the threshold.
    threshold = 10
    n = sum(1 for entry in one_lane_counts.values()
            if entry['total'] > threshold)

    print(
        'Found {0} total protein mutations, of which {1} have more than {2} counts'
        .format(len(one_lane_counts), n, threshold))

    return one_lane_counts
Ejemplo n.º 16
0
 def __init__(self, seqFile, format="fasta"):
     """Load every record from seqFile and append it to this container.

     seqFile -- file name or handle accepted by SeqIO.parse.
     format  -- SeqIO format name (default "fasta").

     Each record's sequence is replaced by a MutableSeq holding the same
     letters before the record is appended.
     """
     for record in SeqIO.parse(seqFile, format):
         # str() is the supported way to get the letters of a Seq; the older
         # Seq.tostring() method is deprecated in Biopython.
         record.seq = MutableSeq(str(record.seq))
         self.append(record)
Ejemplo n.º 17
0
 def test_generated(self):
     """Write unusual SeqRecord objects in several formats and read them back."""
     # TODO - Record with no identifier?
     records = [
         # Long sequence with a long description and PHRED qualities.
         SeqRecord(
             Seq("ACGT" * 500, generic_dna),
             id="Test",
             description="Long " * 500,
             letter_annotations={"phred_quality": [40, 30, 20, 10] * 500},
         ),
         # MutableSeq-backed record.
         SeqRecord(
             MutableSeq("NGGC" * 1000),
             id="Mut",
             description="very " * 1000 + "long",
             letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000},
         ),
         # UnknownSeq-backed record.
         SeqRecord(
             UnknownSeq(2000, character="N"),
             id="Unk",
             description="l" + ("o" * 1000) + "ng",
             letter_annotations={"phred_quality": [0, 1] * 1000},
         ),
         # Empty description and name.
         SeqRecord(
             Seq("ACGT" * 500),
             id="no_descr",
             description="",
             name="",
             letter_annotations={"phred_quality": [40, 50, 60, 62] * 500},
         ),
         # Empty sequence with PHRED qualities.
         SeqRecord(
             Seq("", generic_dna),
             id="empty_p",
             description="(could have been trimmed lots)",
             letter_annotations={"phred_quality": []},
         ),
         # Empty sequence with Solexa qualities.
         SeqRecord(
             Seq(""),
             id="empty_s",
             description="(could have been trimmed lots)",
             letter_annotations={"solexa_quality": []},
         ),
         # Solexa qualities including a negative score.
         SeqRecord(
             Seq("ACNN" * 500),
             id="Test_Sol",
             description="Long " * 500,
             letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500},
         ),
         # Very large quality values.
         SeqRecord(
             Seq("ACGT"),
             id="HighQual",
             description=
             "With very large qualities that even Sanger FASTQ can't hold!",
             letter_annotations={"solexa_quality": [0, 10, 100, 1000]},
         ),
     ]
     output_formats = (
         "fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual",
     )
     for fmt in output_formats:
         buffer = StringIO()
         with warnings.catch_warnings():
             # TODO - Have a Biopython defined "DataLossWarning?"
             warnings.simplefilter("ignore", BiopythonWarning)
             SeqIO.write(records, buffer, fmt)
         buffer.seek(0)
         parsed = list(SeqIO.parse(buffer, fmt))
         compare_records(records, parsed, truncation_expected(fmt))
Ejemplo n.º 18
0
	def __init__(self, seq, alphaproperty=None, insertprob=None,
		     deleteprob=None, mualphabet=None,
		     muprob=None, mupos=None, delpos=None, inpos=None,
		     verbose=False):
		"""Build a MuGen object from a reference sequence and mutation settings.

		seq           -- a str, Seq or MutableSeq. A mutable copy is stored in
		                 self.seq and its string form in self.ref.
		alphaproperty -- alphabet used only when seq is a str; defaults to
		                 Alphabet(). For Seq/MutableSeq input the alphabet is
		                 read from the sequence itself.
		insertprob, deleteprob, muprob
		              -- dicts mapping a letter to a probability, each value
		                 required to satisfy 0 <= value <= 1. Letters of the
		                 working alphabet that are missing get probability 0.
		mualphabet    -- dict mapping a single letter to the letters it may be
		                 substituted with; a letter may not map to itself.
		mupos, delpos, inpos
		              -- iterables of positions, each required to be a subset
		                 of range(len(sequence)).
		verbose       -- bool; when True, warnings are printed.

		Any CustomException raised during validation is printed and the
		process exits with status 2.

		NOTE(review): the dict arguments are iterated with iteritems(), which
		only exists on Python 2 dicts - confirm the supported Python version.
		"""
		try:
			self.occureddel = list()  # This is to keep a history of changes made to the reference
			self.occuredmu = list()  # This is necessary for writing the haplotypes in the format
			self.occuredins = list()  # of haplotyping software.
			self.inserted_allele = list()  # keeps track of the inserted allele to be able to get them back when needed!
			self.alt_allele = list()  # keeps track of the substituted
			if not isinstance(verbose, bool):
				raise CustomException("ERROR: verbose must be set to either True or False. \
Default is to False")
			else:
				self.verbose = verbose
			# Normalise the input sequence to a MutableSeq and record its alphabet.
			if isinstance(seq, str):
				if alphaproperty is None:
					if self.verbose:
						print(
							"WARNING: No alphabet type is specified for the sequence string!")
					else:
						pass
					self.alphaproperty = Alphabet()
				else:
					self.alphaproperty = alphaproperty
				self.seq = MutableSeq(seq, self.alphaproperty)
			elif isinstance(seq, Seq):
				self.alphaproperty = seq.__getattribute__(
					'alphabet')
				self.seq = seq.tomutable()
			elif isinstance(seq, MutableSeq):
				self.alphaproperty = seq.__getattribute__(
					'alphabet')
				self.seq = copy.deepcopy(seq)
			else:
				raise CustomException("ERROR: Should provide a Seq or MutableSeq object, \n \
or a string sequence!")
			# The working alphabet is the set of letters that occur in the sequence.
			self.alphabet = set(str(self.seq))
			self.ref = str(self.seq)
			# Explicit positions: each list must lie within range(len(self.ref)).
			if not delpos:
				self.delpos = []
			else:
				if set(delpos).issubset(
					set(range(len(self.ref)))):
					self.delpos = list(
						delpos)  # Deletion by specifying the positions
				else:
					raise CustomException(
						"ERROR: Deletion positions exceed the range of the reference or are not positive integers!")
			if not inpos:
				self.inpos = []
			else:
				if set(inpos).issubset(
					set(range(len(self.ref)))):
					self.inpos = list(
						inpos)  # Insertion by specifying the positions
				else:
					raise CustomException(
						"ERROR: Insertion positions exceed the range of the reference or are not positive integers!")
			if not mupos:
				self.mupos = []
			else:
				if set(mupos).issubset(
					set(range(len(self.ref)))):
					self.mupos = list(
						mupos)  # Mutation by specifying the positions
				else:
					raise CustomException(
						"ERROR: Mutation positions exceed the range of the reference or are not positive integers!")
			# Mutation alphabet: which letters each letter may be substituted with.
			if not mualphabet:
				if self.verbose:
					print("WARNING: You have specified no mutation alphabet! Mutations are set to random \
letters!")
				self.mualphabet = dict()
				for key in self.alphabet:
					self.mualphabet[key] = ''.join(
						self.alphabet - {
						key,'N'})  # Non-specified mutations could happen to any letter
			else:
				# Force keys and values to str before validating them.
				mualphabet = dict([(str(k), str(v)) for k, v in
						   mualphabet.iteritems()])
				for key, value in mualphabet.iteritems():
					if len(key) != 1:
						raise CustomException("ERROR: the mutation alphabet deals with point mutations! Only single letters are\
 allowed as keys!")
					elif key in set(''.join(value)):
						raise CustomException("ERROR: Wrong mutation values specified! A letter could just be substituted with a\
 different letter for mutation!")
				# Case 1: keys match the sequence alphabet exactly and all
				# target letters belong to it - use the mapping as given.
				if set(
					mualphabet.keys()) == self.alphabet and set(
					''.join(
						mualphabet.values())) <= self.alphabet:
					self.mualphabet = copy.deepcopy(
						mualphabet)
				# Case 2: keys and target letters are both proper subsets of
				# the sequence alphabet - fill in the missing keys.
				elif set(
					mualphabet.keys()) < self.alphabet and set(
					''.join(
						mualphabet.values())) < self.alphabet:
					if self.verbose:
						print("WARNING: Mutation is not specified for some letters! Those mutations are set\
 to random letters!")
					self.mualphabet = copy.deepcopy(
						mualphabet)  # Whatever has been specified for mutation alphabet is kept intact
					for key in self.alphabet - set(
						mualphabet.keys()):
						self.mualphabet[key] = ''.join(
							self.alphabet - {
							key,'N'})  # Non-specified mutations could happen to any letter
				# Case 3: anything else - extend self.alphabet with the letters
				# of the mapping, then fill in the missing keys.
				else:
					if self.verbose:
						print("WARNING: Mutation alphabet is not compatible with sequence alphabet! Both alphabets are\
 updated and\nunspecified mutations are set to random letters!")
					new_mualphabet = dict()  # As mutation may introduce novel alleles in the sequence, alphabet is updated first
					for key, value in mualphabet.iteritems():  # Whatever has been specified for mutation alphabet is kept intact
						self.alphabet.add(
							key)  # Only the alphabet is updated if necessary
						self.alphabet |= (set(''.join(
							value)) - self.alphabet)
						new_mualphabet.update(
							{key: value})
					for key in self.alphabet - set(
						new_mualphabet.keys()):
						new_mualphabet[key] = ''.join(
							self.alphabet - {
							key,'N'})  # Non-specified mutations could happen to any letter
					self.mualphabet = copy.deepcopy(
						new_mualphabet)
			# Per-letter probabilities. The three sections below follow the same
			# pattern: validate each given value, then default missing letters to 0.
			# NOTE(review): the warnings say invalid letters are ignored, but every
			# key passing the range check is copied into the result - confirm.
			if not insertprob:
				self.insertprob = dict()  # If no insertprob is given, it is set to zero everywhere
				for key in self.alphabet:
					self.insertprob[key] = 0
			else:
				if set(list(
					insertprob.keys())) != self.alphabet:
					if self.verbose:
						print("WARNING: Missing/Invalid letter(s) in insertion probability!\n\
Probabilities are set to zero for missing letters! Invalid letters are ignored!")
				new_insertprob = dict()
				for key, value in insertprob.iteritems():
					if value >= 0 and value <= 1:
						new_insertprob.update(
							{key: value})
					else:
						raise CustomException(
							"ERROR: Insertion probability must be >=0 and <=1!")
				for key in self.alphabet - set(
					new_insertprob.keys()):
					new_insertprob[key] = 0
				self.insertprob = copy.deepcopy(new_insertprob)
			if not deleteprob:  # If no deleteprob is given, it is set to zero everywhere
				self.deleteprob = dict()
				for key in self.alphabet:
					self.deleteprob[key] = 0
			else:
				if set(list(
					deleteprob.keys())) != self.alphabet:
					if self.verbose:
						print("WARNING: Missing/Invalid letter(s) in deletion probability!\n\
Probabilities are set to zero for missing letters! Invalid letters are ignored!")
				new_deleteprob = dict()
				for key, value in deleteprob.iteritems():
					if value >= 0 and value <= 1:
						new_deleteprob.update(
							{key: value})
					else:
						raise CustomException(
							"ERROR: Deletion probability must be >=0 and <=1!")
				for key in self.alphabet - set(
					new_deleteprob.keys()):
					new_deleteprob[key] = 0
				self.deleteprob = copy.deepcopy(new_deleteprob)
			if not muprob:
				self.muprob = dict()  # If no muprob is given, it is set to zero everywhere
				for key in self.alphabet:
					self.muprob[key] = 0
			else:
				if set(list(muprob.keys())) != self.alphabet:
					if self.verbose:
						print("WARNING: Missing/Invalid letter(s) in mutation probability!\n\
Probabilities are set to zero for missing letters! Invalid letters are ignored!")
				new_muprob = dict()
				for key, value in muprob.iteritems():
					if value >= 0 and value <= 1:
						new_muprob.update({key: value})
					else:
						raise CustomException(
							"ERROR: Mutation probability must be >=0 and <=1!")
				for key in self.alphabet - set(
					new_muprob.keys()):
					new_muprob[key] = 0
				self.muprob = copy.deepcopy(new_muprob)
		except CustomException as instance:
			# Validation failures are reported and terminate the process.
			print(instance)
			sys.exit(2)
		else:
			if self.verbose:
				print(
					"MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!")
Ejemplo n.º 19
0
else:
    print("huh?  ERROR")

# Concatenating two sequences and showing the alphabet of the result.
t = Seq.Seq("T", IUPAC.ambiguous_dna)
u = s + t
print(str(u.alphabet))

from Bio.Seq import MutableSeq
import array

# A bare ``print`` only emits a blank line on Python 2 (it is a no-op
# expression on Python 3); print an empty string so both behave the same.
print("")
print("Testing MutableSeq")
print("==================")

print("Testing creating MutableSeqs in multiple ways")
string_seq = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"),
                       IUPAC.ambiguous_dna)
converted_seq = s.tomutable()

for test_seq in [string_seq]:
    print(repr(test_seq))
    print(str(test_seq))
    print(len(test_seq))
    print(repr(test_seq.toseq()))

    print(test_seq[0])
    print(repr(test_seq[1:5]))

    test_seq[1:3] = "GAT"
    # Single pre-formatted argument: same output as the Python 2 statement
    # ``print "label:", value`` and valid syntax on Python 3 as well.
    print("Set slice with string: %s" % repr(test_seq))
Ejemplo n.º 20
0
	def probmu(self):
		"""Apply random changes to the reference sequence using the probability model.

		Walks ``self.ref`` site by site. Sites listed in ``self.mupos``,
		``self.inpos`` or ``self.delpos`` are copied unchanged. At every other
		site one error type is drawn ('sub' is listed twice, so it is drawn
		twice as often as 'ins' or 'del') and it is applied only when a random
		number in [0, 1) falls below the per-letter probability taken from
		``self.muprob``, ``self.insertprob`` or ``self.deleteprob``.

		Nothing is returned; the results are stored on the instance:

		- ``self.seq``: the progeny sequence as a MutableSeq built with
		  ``self.alphaproperty``
		- ``self.occuredmu`` / ``self.alt_allele``: substituted sites and the
		  letters placed there
		- ``self.occuredins`` / ``self.inserted_allele``: sites followed by an
		  insertion and the resulting two-letter alleles
		- ``self.occureddel``: deleted sites
		"""
		self.occuredmu = []
		self.occureddel = []
		self.occuredins = []
		self.inserted_allele = []
		self.alt_allele = []
		# Build the set of protected sites once instead of once per site.
		protected_sites = set(self.mupos) | set(self.inpos) | set(self.delpos)
		error_prob = {'ins': self.insertprob,
			      'del': self.deleteprob,
			      'sub': self.muprob}
		insert_letters = list(self.alphabet)
		progeny = []
		for site, base in enumerate(self.ref):
			if site in protected_sites:
				progeny.append(base)  # No change is made at indel/mutation positions
				continue
			# An error type is proposed at random ...
			error = random.choice(['ins', 'del', 'sub', 'sub'])
			# ... and survives only with the probability given for this letter.
			rnd = float(int(random.random() * 100000)) / 100000
			# A letter without an entry is treated as probability 0; this keeps
			# the Python 2 outcome (no change) without comparing against None.
			if rnd >= error_prob[error].get(base, 0):
				progeny.append(base)  # No change is induced at this site
			elif error == 'sub':
				# Substitute the letter with one from the mutation alphabet
				alt = random.choice(self.mualphabet.get(base))
				progeny.append(alt)
				self.occuredmu.append(site)
				self.alt_allele.append(alt)
			elif error == 'ins':
				# Insert a random letter right after the current letter
				inserted = random.choice(insert_letters)
				progeny.append(base)
				progeny.append(inserted)
				self.occuredins.append(site)
				self.inserted_allele.append(base + inserted)
			else:
				# Delete the letter by simply not adding it to the progeny
				self.occureddel.append(site)
		self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
		# Keep positions and alleles paired while ordering by position.
		# sorted() is used because zip() has no .sort() on Python 3.
		ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
				   key=lambda pair: pair[0])
		self.occuredins = [pair[0] for pair in ins_pairs]
		self.inserted_allele = [pair[1] for pair in ins_pairs]
		mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
				  key=lambda pair: pair[0])
		self.occuredmu = [pair[0] for pair in mu_pairs]
		self.alt_allele = [pair[1] for pair in mu_pairs]
		self.occureddel.sort()
		if self.verbose:
			print("WARNING: If indel/mutation positions are specified, MuGen.probmu() makes no change at those sites. \n "
			      "Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!")
			print("Changes made to the haplotype!")
Ejemplo n.º 21
0
import gffutils
import rjvbio.seq
import Bio.SeqIO, Bio.SeqRecord
from Bio.Seq import MutableSeq
from Bio.Seq import Seq

# Masked records go to stdout on request, otherwise to the named file.
fout = sys.stdout if conf.out == 'STDOUT' else open(conf.out, 'wb')

db = gffutils.FeatureDB(conf.gffdb)

# 'ALL' is passed on as None for the featuretype filter.
if conf.featuretypes == 'ALL':
    conf.featuretypes = None

for rec in Bio.SeqIO.parse(conf.inpfasta, 'fasta'):
    seqid = rec.id.strip()
    seq = MutableSeq(str(rec.seq))
    length = len(seq)

    # Every feature touching this record gets overwritten with N's.
    overlapping = db.all_features(limit=[seqid, 0, length],
                                  completely_within=False,
                                  featuretype=conf.featuretypes)
    for feature in overlapping:
        bounds = (feature.start - 1, feature.end)
        start = min(bounds)
        end = max(bounds)
        flength = end - start
        seq[start:end] = 'N' * flength
        # Masking must never change the sequence length.
        assert len(seq) == length

    newrec = Bio.SeqRecord.SeqRecord(seq, id=seqid, description='')
    Bio.SeqIO.write(newrec, fout, "fasta")

if conf.out != 'STDOUT':
Ejemplo n.º 22
0
	def posmu(self):
		"""Apply the pre-specified changes to the reference sequence.

		Every position in ``self.inpos`` receives an insertion, every position
		in ``self.delpos`` a deletion and every position in ``self.mupos`` a
		substitution drawn from ``self.mualphabet``. When a position appears
		in more than one list, only one change is applied, with priority
		mutation > deletion > insertion. All other sites are copied unchanged.

		Nothing is returned; the results are stored on the instance:

		- ``self.seq``: the progeny sequence as a MutableSeq built with
		  ``self.alphaproperty``
		- ``self.occuredmu`` / ``self.alt_allele``: substituted sites and the
		  letters placed there
		- ``self.occuredins`` / ``self.inserted_allele``: sites followed by an
		  insertion and the resulting two-letter alleles
		- ``self.occureddel``: deleted sites
		"""
		planned = [None] * len(self.ref)
		self.occuredmu = []
		self.occureddel = []
		self.occuredins = []
		self.inserted_allele = []
		self.alt_allele = []
		# Later assignments overwrite earlier ones, which yields the priority
		# mutation > deletion > insertion for overlapping positions.
		for site in self.inpos:
			planned[site] = 'ins'
		for site in self.delpos:
			planned[site] = 'del'
		for site in self.mupos:
			planned[site] = 'sub'
		insert_letters = list(self.alphabet)
		progeny = []
		for site, error in enumerate(planned):
			base = self.ref[site]
			if error is None:
				progeny.append(base)
			elif error == 'sub':
				# Substitute the letter with one from the mutation alphabet
				alt = random.choice(self.mualphabet.get(base))
				progeny.append(alt)
				self.occuredmu.append(site)
				self.alt_allele.append(alt)
			elif error == 'ins':
				# Insert a random letter right after the current letter
				inserted = random.choice(insert_letters)
				progeny.append(base)
				progeny.append(inserted)
				self.occuredins.append(site)
				self.inserted_allele.append(base + inserted)
			else:
				# Delete the letter by simply not adding it to the progeny
				self.occureddel.append(site)
		self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
		# Keep positions and alleles paired while ordering by position.
		# sorted() is used because zip() has no .sort() on Python 3.
		ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
				   key=lambda pair: pair[0])
		self.occuredins = [pair[0] for pair in ins_pairs]
		self.inserted_allele = [pair[1] for pair in ins_pairs]
		mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
				  key=lambda pair: pair[0])
		self.occuredmu = [pair[0] for pair in mu_pairs]
		self.alt_allele = [pair[1] for pair in mu_pairs]
		self.occureddel.sort()
		if self.verbose:
			print("WARNING: if there are overlaps betweeen deletion, insertion and mutation positions, \n "
			      "just one of the changes takes place with the following priority: \n "
			      "1)Mutation  2)Deletion 3)Insertion. \n")
			print("Changes made to the haplotype!")
Ejemplo n.º 23
0
    def setUp(self):
        """Create the test alphabet, a single-letter organism and a TestMutator."""
        self.alphabet = TestAlphabet()
        # The organism's genome is the one-letter sequence "2".
        self.org = Organism(MutableSeq("2", self.alphabet), test_fitness)

        self.test_mutator = TestMutator()
Ejemplo n.º 24
0
	def hapchanger(self):
		"""Apply both the specified and the random changes to the reference sequence.

		Sites listed in ``self.mupos``, ``self.inpos`` or ``self.delpos``
		always receive a substitution, insertion or deletion respectively
		(checked in that order, so a site present in several lists gets the
		first matching change). At every other site one error type is drawn
		('sub' is listed twice, so it is drawn twice as often as 'ins' or
		'del') and applied only when a random number in [0, 1) falls below the
		per-letter probability taken from ``self.muprob``, ``self.insertprob``
		or ``self.deleteprob``.

		Nothing is returned; the results are stored on the instance:

		- ``self.seq``: the progeny sequence as a MutableSeq built with
		  ``self.alphaproperty``
		- ``self.occuredmu`` / ``self.alt_allele``: substituted sites and the
		  letters placed there
		- ``self.occuredins`` / ``self.inserted_allele``: sites followed by an
		  insertion and the resulting two-letter alleles
		- ``self.occureddel``: deleted sites
		"""
		self.occuredmu = []
		self.occureddel = []
		self.occuredins = []
		self.inserted_allele = []
		self.alt_allele = []
		# Build the position sets once instead of once per site.
		forced_sub = set(self.mupos)
		forced_ins = set(self.inpos)
		forced_del = set(self.delpos)
		error_prob = {'ins': self.insertprob,
			      'del': self.deleteprob,
			      'sub': self.muprob}
		insert_letters = list(self.alphabet)
		progeny = []
		for site, base in enumerate(self.ref):
			# Specified changes take precedence over the probability model.
			if site in forced_sub:
				error = 'sub'
			elif site in forced_ins:
				error = 'ins'
			elif site in forced_del:
				error = 'del'
			else:
				# An error type is proposed at random ...
				error = random.choice(['ins', 'del', 'sub', 'sub'])
				# ... and survives only with the probability given for this letter.
				rnd = float(int(random.random() * 100000)) / 100000
				# A letter without an entry is treated as probability 0; this keeps
				# the Python 2 outcome (no change) without comparing against None.
				if rnd >= error_prob[error].get(base, 0):
					error = None
			if error is None:
				progeny.append(base)
			elif error == 'sub':
				# Substitute the letter with one from the mutation alphabet
				alt = random.choice(self.mualphabet.get(base))
				progeny.append(alt)
				self.occuredmu.append(site)
				self.alt_allele.append(alt)
			elif error == 'ins':
				# Insert a random letter right after the current letter
				inserted = random.choice(insert_letters)
				progeny.append(base)
				progeny.append(inserted)
				self.occuredins.append(site)
				self.inserted_allele.append(base + inserted)
			else:
				# Delete the letter by simply not adding it to the progeny
				self.occureddel.append(site)
		self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
		# Keep positions and alleles paired while ordering by position.
		# sorted() is used because zip() has no .sort() on Python 3.
		ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
				   key=lambda pair: pair[0])
		self.occuredins = [pair[0] for pair in ins_pairs]
		self.inserted_allele = [pair[1] for pair in ins_pairs]
		mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
				  key=lambda pair: pair[0])
		self.occuredmu = [pair[0] for pair in mu_pairs]
		self.alt_allele = [pair[1] for pair in mu_pairs]
		self.occureddel.sort()
		if self.verbose:
			print("Changes made to the haplotype!")
Ejemplo n.º 25
0
    def viterbi(self, sequence, state_alphabet):
        """Decode the most probable state path for a sequence of emissions.

        Viterbi decoding in log space (Durbin et al, pgs 55-57). Positions
        are 0-based here, i.e. one less than the indices used in Durbin et al.

        Arguments:
         - sequence -- A Seq object with the emission sequence to decode.
         - state_alphabet -- The alphabet of the possible state sequences
           that can be generated.

        Returns a tuple of (state path as a Seq, log probability of that path).

        """
        # log-transformed initial, transition and emission probabilities
        log_initial = self._log_transform(self.initial_prob)
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        scores = {}          # (state, position) -> best log score, v_k(i)
        back_pointers = {}   # (position - 1, state) -> best previous state
        states = state_alphabet.letters
        seq_len = len(sequence)

        # --- recursion over positions i = 0 .. L-1
        for pos in range(seq_len):
            for state in states:
                # e_l(x_i)
                emit = log_emission[(state, sequence[pos])]

                if pos == 0:
                    # first position: start from the initial probability
                    # instead of looking back at a previous state
                    best = log_initial[state]
                else:
                    # score every allowed predecessor: v_k(i-1) + a_kl
                    candidates = {}
                    for prev in self.transitions_to(state):
                        step = log_trans[(prev, state)]
                        candidates[prev] = scores[(prev, pos - 1)] + step
                    best = max(candidates.values())

                # v_l(i)
                scores[(state, pos)] = emit + best

                if pos > 0:
                    # remember the first predecessor that achieves the maximum
                    for prev in candidates:
                        if candidates[prev] == best:
                            back_pointers[(pos - 1, state)] = prev
                            break

        # --- termination: best score over all states at the last position
        final_scores = {}
        for state in states:
            final_scores[state] = scores[(state, seq_len - 1)]

        state_path_prob = max(final_scores.values())

        # On ties the last matching state in iteration order is kept.
        last_state = ''
        for state in final_scores:
            if final_scores[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback: follow the back pointers from the end to the start
        traceback_seq = MutableSeq('', state_alphabet)
        state = last_state
        traceback_seq.append(state)
        for pos in range(seq_len - 1, 0, -1):
            state = back_pointers[(pos - 1, state)]
            traceback_seq.append(state)

        # the path was collected back to front
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Ejemplo n.º 26
0
# Emission scores for states 'O' and 'P' (continues the builder set-up
# that starts above this fragment).
markovBuilder.set_emission_score('O', 'C', .33)
markovBuilder.set_emission_score('O', 'G', .33)
markovBuilder.set_emission_score('O', 'S', .33)
markovBuilder.set_emission_score('P', 'A', .67)
markovBuilder.set_emission_score('P', 'T', .33)

# Initialize the Hidden Markov Model
markovModel = markovBuilder.get_markov_model()

# The 3 sequences to be aligned
seq1 = Seq('ATGA', arrayDNA())
seq2 = Seq('A CCA', arrayDNA())
seq3 = Seq('ACAST', arrayDNA())

# State path for each sequence
seq1State = MutableSeq('MNOP', arrayState())
seq2State = MutableSeq('MDIOP', arrayState())
seq3State = MutableSeq('MNIOP', arrayState())

seq = [seq1, seq2, seq3]
states = [seq1State, seq2State, seq3State]

# Train the Hidden Markov Model with the sequences above
# NOTE(review): each iteration trains on a single sequence and rebinds
# trainedhmm, so only the result of the last call is kept - confirm intended.
trainer = Trainer.KnownStateTrainer(markovModel)
for i in range(len(seq)):
    trainingseq = Trainer.TrainingSequence(seq[i], states[i])
    trainedhmm = trainer.train([trainingseq])

# Another example query
testSeq = Seq('ATSA', arrayDNA())
testState = MutableSeq('MNOP', arrayState())
Ejemplo n.º 27
0
 def setUp(self):
     """Create the alphabet, the "1234" genome and the organism built from it."""
     alphabet = self.alphabet = TestAlphabet()
     genome = self.genome = MutableSeq("1234", alphabet)
     self.organism = Organism.Organism(genome, fitness_calculator)
Ejemplo n.º 28
0
    str_light_chain_one, str_light_chain_two,
    "ATGCGTATCGATCGCGATACGATTAGGCGGAT"
]


def u_crc32(seq):
    """Return the CRC32 of ``seq`` as an unsigned 32-bit integer."""
    # Python 2's crc32 may return a signed int while Python 3's is always
    # unsigned; the docs suggest masking with 0xffffffff for consistency.
    unsigned_mask = 0xffffffff
    checksum = crc32(seq)
    return checksum & unsigned_mask


# Avoid cross-platform differences when printing floats by doing the
# conversion explicitly. Defined once, rather than on every loop iteration.
def simple_LCC(s):
    """Return lcc_simp(s) formatted with two decimals."""
    return "%0.2f" % lcc_simp(s)


def windowed_LCC(s):
    """Return the lcc_mult(s, 20) values as a comma-separated string."""
    return ", ".join(["%0.2f" % v for v in lcc_mult(s, 20)])


for i, seq_str in enumerate(examples):
    # Single pre-formatted argument: identical output on Python 2 and valid
    # syntax on Python 3 (the print statement form is a SyntaxError there).
    print("Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10]))

    for checksum in [u_crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]:
        # First using a string:
        value = checksum(seq_str)
        print(" %s = %s" % (checksum.__name__, value))
        # Secondly check it works with a Seq object
        assert value == checksum(Seq(seq_str, single_letter_alphabet))
        # Finally check it works with a MutableSeq object
        assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))
Ejemplo n.º 29
0
    def test_count_overlap_start_end_GG(self):
        """Check count_overlap for "GG" (and G-runs) with assorted start/end bounds."""
        # Seq and MutableSeq: (start, end, expected count)
        bounded_cases = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        dna = "GTAGGGGAG"
        for lo, hi, expected in bounded_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(dna).count_overlap("GG", lo, hi), expected)

        # Seq and MutableSeq with a more heterogeneous sequence
        mixed = "GGGTGGTAGGG"
        mixed_cases = [((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)]
        for bounds, expected in mixed_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("GG", *bounds), expected)
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq of length 12: (character, start, end, expected count)
        unknown_cases = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for letter, lo, hi, expected in unknown_cases:
            unknown = UnknownSeq(12, character=letter)
            self.assertEqual(unknown.count_overlap("GG", lo, hi), expected)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq of length 7, including unusual edge cases:
        # (substring, start, end, expected count)
        edge_cases = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for needle, lo, hi, expected in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(needle, lo, hi), expected)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)
Ejemplo n.º 30
0
def search_mutated_feature(vcf_record, gbk_dico):
    '''
    Locate a variant among the features of its record and classify it.

    - Search if the mutation is located within an annotated feature
      ("source" features are ignored).
    - For a CDS, determine if the mutation is synonymous or not by applying
      it to a MutableSeq copy of the record sequence and comparing the
      translations before and after.

    Arguments:
     - vcf_record -- object exposing CHROM, POS (1-based) and ALT.
     - gbk_dico -- mapping from CHROM to a record exposing ``seq`` and
       ``features``.

    Returns a dict with the keys "mut_location", "mut_type", "orf_name" and
    "gene". "mut_location" defaults to "Intergenic" and the others to '-'.
    "mut_type" is 'INDEL' (first ALT longer than one character), 'F'
    (ALT is '*'), 'S' (synonymous) or the value of extract_mutation().
    '''
    from copy import copy
    from Bio.Seq import MutableSeq
    from Bio.Alphabet import generic_dna

    # VCF positions are 1-based, whereas sequence indexing and the
    # ``position in feature`` test both use 0-based coordinates. The same
    # 0-based index must be used for both, otherwise the feature lookup is
    # shifted by one base relative to the base that is actually mutated.
    pos0 = int(vcf_record.POS) - 1

    # Shallow copy: only the ``seq`` attribute of the copy is replaced, so
    # the record stored in gbk_dico keeps its original sequence.
    record_alt = copy(gbk_dico[vcf_record.CHROM])
    record_alt.seq = MutableSeq(str(record_alt.seq), generic_dna)

    results = {
        "mut_location": "Intergenic",
        "mut_type": '-',
        "orf_name": '-',
        "gene": '-'
    }

    for feature in record_alt.features:
        if feature.type == "source" or pos0 not in feature:
            continue

        results["mut_location"] = feature.type
        if feature.type == 'mobile_element':
            results["orf_name"] = feature.qualifiers["mobile_element_type"][0]
        elif feature.type == 'CDS':
            results["orf_name"] = feature.qualifiers["locus_tag"][0]
        else:
            results["orf_name"] = "Unknown locus for feature: %s" % feature.type
        try:
            results["gene"] = feature.qualifiers["gene"][0]
        except KeyError:
            results["gene"] = '-'

        if feature.type == 'CDS':
            alt = vcf_record.ALT[0]
            if len(alt) > 1:
                # Flag the indel and keep scanning the remaining features,
                # which may overwrite the fields filled in above.
                results["mut_type"] = 'INDEL'
                continue
            if alt == '*':
                # frameshift
                results["mut_type"] = 'F'
            else:
                # Translate before and after applying the substitution.
                aa_seq_ref = str(feature.extract(record_alt.seq).translate())
                record_alt.seq[pos0] = str(alt)
                aa_seq_alt = str(feature.extract(record_alt.seq).translate())
                # check if synonymous or not
                if aa_seq_ref == aa_seq_alt:
                    results["mut_type"] = 'S'
                else:
                    results["mut_type"] = extract_mutation(
                        aa_seq_ref, aa_seq_alt)

        # The first matching feature (other than a skipped indel) decides.
        return results
    # if no match, return the default (or last indel) results
    return results