Beispiel #1
0
def test_end_trim_with_mismatch():
    """
    Check 3' trimming of a 13-base adapter that overlaps the read by 9 bases.

    The first read carries one deletion relative to the adapter. As the
    original note explains, the algorithm starts from 10 adapter bases to find
    the hit, so this counts as a good match; an insertion or substitution at
    the same spot does not. The second read must therefore stay untouched.
    """
    adapter = SingleAdapter('TCGATCGATCGAT', Where.BACK, max_error_rate=0.1)

    # Case 1: the read is trimmed and a single hit of length 9 is recorded.
    deletion_read = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC')
    deletion_cutter = AdapterCutter([adapter], times=1)
    deletion_result = deletion_cutter(deletion_read, ModificationInfo())

    assert deletion_result.sequence == 'AAAAAAAAAAA'
    assert deletion_cutter.adapter_statistics[adapter].back.lengths == {9: 1}
    # One error is reported at length 9 even though zero mismatches are
    # allowed at that length.
    assert deletion_cutter.adapter_statistics[adapter].back.errors[9][1] == 1

    # Case 2: no hit, so the read is returned as-is and nothing is recorded.
    untouched_read = Sequence('foo2', 'AAAAAAAAAAATCGAACGA')
    untouched_cutter = AdapterCutter([adapter], times=1)
    untouched_result = untouched_cutter(untouched_read, ModificationInfo())

    assert untouched_result.sequence == untouched_read.sequence
    assert untouched_cutter.adapter_statistics[adapter].back.lengths == {}
Beispiel #2
0
    def test_rc_template_varialbe(self):
        """``{rc}`` is empty by default and becomes ``rc`` once ``is_rc`` is set."""
        rename = Renamer("{id} rc={rc} {comment}")
        forward_read = Sequence("theid thecomment", "ACGT")
        mod_info = ModificationInfo(forward_read)
        forward_name = rename(forward_read, mod_info).name
        assert forward_name == "theid rc= thecomment"

        # Same header again, this time with the reverse-complement flag set on
        # the info object that is shared with the first call.
        flagged_read = Sequence("theid thecomment", "ACGT")
        mod_info.is_rc = True
        flagged_name = rename(flagged_read, mod_info).name
        assert flagged_name == "theid rc=rc thecomment"
Beispiel #3
0
 def test_ids_not_identical(self):
     """Renaming a pair whose read IDs differ must raise ``ValueError``."""
     pair_renamer = PairedEndRenamer("{id} abc {comment} xyz")
     first = Sequence("theid_a cmtx", "ACGT")
     second = Sequence("theid_b cmty", "ACGT")
     first_info = ModificationInfo(first)
     second_info = ModificationInfo(second)
     with pytest.raises(ValueError) as excinfo:
         pair_renamer(first, second, first_info, second_info)
     # The first exception argument carries the human-readable message.
     message = excinfo.value.args[0]
     assert "not identical" in message
Beispiel #4
0
 def test_r2_comment(self):
     """``{r2.comment}`` inserts the comment of read 2 into both read names."""
     pair_renamer = PairedEndRenamer("{id} abc {r2.comment} xyz")
     first = Sequence("theid cmtx", "ACGT")
     second = Sequence("theid cmty", "ACGT")
     first_info = ModificationInfo(first)
     second_info = ModificationInfo(second)
     renamed = pair_renamer(first, second, first_info, second_info)
     new_first, new_second = renamed
     # Both mates receive R2's comment ("cmty"), not their own.
     assert new_first.name == "theid abc cmty xyz"
     assert new_second.name == "theid abc cmty xyz"
Beispiel #5
0
 def test_read_number(self):
     """``{rn}`` expands to 1 for the first read and 2 for the second."""
     pair_renamer = PairedEndRenamer("{id} read no. is: {rn}")
     first = Sequence("theid cmtx", "ACGT")
     second = Sequence("theid cmty", "ACGT")
     first_info = ModificationInfo(first)
     second_info = ModificationInfo(second)
     renamed = pair_renamer(first, second, first_info, second_info)
     new_first, new_second = renamed
     assert new_first.name == "theid read no. is: 1"
     assert new_second.name == "theid read no. is: 2"
Beispiel #6
0
def test_paired_adapter_cutter_actions(action, expected_trimmed1,
                                       expected_trimmed2):
    """
    Run a paired adapter cutter with ``action`` and compare both mates.

    The arguments are presumably supplied by a parametrize decorator that is
    not part of this snippet.
    """
    adapters_r1 = [BackAdapter("GGTTAA")]
    adapters_r2 = [BackAdapter("AACCGG")]
    read1 = Sequence("name", "CCCCGGTTAACCCC")
    read2 = Sequence("name", "TTTTAACCGGTTTT")
    paired_cutter = PairedAdapterCutter(adapters_r1, adapters_r2, action=action)
    result1, result2 = paired_cutter(
        read1,
        read2,
        ModificationInfo(read1),
        ModificationInfo(read2),
    )
    assert expected_trimmed1 == result1.sequence
    assert expected_trimmed2 == result2.sequence
Beispiel #7
0
def test_paired_adapter_cutter_actions(action, expected_trimmed1,
                                       expected_trimmed2):
    """
    Run a paired adapter cutter with ``action`` and compare both mates.

    This variant builds the adapters as ``SingleAdapter`` objects anchored at
    ``Where.BACK``. The arguments are presumably supplied by a parametrize
    decorator that is not part of this snippet.
    """
    from cutadapt.adapters import SingleAdapter, Where
    adapters_r1 = [SingleAdapter("GGTTAA", where=Where.BACK)]
    adapters_r2 = [SingleAdapter("AACCGG", where=Where.BACK)]
    read1 = Sequence("name", "CCCCGGTTAACCCC")
    read2 = Sequence("name", "TTTTAACCGGTTTT")
    paired_cutter = PairedAdapterCutter(adapters_r1, adapters_r2, action=action)
    result1, result2 = paired_cutter(
        read1,
        read2,
        ModificationInfo(),
        ModificationInfo(),
    )
    assert expected_trimmed1 == result1.sequence
    assert expected_trimmed2 == result2.sequence
Beispiel #8
0
def test_unconditional_cutter():
    """
    Exercise ``UnconditionalCutter`` with positive, negative and oversized lengths.

    A positive length removes bases from the start and records them in
    ``cut_prefix``; a negative length removes them from the end and records
    them in ``cut_suffix``. Lengths beyond the read size leave an empty read.
    """
    # Constructing an instance on its own must not raise.
    UnconditionalCutter(length=5)
    seq = Sequence('r1', 'abcdefg')

    front_info = ModificationInfo(seq)
    front_cut = UnconditionalCutter(length=2)(seq, front_info)
    assert front_cut.sequence == 'cdefg'
    assert front_info.cut_prefix == 'ab'
    assert front_info.cut_suffix is None

    back_info = ModificationInfo(seq)
    back_cut = UnconditionalCutter(length=-2)(seq, back_info)
    assert back_cut.sequence == 'abcde'
    assert back_info.cut_suffix == 'fg'
    assert back_info.cut_prefix is None

    # Oversized cuts in either direction; the info object from the previous
    # case is reused.
    for oversized in (100, -100):
        emptied = UnconditionalCutter(length=oversized)(seq, back_info)
        assert emptied.sequence == ''
Beispiel #9
0
def test_action_retain():
    """With ``action="retain"`` the back adapter stays; only what follows it is cut."""
    retaining_cutter = AdapterCutter([BackAdapter("AACCGG")], action="retain")
    read = Sequence("r1", "ATTGCCAACCGGTATATAT")
    result = retaining_cutter(read, ModificationInfo(read))
    assert "ATTGCCAACCGG" == result.sequence
Beispiel #10
0
def test_quality_trimmer():
    """
    Check ``QualityTrimmer`` for three combinations of its first two arguments.

    Each case lists the positional constructor arguments followed by the bases
    and qualities expected to survive trimming.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')

    cases = (
        ((10, 10, 33), 'GTTTAC', '456789'),
        ((0, 10, 33), 'ACGTTTAC', '##456789'),
        ((10, 0, 33), 'GTTTACGTA', '456789###'),
    )
    for trimmer_args, kept_bases, kept_quals in cases:
        trimmer = QualityTrimmer(*trimmer_args)
        result = trimmer(read, ModificationInfo(read))
        assert result == Sequence('read1', kept_bases, kept_quals)
Beispiel #11
0
def test_shortener():
    """``Shortener(n)`` keeps the first ``n`` bases; a larger ``n`` is a no-op."""
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')

    cases = (
        (0, '', ''),
        (1, 'A', '#'),
        (5, 'ACGTT', '##456'),
    )
    for target_length, kept_bases, kept_quals in cases:
        shorten = Shortener(target_length)
        result = shorten(read, ModificationInfo(read))
        assert result == Sequence('read1', kept_bases, kept_quals)

    # A target longer than the read returns something equal to the input.
    shorten = Shortener(100)
    assert shorten(read, ModificationInfo(read)) == read
Beispiel #12
0
def test_nend_trimmer():
    """``NEndTrimmer`` strips ``N`` runs from both ends but keeps internal ones."""
    n_trimmer = NEndTrimmer()
    cases = (
        ('NNNNAAACCTTGGNNN', 'AAACCTTGG'),
        ('NNNNAAACNNNCTTGGNNN', 'AAACNNNCTTGG'),
        ('NNNNNN', ''),
    )
    for raw_bases, expected_bases in cases:
        raw_read = Sequence('read1', raw_bases,
                            qualities='#' * len(raw_bases))
        expected_read = Sequence('read1', expected_bases,
                                 qualities='#' * len(expected_bases))
        assert n_trimmer(raw_read, ModificationInfo(raw_read)) == expected_read
Beispiel #13
0
def test_reverse_complementer():
    """
    Check that ``ReverseComplementer`` trims on whichever strand matches.

    The first read starts with one of the prefix adapters and is trimmed as
    given. The second read only carries the adapter on its reverse complement,
    so the result is the trimmed reverse complement and ``info.is_rc`` is set.
    """
    prefix_adapters = [
        PrefixAdapter("TTATTTGTCT"),
        PrefixAdapter("TCCGCACTGG"),
    ]
    rc_modifier = ReverseComplementer(
        AdapterCutter(prefix_adapters, index=False))

    forward_read = Sequence("r", "ttatttgtctCCAGCTTAGACATATCGCCT")
    forward_info = ModificationInfo(forward_read)
    forward_result = rc_modifier(forward_read, forward_info)
    assert forward_result.sequence == "CCAGCTTAGACATATCGCCT"
    assert not forward_info.is_rc

    reverse_read = Sequence("r", "CAACAGGCCACATTAGACATATCGGATGGTagacaaataa")
    reverse_info = ModificationInfo(reverse_read)
    reverse_result = rc_modifier(reverse_read, reverse_info)
    assert reverse_result.sequence == "ACCATCCGATATGTCTAATGTGGCCTGTTG"
    assert reverse_info.is_rc
Beispiel #14
0
def cutadapt(fq):
    """
    Run every read through the configured modifiers and count the results.

    The function reads these module-level settings, all defined outside this
    block: ``cu_ver``, ``ingredients``, ``qiagenumi``, ``umi``, ``qiaAdapter``
    and ``min_len``.

    For each read, every callable in ``ingredients`` is applied in order. When
    ``int(cu_ver) >= 3`` the second argument is a fresh ``ModificationInfo``
    whose ``matches`` attribute is an empty list; otherwise it is a plain
    empty list. If ``qiagenumi`` is truthy, a UMI fragment cut from the text
    that followed the trimmed read is appended to the trimmed sequence.

    :param fq: iterable of read objects exposing a ``sequence`` attribute.
    :return: list of ``(sequence, count)`` tuples in first-seen order,
        restricted to sequences at least ``min_len`` characters long.
    """
    # Loop-invariant settings are converted once instead of once per read.
    modern_api = int(cu_ver) >= 3
    shortest_kept = int(min_len)

    readDict = {}
    for fqreads in fq:
        if modern_api:
            state = ModificationInfo(None)
            state.matches = []
        else:
            state = []

        currentSeq = fqreads.sequence
        for modifier in ingredients:
            fqreads = modifier(fqreads, state)

        final_seq = fqreads.sequence
        if qiagenumi:
            try:
                # Text between the first and second occurrence of the trimmed
                # read inside the original sequence.
                umi_seq = currentSeq.split(str(fqreads.sequence))[1]
                umi_cut = umi.split(",")
                umi_len = int(umi_cut[1])
                max_ad = len(qiaAdapter) + umi_len
                # Keep the last ``umi_len`` characters of the first
                # ``len(qiaAdapter) + umi_len`` characters.
                umi_seq = umi_seq[:max_ad][-umi_len:]
            except (ValueError, IndexError):
                # ValueError: empty trimmed read (empty separator) or a
                # non-numeric UMI length. IndexError: trimmed read not found
                # in the original sequence, which previously crashed the run.
                umi_seq = ""
            final_seq = fqreads.sequence + umi_seq

        if len(final_seq) >= shortest_kept:
            key = str(final_seq)
            readDict[key] = readDict.get(key, 0) + 1

    return list(readDict.items())
Beispiel #15
0
def test_statistics():
    """The bases recorded as trimmed must never exceed the read length."""
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [SingleAdapter('CCCC', Where.BACK, max_error_rate=0.1)]
    cutter = AdapterCutter(adapters, times=3)
    cutter(read, ModificationInfo())
    # TODO make this a lot simpler
    total_trimmed = 0
    for adapter in adapters:
        stats = cutter.adapter_statistics[adapter]
        # Each histogram maps a trimmed length to how often it occurred.
        for histogram in (stats.front.lengths, stats.back.lengths):
            for length, occurrences in histogram.items():
                total_trimmed += length * occurrences
    assert total_trimmed <= len(read), total_trimmed
Beispiel #16
0
def test_anywhere_parameter_front():
    """A front adapter given with ``;anywhere`` parses to a ``FrontAdapter`` with the flag set."""
    adapter_parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parsed = list(adapter_parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'front'))
    adapter = parsed[0]
    assert isinstance(adapter, FrontAdapter)
    assert adapter._force_anywhere

    # TODO move the rest to a separate test
    read = Sequence('foo1', 'AAAAAAAAAACTGAAGTGAA')
    from cutadapt.modifiers import AdapterCutter
    result = AdapterCutter([adapter])(read, ModificationInfo(read))
    # The whole read is removed.
    assert result.sequence == ''
Beispiel #17
0
def test_anywhere_with_errors():
    """An ``ANYWHERE`` adapter is found at either end, with up to one error."""
    adapter = SingleAdapter('CCGCATTTAG', Where.ANYWHERE, max_error_rate=0.1)
    cases = (
        ('AACCGGTTccgcatttagGATC', 'AACCGGTT'),
        ('AACCGGTTccgcgtttagGATC', 'AACCGGTT'),  # one mismatch
        ('AACCGGTTccgcatttag', 'AACCGGTT'),
        ('ccgcatttagAACCGGTT', 'AACCGGTT'),
        ('ccgtatttagAACCGGTT', 'AACCGGTT'),  # one mismatch
        ('ccgatttagAACCGGTT', 'AACCGGTT'),  # one deletion
    )
    for raw_bases, remainder in cases:
        read = Sequence('foo', raw_bases)
        # A fresh cutter is built for every case, as in the original test.
        single_pass_cutter = AdapterCutter([adapter], times=1)
        result = single_pass_cutter(read, ModificationInfo())
        assert result.sequence == remainder
Beispiel #18
0
def test_linked_action_retain(s, expected):
    """
    Apply a linked adapter with ``action="retain"`` to ``s``.

    Neither half of the linked adapter is required. The arguments are
    presumably supplied by a parametrize decorator that is not part of this
    snippet.
    """
    linked = LinkedAdapter(
        FrontAdapter("GGTTAACC"),
        BackAdapter("AACCGG"),
        front_required=False,
        back_required=False,
        name="linked",
    )
    adapters: List[Adapter] = [linked]
    retaining_cutter = AdapterCutter(adapters, action="retain")
    read = Sequence("r1", s)
    result = retaining_cutter(read, ModificationInfo(read))
    assert expected == result.sequence
Beispiel #19
0
def test_anywhere_parameter():
    """A back adapter given with ``;anywhere`` becomes an ANYWHERE adapter that removes the suffix."""
    parser_options = dict(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    adapter_parser = AdapterParser(**parser_options)
    parsed = list(adapter_parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))
    adapter = parsed[0]
    assert isinstance(adapter, SingleAdapter)
    assert adapter.remove == WhereToRemove.SUFFIX
    assert adapter.where is Where.ANYWHERE
    read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA')
    from cutadapt.modifiers import AdapterCutter
    result = AdapterCutter([adapter])(read, ModificationInfo())
    # The whole read is removed.
    assert result.sequence == ''
Beispiel #20
0
 def test_comment_template_variable_missing_comment(self):
     """``{comment}`` expands to an empty string when the header has no comment."""
     rename = Renamer("{id}_extra {comment}")
     bare_read = Sequence("theid", "ACGT")
     renamed = rename(bare_read, ModificationInfo(bare_read))
     # The trailing space from the template is preserved.
     assert renamed.name == "theid_extra "
Beispiel #21
0
 def test_id_template_variable(self):
     """``{id}`` expands to the read ID only; the original comment is dropped."""
     rename = Renamer("{id} extra")
     commented_read = Sequence("theid thecomment", "ACGT")
     renamed = rename(commented_read, ModificationInfo(commented_read))
     assert renamed.name == "theid extra"
Beispiel #22
0
 def test_cut_suffix_template_variable(self):
     """``{cut_suffix}`` expands to the value stored in ``info.cut_suffix``."""
     rename = Renamer("{id}_{cut_suffix} {comment}")
     commented_read = Sequence("theid thecomment", "ACGT")
     mod_info = ModificationInfo(commented_read)
     mod_info.cut_suffix = "TTAAGG"
     renamed = rename(commented_read, mod_info)
     assert renamed.name == "theid_TTAAGG thecomment"
Beispiel #23
0
def cutadapt(fq):
    """
    Trim reads, dump the kept reads to side files, and count sequences.

    The function reads these module-level settings, all defined outside this
    block: ``cu_ver``, ``ingredients``, ``qiagenumi``, ``umi``, ``qiaAdapter``
    and ``min_len``.

    For each read, every callable in ``ingredients`` is applied in order. When
    ``int(cu_ver) >= 3`` the second argument is a fresh ``ModificationInfo``
    whose ``matches`` attribute is an empty list; otherwise it is a plain
    empty list.

    Five files are opened in append mode in the current working directory:
    ``input.fq``, ``correct_read.fastq``, ``id_read.txt``,
    ``ID_read_quality_cor.txt`` and ``ID_read_quality_input.txt``. Records are
    written to them only when ``qiagenumi`` is falsy and the trimmed read is
    at least ``min_len`` long. The first two files receive four-line FASTQ
    records; the other three receive one ``@id sequence qualities`` line per
    read. All files are closed before the function returns, including when an
    exception is raised.

    :param fq: iterable of read objects exposing ``name``, ``sequence`` and
        ``qualities`` attributes (``name`` and ``qualities`` are only read in
        the non-UMI branch).
    :return: list of ``(sequence, count)`` tuples in first-seen order,
        restricted to sequences at least ``min_len`` characters long.
    """
    from contextlib import ExitStack

    # Loop-invariant settings are converted once instead of once per read.
    modern_api = int(cu_ver) >= 3
    shortest_kept = int(min_len)

    readDict = {}
    # ExitStack guarantees every handle is closed; the original never closed
    # any of them.
    with ExitStack() as stack:
        in_fqfile = stack.enter_context(open("input.fq", "a+"))
        in_fqfileCor = stack.enter_context(open("correct_read.fastq", "a+"))
        id_read = stack.enter_context(open("id_read.txt", "a+"))
        id_read_qc = stack.enter_context(open("ID_read_quality_cor.txt", "a+"))
        id_read_qcin = stack.enter_context(
            open("ID_read_quality_input.txt", "a+"))

        for fqreads in fq:
            if modern_api:
                state = ModificationInfo(None)
                state.matches = []
            else:
                state = []

            currentSeq = fqreads.sequence
            for modifier in ingredients:
                fqreads = modifier(fqreads, state)

            if qiagenumi:
                try:
                    # Text between the first and second occurrence of the
                    # trimmed read inside the original sequence.
                    umi_seq = currentSeq.split(str(fqreads.sequence))[1]
                    umi_cut = umi.split(",")
                    umi_len = int(umi_cut[1])
                    max_ad = len(qiaAdapter) + umi_len
                    # Keep the last ``umi_len`` characters of the first
                    # ``len(qiaAdapter) + umi_len`` characters.
                    umi_seq = umi_seq[:max_ad][-umi_len:]
                except (ValueError, IndexError):
                    # ValueError: empty trimmed read (empty separator) or a
                    # non-numeric UMI length. IndexError: trimmed read not
                    # found in the original sequence, which previously
                    # crashed the run.
                    umi_seq = ""
                final_seq = fqreads.sequence + umi_seq
                if len(final_seq) >= shortest_kept:
                    key = str(final_seq)
                    readDict[key] = readDict.get(key, 0) + 1
                continue

            if len(fqreads.sequence) < shortest_kept:
                continue

            # Only the first space-separated token of the header is kept.
            read_id = "@" + str(fqreads.name.split(" ")[0])
            seq_text = str(fqreads.sequence)
            qual_text = str(fqreads.qualities)
            fastq_record = (read_id + "\n" + seq_text + "\n" + "+" + "\n" +
                            qual_text + "\n")
            flat_record = read_id + " " + seq_text + " " + qual_text + "\n"

            for handle in (in_fqfile, in_fqfileCor):
                handle.write(fastq_record)
            for handle in (id_read, id_read_qc, id_read_qcin):
                handle.write(flat_record)

            readDict[seq_text] = readDict.get(seq_text, 0) + 1

    return list(readDict.items())
Beispiel #24
0
def test_zero_capper():
    """``ZeroCapper`` leaves the bases alone and turns the space quality into ``!``."""
    capper = ZeroCapper()
    raw_read = Sequence("r1", "ACGT", "# !%")
    capped = capper(raw_read, ModificationInfo(raw_read))
    assert capped.sequence == "ACGT"
    assert capped.qualities == "#!!%"