Ejemplo n.º 1
0
    def test_slice(self):
        'It checks slice_seq with fasta, fastq and multiline fastq SeqItems'

        def wrap(name, lines, file_format):
            # Every case needs a SeqItem wrapped with its file format.
            item = SeqItem(name=name, lines=lines)
            return SeqWrapper(SEQITEM, item, file_format)

        # fasta: the slice spans the boundary between both sequence lines
        fasta = wrap('s1', ['>s1\n', 'ACTG\n', 'GTAC\n'], 'fasta')
        expected_fasta = wrap('s1', ['>s1\n', 'CTGG\n'], 'fasta')
        assert slice_seq(fasta, 1, 5) == expected_fasta

        # fastq: sequence and quality lines are sliced together
        fastq = wrap('seq', ['@seq\n', 'aata\n', '+\n', '!?!?\n'], 'fastq')
        sliced = slice_seq(fastq, 1, 3)
        assert list(get_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline fastq
        multiline_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                           'BBBB\n']
        multiline = wrap('seq', multiline_lines, 'fastq-illumina-multiline')
        sliced = slice_seq(multiline, 1, 5)
        assert list(get_qualities(sliced)) == [1, 1, 1, 2]
        assert get_str_seq(sliced) == get_str_seq(multiline)[1:5]

        # open-ended slices: either the stop or the start is None
        letters = 'aCTG'
        open_ended = wrap('seq', ['>seq\n', letters], 'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == letters[1:]

        assert get_str_seq(slice_seq(open_ended, None, 1)) == letters[:1]
Ejemplo n.º 2
0
    def test_str_qualities(self):
        'It checks get_str_qualities with and without a target encoding'
        # Asking a fasta SeqItem for its qualities must raise ValueError.
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        try:
            assert get_str_qualities(fasta, 'fasta')
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # Each case: lines, input format, extra positional args, expected str
        fastq_lines = ['@seq\n', 'aaaa\n', '+\n', '!???\n']
        illumina_lines = ['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']
        cases = (
            # fastq, no re-encoding requested
            (fastq_lines, 'fastq', (), '!???'),
            # fastq re-encoded as fastq-illumina
            (fastq_lines, 'fastq', (ILLUMINA_QUALITY,), '@^^^'),
            # fastq-illumina kept as fastq-illumina
            (illumina_lines, 'fastq-illumina', (ILLUMINA_QUALITY,),
             '@AAABBBB'),
            # fastq-illumina re-encoded as fastq
            (illumina_lines, 'fastq-illumina', ('fastq',), '!"""####'),
        )
        for lines, file_format, extra_args, expected in cases:
            item = SeqItem(name='seq', lines=list(lines))
            wrapped = SeqWrapper(SEQITEM, item, file_format)
            assert get_str_qualities(wrapped, *extra_args) == expected
Ejemplo n.º 3
0
def _build_some_paired_seqs():
    'It returns a tuple with four wrapped fasta reads: s1.f, s1.r, s2.f, s2.r'
    # (SeqItem name, title line, sequence line) for each read, in order
    read_specs = (
        ('s1', '>s1.f\n', 'A\n'),
        ('s1', '>s1.r\n', 'C\n'),
        ('s2', '>s2.f\n', 'T\n'),
        ('s2', '>s2.r\n', 'G\n'),
    )
    return tuple(
        SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
        for name, title, bases in read_specs)
Ejemplo n.º 4
0
    def test_seqitem_pairs_equal(self):
        'It checks _seqitem_pairs_equal with pairs, singletons and mixes'

        def fastq(name, bases, quals):
            # Build a wrapped four-line fastq SeqItem.
            lines = ['@' + name + '\n', bases + '\n', '+\n', quals + '\n']
            return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fastq')

        # read_a and read_c differ only in their names
        read_a = fastq('seq1', 'TAATAC', 'TTTDFG')
        read_b = fastq('seq2', 'TCATTA', 'ABCBEG')
        read_c = fastq('seq3', 'TAATAC', 'TTTDFG')
        read_d = fastq('seq4', 'ACGCGT', 'ABCBEG')

        pair_ab = (read_a, read_b)
        pair_bd = (read_b, read_d)
        pair_cb = (read_c, read_b)
        pair_ba = (read_b, read_a)

        assert _seqitem_pairs_equal(pair_ab, pair_cb)
        assert not _seqitem_pairs_equal(pair_ab, pair_bd)
        # same reads, swapped order
        assert not _seqitem_pairs_equal(pair_ab, pair_ba)
        assert _seqitem_pairs_equal([read_a], [read_c])
        assert not _seqitem_pairs_equal([read_a], [read_b])
        # a singleton against a pair
        assert not _seqitem_pairs_equal([read_a], pair_ab)
        # a pair against a bare wrapped read
        assert not _seqitem_pairs_equal(pair_ab, read_b)
Ejemplo n.º 5
0
def _itemize_fastx_multiline(fhand):  # this is a generator function
    """Yield one SeqItem per record of a fasta or (multiline) fastq file.

    Sequence lines are joined into a single line and, for fastq records,
    quality lines are joined into a single line too, so every yielded
    SeqItem has either two lines (fasta) or four lines (fastq).

    fhand is an iterable of text lines.

    Raises:
        MalformedFile: when the quality of a fastq record is longer than
            its sequence, or the input ends before enough quality was read.
        IsSingleLineFastqError: when 1000 records have been read and all of
            them were fastq records with a single quality line.
        FileIsEmptyError: when the input ends without any record having
            been yielded.
    """
    last_line = None  # this is a buffer keeping the last unprocessed line
    is_empty = True
    n_single_line_seqs = 0
    n_seqs_read = 0
    while True:
        if not last_line:  # the first record or a record following a fastq
            for line in fhand:  # search for the start of the next record
                if line[0] in '@>':  # fasta/q header line
                    last_line = line  # save this line
                    break
        if not last_line:
            break
        title = last_line
        seq_lines = []
        last_line = None
        # Drop only the line terminator: slicing off the last character
        # would eat a real character when the header has no trailing newline
        # and would leave a '\r' in the name with CRLF line endings.
        name = title[1:].rstrip('\r\n').partition(" ")[0]
        for line in fhand:  # read the sequence
            if line[0] in '@+>':
                last_line = line
                break
            seq_lines.append(line.rstrip())
        if not last_line or last_line[0] != '+':  # this is a fasta record
            yield SeqItem(name, [title, ''.join(seq_lines) + '\n'])
            n_seqs_read += 1
            is_empty = False
            if not last_line:
                break
        else:  # this is a fastq record
            seq = ''.join(seq_lines)
            length = 0
            qual_lines = []
            len_seq = len(seq)
            for line in fhand:  # read the quality
                qual_line = line.rstrip()
                qual_lines.append(qual_line)
                # Count what is actually stored, not len(line) - 1: the last
                # line of a file may lack the newline and CRLF files carry
                # two terminator characters.
                length += len(qual_line)
                if length >= len_seq:  # have read enough quality
                    if length != len_seq:
                        msg = 'Malformed fastq file: seq and quality lines '
                        msg += 'have different lengths'
                        raise MalformedFile(msg)
                    last_line = None
                    is_empty = False
                    yield SeqItem(name, [title, seq + '\n', '+\n',
                                         ''.join(qual_lines) + '\n'])
                    n_seqs_read += 1
                    if len(qual_lines) == 1:
                        n_single_line_seqs += 1
                        # After 1000 records, report that every one of them
                        # was a single-line fastq record.
                        if n_seqs_read == 1000:
                            if n_single_line_seqs == n_seqs_read:
                                raise IsSingleLineFastqError()
                    break
            # last_line still holds the '+' line only when the loop above
            # ran out of input without breaking.
            if last_line:  # reach EOF before reading enough quality
                msg = 'Malformed fastq file: quality line missing'
                raise MalformedFile(msg)
    if is_empty:
        raise FileIsEmptyError('File is empty')
Ejemplo n.º 6
0
    def test_trim_seqs():
        'It tests the trimming of lowercased letters with TrimOrMask'

        def packet_of(groups):
            # A trim packet with the given read groups and no orphans.
            return {SEQS_PASSED: groups, ORPHAN_SEQS: []}

        def passed_strs(packet):
            return [get_str_seq(read)
                    for group in packet[SEQS_PASSED] for read in group]

        def fasta(name, bases):
            item = SeqItem(name, ['>' + name + '\n', bases + '\n'])
            return SeqWrapper(SEQITEM, item, 'fasta')

        # SeqRecords: 'actg' is not among the expected results
        record_groups = [
            [SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)]
            for bases in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')
        ]
        packet = packet_of(record_groups)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_strs(packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

        # a single SeqItem, reusing the same trimmer instances
        packet = packet_of([[fasta('s', 'aaCTTTC')]])
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_strs(packet) == ['CTTTC']

        # with pairs: the s1.f mate is all lowercase
        pair_groups = [
            [fasta('s.f', 'aaCTTTC'), fasta('s.r', 'aaCTTTC')],
            [fasta('s1.f', 'aa'), fasta('s1.r', 'aaCTTTC')],
        ]
        packet = packet_of(pair_groups)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        passed = passed_strs(packet)
        orphans = [get_str_seq(read) for read in packet[ORPHAN_SEQS]]
        assert orphans == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == passed

        # no drag: same pairs again, checking the orphan by its name
        packet = packet_of(pair_groups)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        passed = passed_strs(packet)
        orphan_names = [get_name(read) for read in packet[ORPHAN_SEQS]]
        assert orphan_names == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == passed
Ejemplo n.º 7
0
    def test_str_seq(self):
        'It checks get_str_seq with fasta and fastq SeqItems'
        # (name, lines, file format, expected sequence string)
        cases = (
            ('s1', ['>s1\n', 'ACTGGTAC\n'], 'fasta', 'ACTGGTAC'),
            ('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq', 'aaaa'),
        )
        for name, lines, file_format, expected in cases:
            wrapped = SeqWrapper(SEQITEM, SeqItem(name=name, lines=lines),
                                 file_format)
            assert get_str_seq(wrapped) == expected
Ejemplo n.º 8
0
 def test_pair_grouper():
     'It checks that group_seqs_in_pairs groups four reads two by two'
     # (SeqItem name, title line, sequence line) for each read, in order
     read_specs = (
         ('s1', '>s1.f\n', 'A\n'),
         ('s1', '>s1.r\n', 'C\n'),
         ('s2', '>s2.f\n', 'T\n'),
         ('s2', '>s2.r\n', 'G\n'),
     )
     reads = tuple(
         SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
         for name, title, bases in read_specs)
     pairs = list(group_seqs_in_pairs(reads))
     assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
     assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
     assert len(pairs) == 2
Ejemplo n.º 9
0
    def test_len(self):
        'It checks get_length with fasta and fastq SeqItems'
        # (name, lines, file format, expected length)
        cases = (
            # two sequence lines of four letters each
            ('s1', ['>s1\n', 'ACTG\n', 'GTAC\n'], 'fasta', 8),
            ('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq', 4),
        )
        for name, lines, file_format, expected in cases:
            wrapped = SeqWrapper(SEQITEM, SeqItem(name=name, lines=lines),
                                 file_format)
            assert get_length(wrapped) == expected
Ejemplo n.º 10
0
    def test_change_name(self):
        'It checks that copy_seq with a new name rewrites the title line'
        # (file format, original lines, lines expected after the renaming)
        cases = (
            # the name repeated in the '+' line is not expected in the copy
            ('fastq',
             ['@seq\n', 'aaaa\n', '+seq\n', '!???\n'],
             ['@seq2\n', 'aaaa\n', '+\n', '!???\n']),
            ('fasta',
             ['>seq\n', 'aaaa\n'],
             ['>seq2\n', 'aaaa\n']),
        )
        for file_format, lines, renamed_lines in cases:
            original = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                                  file_format)
            renamed = copy_seq(original, name='seq2')
            assert renamed.object == ('seq2', renamed_lines, {})
Ejemplo n.º 11
0
 def __init__(self, linkers=None):
     '''The initiator.

     linkers is an iterable whose items are stored, as a list, in
     self.linkers. When it is None the linkers are taken from the 'LINKERS'
     setting and turned into fasta SeqItems named after their position.
     '''
     if linkers is None:
         linkers = get_setting('LINKERS')
         # SeqItem expects a list of lines (title line, then sequence line),
         # not one string holding the whole record.
         linkers = [SeqItem(str(i), ['>%d\n' % i, '%s\n' % l])
                    for i, l in enumerate(linkers)]
         linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
     self.linkers = list(linkers)
Ejemplo n.º 12
0
    def test_no_name(self):
        'It checks group_pairs_by_name with an unpaired read in the middle'
        unpaired = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

        # (position at which the unpaired read is inserted among the four
        # paired reads, sequences expected in each of the first groups)
        scenarios = (
            (3, [['A', 'C'], ['T'], ['N'], ['G']]),
            (1, [['A'], ['N'], ['C'], ['T', 'G']]),
            (0, [['N'], ['A', 'C'], ['T', 'G']]),
        )
        for position, expected_groups in scenarios:
            paired = _build_some_paired_seqs()
            seqs = (tuple(paired[:position]) + (unpaired,) +
                    tuple(paired[position:]))
            groups = list(group_pairs_by_name(seqs))
            # Compare group by group, without checking the number of groups
            for index, expected in enumerate(expected_groups):
                found = [get_str_seq(read) for read in groups[index]]
                assert found == expected
Ejemplo n.º 13
0
    def test_blast_short_trimming(self):
        'It trims oligos using blast-short'

        def vector_segments(trim_packet):
            # The VECTOR trimming recommendations of every passed read, or
            # an empty list for the reads without any.
            return [
                get_annotations(read).get(TRIMMING_RECOMMENDATIONS,
                                          {}).get(VECTOR, [])
                for group in trim_packet[SEQS_PASSED] for read in group
            ]

        oligo_seqs = ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                      'AAGCAGTGGTATCAACGCAGAGTACTTTTT')

        # With SeqRecords
        adaptors = [SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)
                    for bases in oligo_seqs]
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        reads_fhand = StringIO(FASTQ4)
        seq_packets = read_seq_packets([reads_fhand],
                                       prefered_seq_classes=[SEQRECORD])
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        trimmed_packet = blast_trim(first_packet)
        # It should trim the first and the second reads.
        assert vector_segments(trimmed_packet) == [[(0, 29)], [(0, 29)], []]

        # With SeqItems
        adaptors = [
            SeqWrapper(SEQITEM,
                       SeqItem(name, ['>' + name + '\n', bases + '\n']),
                       'fasta')
            for name, bases in zip(('oligo1', 'oligo2'), oligo_seqs)
        ]
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        reads_fhand = StringIO(FASTQ4)
        seq_packets = list(
            read_seq_packets([reads_fhand], prefered_seq_classes=[SEQITEM]))
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        trimmed_packet = blast_trim(first_packet)
        # It should trim the first and the second reads.
        assert vector_segments(trimmed_packet) == [[(0, 29)], [(0, 29)], []]
Ejemplo n.º 14
0
def _itemize_fasta(fhand):
    '''It yields one SeqItem per fasta record found in fhand.

    Empty and whitespace-only lines are skipped. A RuntimeError is raised
    when the first kept line of the input does not start with '>'.
    '''
    record = []
    for raw_line in fhand:
        if not raw_line or raw_line.isspace():
            continue
        is_title = raw_line.startswith('>')
        # A new title closes the record gathered so far
        if is_title and record:
            yield SeqItem(_get_name_from_lines(record), record)
            record = []
        record.append(raw_line)
        # A record can only be opened by a title line
        if len(record) == 1 and not is_title:
            raise RuntimeError('Not a valid fasta file')
    # The last record has no following title to close it
    if record:
        yield SeqItem(_get_name_from_lines(record), record)
Ejemplo n.º 15
0
    def test_matching_segments(self):
        'It tests the detection of oligos in sequence files'
        five_prime = ('CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTG'
                      'TGAC')
        mate_fhand = create_a_matepair_file()

        linker_items = [
            SeqItem(name, ['>' + name + '\n', bases + '\n'])
            for name, bases in (('titan', TITANIUM_LINKER),
                                ('flx', FLX_LINKER))
        ]
        linkers = assing_kind_to_seqs(SEQITEM, linker_items, 'fasta')

        # The expected segment starts right after five_prime and spans the
        # whole titanium linker; its end is inclusive.
        region_start = len(five_prime)
        region_end = len(five_prime) + len(TITANIUM_LINKER) - 1
        expected_region = (region_start, region_end)

        matcher = BlasterForFewSubjects(mate_fhand.name, linkers,
                                        program='blastn',
                                        elongate_for_global=True)
        linker_region = matcher.get_matched_segments_for_read('seq1')[0]
        assert [expected_region] == linker_region
Ejemplo n.º 16
0
    def test_int_qualities(self):
        'It checks get_int_qualities with fasta, fastq and fastq-illumina'
        # Asking a fasta SeqItem for its qualities must raise AttributeError
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        try:
            assert get_int_qualities(fasta)
        except AttributeError:
            pass
        else:
            self.fail('AttributeError expected')

        # (lines, file format, expected integer qualities)
        cases = (
            (['@seq\n', 'aaaa\n', '+\n', '!???\n'], 'fastq',
             [0, 30, 30, 30]),
            (['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'], 'fastq-illumina',
             [0, 1, 1, 1, 2, 2, 2, 2]),
        )
        for lines, file_format, expected in cases:
            wrapped = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                                 file_format)
            assert list(get_int_qualities(wrapped)) == expected
Ejemplo n.º 17
0
    def test_copy(self):
        'It checks that copy_seq replaces the sequence in a new object'
        # fasta: the annotations are kept and nothing is shared with the copy
        source_item = SeqItem(name='s1',
                              lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                              annotations={'a': 'b'})
        source = SeqWrapper(SEQITEM, source_item, 'fasta')
        duplicate = copy_seq(source, seq='ACTG')
        expected_item = SeqItem(name='s1',
                                lines=['>s1\n', 'ACTG\n'],
                                annotations={'a': 'b'})
        assert duplicate.object == expected_item
        assert source.object is not duplicate.object
        assert source.object.lines is not duplicate.object.lines

        # (file format, original lines, new sequence, expected lines)
        cases = (
            # fastq
            ('fastq',
             ['@seq\n', 'aaaa\n', '+\n', '!???\n'],
             'ACTG',
             ['@seq\n', 'ACTG\n', '+\n', '!???\n']),
            # multiline fastq: the copy is expected with single lines
            ('fastq-illumina-multiline',
             ['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n', 'BBBB\n'],
             'ACTGactg',
             ['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n']),
        )
        for file_format, lines, new_seq, expected_lines in cases:
            source = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                                file_format)
            duplicate = copy_seq(source, seq=new_seq)
            assert duplicate.object == SeqItem(name='seq',
                                               lines=expected_lines)
Ejemplo n.º 18
0
def alignedread_to_seqitem(aligned_read, start_pos=0, end_pos=None):
    '''It converts an aligned read into a wrapped SeqItem.

    The read sequence, and its qualities when there are any, are sliced
    with start_pos and end_pos. For reads flagged as reverse the sliced
    sequence is reverse-complemented and the sliced qualities are reversed.

    It returns None when the read, or its sequence, is None. Otherwise it
    returns a SeqWrapper holding a fastq SeqItem when the read has
    qualities and a fasta one when it has not.
    '''
    if aligned_read is None or aligned_read.seq is None:
        return None

    read_name = aligned_read.qname
    bases = aligned_read.seq[start_pos: end_pos]
    qualities = aligned_read.qual
    if aligned_read.is_reverse:
        bases = _reverse(_complementary(bases))

    if qualities is not None:
        qualities = qualities[start_pos: end_pos]
        if aligned_read.is_reverse:
            qualities = _reverse(qualities)
        fastq_lines = ['@' + read_name + '\n', bases + '\n', '+\n',
                       qualities + '\n']
        return SeqWrapper(SEQITEM, SeqItem(read_name, fastq_lines), 'fastq')

    # No qualities available: fall back to fasta
    fasta_lines = ['>' + read_name + '\n', bases + '\n']
    return SeqWrapper(SEQITEM, SeqItem(read_name, fasta_lines), 'fasta')
Ejemplo n.º 19
0
    def test_edge_trimming(self):
        'It trims the edges'

        def passed_strs(packet):
            return [get_str_seq(read)
                    for group in packet[SEQS_PASSED] for read in group]

        def edge_trimmed(trim_or_mask, **edges):
            # Mark the given edges on a fresh packet from self._some_seqs()
            # and return the resulting passed sequences as strings.
            mark_edges = TrimEdges(**edges)
            return passed_strs(trim_or_mask(mark_edges(self._some_seqs())))

        # Trimming: the marked edges are removed
        trim = TrimOrMask()
        trimming_cases = (
            ({'left': 1}, ['CCG', 'AACCCGGG']),
            ({'right': 1}, ['ACC', 'AAACCCGG']),
            ({'left': 1, 'right': 1}, ['CC', 'AACCCGG']),
            ({'left': 2, 'right': 2}, ['ACCCG']),
            ({'left': 3, 'right': 3}, ['CCC']),
        )
        for edges, expected in trimming_cases:
            assert edge_trimmed(trim, **edges) == expected

        # Masking: the marked edges are lowercased instead
        trim = TrimOrMask(mask=True)
        masking_cases = (
            ({'left': 1}, ['aCCG', 'aAACCCGGG']),
            ({'right': 1}, ['ACCg', 'AAACCCGGg']),
            ({'left': 1, 'right': 1}, ['aCCg', 'aAACCCGGg']),
            ({'left': 2, 'right': 2}, ['accg', 'aaACCCGgg']),
            ({'left': 3, 'right': 3}, ['accg', 'aaaCCCggg']),
        )
        for edges, expected in masking_cases:
            assert edge_trimmed(trim, **edges) == expected

        # test overlapping mask
        narrow_edges = TrimEdges(left=3, right=3)
        wide_edges = TrimEdges(left=4, right=4)
        packet = trim(wide_edges(narrow_edges(self._some_seqs())))
        assert passed_strs(packet) == ['accg', 'aaacCcggg']

        # With a SeqItem, first trimming and then masking; the same
        # TrimEdges instance is used for both
        trim_edges = TrimEdges(left=1, right=1)
        for mask, expected in ((False, ['CTTT']), (True, ['aCTTTc'])):
            trim = TrimOrMask(mask=mask)
            item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
            groups = [[SeqWrapper(SEQITEM, item, 'fasta')]]
            packet = {SEQS_PASSED: groups, ORPHAN_SEQS: []}
            packet = trim(trim_edges(packet))
            assert passed_strs(packet) == expected
Ejemplo n.º 20
0
    def test_quality_trimming(self):
        'It trims the sequences by quality'
        # The same TrimOrMask instance applies the recommendations in every
        # case below; only the TrimByQuality settings change.
        trim = TrimOrMask()

        trim_quality = TrimByQuality(window=5, threshold=30)

        # Low qualities at both ends: both ends are expected to be trimmed
        seq = SeqRecord(Seq('ACTGCTGCATAAAA'))
        quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == [20, 30, 30, 30, 40, 40, 30, 30, 20]

        # all bad
        # The threshold is above every quality: no read is expected to pass.
        # The wrapped seq built above is reused here and in the next case.
        trim_quality = TrimByQuality(window=5, threshold=60)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        assert not trim_packet2[SEQS_PASSED]

        # all OK
        # The threshold is below every quality: the read is expected intact
        trim_quality = TrimByQuality(window=5, threshold=5)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == quals

        # A high quality core flanked by low qualities
        seq = SeqRecord(Seq('ACTGCTGCATAA'))
        quals = [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20]

        trim_quality = TrimByQuality(window=5, threshold=50)
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == [20, 60, 60, 60, 60, 60, 20]

        # Irregular qualities: only the first five are expected to be kept
        quals = [
            40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3, 20,
            10, 12, 8, 5, 4, 7, 1
        ]
        seq = SeqRecord(Seq('atatatatagatagatagatagatg'))
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_quality = TrimByQuality(window=5, threshold=25)
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == [40, 18, 10, 40, 40]

        # The same qualities are used in the next four cases, which differ in
        # the trim_left and trim_right settings. Here both sides are trimmed.
        quals = [
            40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10,
            21, 3, 40, 9, 9, 12, 10, 9
        ]
        seq = SeqRecord(Seq('atatatatatatatatatatatata'))
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_quality = TrimByQuality(window=5, threshold=25)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}

        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        expected = [40, 4, 27, 38, 40]
        assert get_int_qualities(seq2) == expected

        # trim_left=False: the left end is expected untouched
        quals = [
            40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10,
            21, 3, 40, 9, 9, 12, 10, 9
        ]
        seq = SeqRecord(Seq('atatatatatatatatatatatata'))
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_quality = TrimByQuality(window=5, threshold=25, trim_left=False)
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == [
            40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40
        ]

        # trim_right=False: the right end is expected untouched
        quals = [
            40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10,
            21, 3, 40, 9, 9, 12, 10, 9
        ]
        seq = SeqRecord(Seq('atatatatatatatatatatatata'))
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_quality = TrimByQuality(window=5, threshold=25, trim_right=False)
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == [
            40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12, 10,
            9
        ]

        # Neither side is trimmed: the read is expected intact
        quals = [
            40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10,
            21, 3, 40, 9, 9, 12, 10, 9
        ]
        seq = SeqRecord(Seq('atatatatatatatatatatatata'))
        seq.letter_annotations['phred_quality'] = quals
        seq = SeqWrapper(SEQRECORD, seq, None)
        trim_quality = TrimByQuality(window=5,
                                     threshold=25,
                                     trim_right=False,
                                     trim_left=False)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        assert get_int_qualities(seq2) == quals

        # With SeqItems
        # NOTE(review): the third line is '\n' and not the usual '+\n'
        # separator used by the other fastq SeqItems; confirm it is intended.
        seq = SeqItem('s', [
            '@s\n', 'atatatatatatatatatatatata\n', '\n',
            'II.,I*I%<GI%,II++6$I**-+*\n'
        ])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        trim_quality = TrimByQuality(window=5,
                                     threshold=25,
                                     trim_right=True,
                                     trim_left=False)

        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        trim_packet2 = trim(trim_quality(trim_packet))
        seq2 = trim_packet2[SEQS_PASSED][0][0]
        # Only the right end of the quality line is expected to be trimmed
        assert seq2.object.lines[3] == 'II.,I*I%<GI\n'
Ejemplo n.º 21
0
def _itemize_fastq_singleline(fhand):
    '''It returns a generator with one SeqItem per four-line chunk of fhand.

    The lines rejected by _line_is_not_empty are dropped before grouping.
    '''
    kept_lines = ifilter(_line_is_not_empty, fhand)
    # The original author notes that group_in_packets_fill_last is faster
    # than group_in_packets
    four_line_chunks = group_in_packets_fill_last(kept_lines, 4)
    return (SeqItem(_get_name_from_lines(chunk), chunk)
            for chunk in four_line_chunks)