Esempio n. 1
0
    def test_slice(self):
        'It slices SeqItem based sequences in fasta and fastq formats'
        # fasta: the requested region spans the two sequence lines
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta_seq = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        sliced_item = SeqItem(name='s1', lines=['>s1\n', 'CTGG\n'])
        sliced_seq = SeqWrapper(SEQITEM, sliced_item, 'fasta')
        assert slice_seq(fasta_seq, 1, 5) == sliced_seq

        # single line fastq: sequence and qualities are sliced together
        fastq_lines = ['@seq\n', 'aata\n', '+\n', '!?!?\n']
        fastq_seq = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=fastq_lines),
                               'fastq')
        sliced = slice_seq(fastq_seq, 1, 3)
        assert list(get_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline illumina fastq
        multi_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
        multi_seq = SeqWrapper(SEQITEM,
                               SeqItem(name='seq', lines=multi_lines),
                               'fastq-illumina-multiline')
        sliced = slice_seq(multi_seq, 1, 5)
        assert list(get_qualities(sliced)) == [1, 1, 1, 2]
        assert get_str_seq(sliced) == get_str_seq(multi_seq)[1:5]

        # open ended slices: either the stop or the start can be None
        residues = 'aCTG'
        open_seq = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', residues]),
                              'fasta')
        assert get_str_seq(slice_seq(open_seq, 1, None)) == residues[1:]
        assert get_str_seq(slice_seq(open_seq, None, 1)) == residues[:1]
Esempio n. 2
0
    def test_slice(self):
        'It checks slice_seq with fasta, fastq and multiline fastq SeqItems'
        # A fasta whose slice crosses the boundary between its two lines
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
            'fasta')
        expected = SeqWrapper(
            SEQITEM,
            SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
            'fasta')
        assert slice_seq(wrapped, 1, 5) == expected

        # A fastq: the quality line has to be cut like the sequence line
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
            'fastq')
        piece = slice_seq(wrapped, 1, 3)
        assert list(get_qualities(piece)) == [30, 0]
        assert get_str_seq(piece) == 'at'
        assert piece.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # A multiline fastq with illumina qualities
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='seq',
                    lines=['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                           'BBBB\n']),
            'fastq-illumina-multiline')
        piece = slice_seq(wrapped, 1, 5)
        assert list(get_qualities(piece)) == [1, 1, 1, 2]
        assert get_str_seq(piece) == get_str_seq(wrapped)[1:5]

        # The stop and the start are allowed to be None
        letters = 'aCTG'
        wrapped = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', letters]),
                             'fasta')
        assert get_str_seq(slice_seq(wrapped, 1, None)) == letters[1:]
        assert get_str_seq(slice_seq(wrapped, None, 1)) == letters[:1]
Esempio n. 3
0
    def test_qualities(self):
        'It gets the qualities out of SeqItem based sequences'
        # fasta has no qualities, so asking for them should fail
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta_seq = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        try:
            assert get_qualities(fasta_seq)
        except AttributeError:
            pass
        else:
            self.fail('AttributeError expected')

        # single line fastq
        fastq_item = SeqItem(name='seq',
                             lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        fastq_seq = SeqWrapper(SEQITEM, fastq_item, 'fastq')
        assert list(get_qualities(fastq_seq)) == [0, 30, 30, 30]

        # multiline fastq with illumina qualities
        multi_lines = ['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
        multi_seq = SeqWrapper(SEQITEM,
                               SeqItem(name='seq', lines=multi_lines),
                               'fastq-illumina-multiline')
        assert list(get_qualities(multi_seq)) == [0, 1, 1, 1, 2, 2, 2, 2]
Esempio n. 4
0
    def test_qualities(self):
        'It checks get_qualities with fasta, fastq and multiline fastq'
        # A fasta sequence: an AttributeError is the expected outcome
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
            'fasta')
        try:
            assert get_qualities(wrapped)
        except AttributeError:
            pass
        else:
            self.fail('AttributeError expected')

        # A fastq sequence
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
            'fastq')
        assert list(get_qualities(wrapped)) == [0, 30, 30, 30]

        # A multiline fastq sequence with illumina qualities
        wrapped = SeqWrapper(
            SEQITEM,
            SeqItem(name='seq',
                    lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                           'BBBB\n']),
            'fastq-illumina-multiline')
        assert list(get_qualities(wrapped)) == [0, 1, 1, 1, 2, 2, 2, 2]
Esempio n. 5
0
    def _do_trim(self, seq):
        '''It annotates the bad quality segments of the seq to be trimmed.

        The qualities of the seq are handed to _get_bad_quality_segments
        together with the window, threshold, trim_left and trim_right
        attributes of this object. When that call returns something other
        than None the result is stored in the seq with _add_trim_segments
        using the QUALITY kind.

        It returns the seq that was given.

        It raises a ValueError when get_qualities fails with a KeyError,
        that is taken here to mean that the seq has no qualities.
        '''
        window = self.window
        threshold = self.threshold
        trim_left = self.trim_left
        trim_right = self.trim_right
        try:
            quals = list(get_qualities(seq))
        except KeyError:
            # This message used to be built but never raised, so the method
            # carried on and failed later because quals was not defined.
            msg = 'Some of the input sequences do not have qualities: {}'
            raise ValueError(msg.format(get_name(seq)))
        segments = _get_bad_quality_segments(quals, window, threshold,
                                             trim_left, trim_right)
        if segments is not None:
            _add_trim_segments(segments, seq, kind=QUALITY)

        return seq
Esempio n. 6
0
    def _do_trim(self, seq):
        '''It annotates the bad quality segments of the seq to be trimmed.

        The qualities of the seq are handed to _get_bad_quality_segments
        together with the window, threshold, trim_left and trim_right
        attributes of this object. When that call returns something other
        than None the result is stored in the seq with _add_trim_segments
        using the QUALITY kind.

        It returns the seq that was given.

        It raises a ValueError when get_qualities fails with a KeyError,
        that is taken here to mean that the seq has no qualities.
        '''
        window = self.window
        threshold = self.threshold
        trim_left = self.trim_left
        trim_right = self.trim_right
        try:
            quals = list(get_qualities(seq))
        except KeyError:
            # This message used to be built but never raised, so the method
            # carried on and failed later because quals was not defined.
            msg = 'Some of the input sequences do not have qualities: {}'
            raise ValueError(msg.format(get_name(seq)))
        segments = _get_bad_quality_segments(quals, window, threshold,
                                             trim_left, trim_right)
        if segments is not None:
            _add_trim_segments(segments, seq, kind=QUALITY)

        return seq
Esempio n. 7
0
def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    '''It calculates some stats for the given seqs.

    seqs is iterated just once. For every seq the length, the qualities per
    position (if get_qualities does not raise an AttributeError) and the
    nucleotides per position are collected.

    kmer_size -- when truthy a KmerCounter of that size counts the kmers.
    do_dust_stats -- when True the dust score of every seq is calculated.
    nxs -- the Nx values (e.g. 50 for the N50) to add to the length report.

    It returns a dict with the text reports under the keys: 'length',
    'quality', 'nucl_freq', 'qual_boxplot', 'kmer' and 'dustscore'. The
    reports that could not be built are empty strings.
    '''
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_qualities(seq)
        except AttributeError:
            # seqs without qualities still count for the other stats
            quals = []
        for index, qual in enumerate(quals):
            # the quality positions are stored 1-based
            quals_per_pos.append(index + 1, qual)
        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)
        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    # nxs could be None or empty, in that case no Nx line is written
    for nx in sorted(nxs or []):
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # agregate quals
    if quals_per_pos:
        agg_quals = quals_per_pos.aggregated_array
        agg_quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        q30 = (agg_quals.count_relative_to_value(30, operator.ge) /
               agg_quals.count)
        q30 *= 100

        q20 = (agg_quals.count_relative_to_value(20, operator.ge) /
               agg_quals.count)
        q20 *= 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(agg_quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer_distriubution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        # The line is labelled as a percentage, so the fraction is scaled
        # in the same way as Q20 and Q30.
        dust7 *= 100
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}
Esempio n. 8
0
    def test_quality_trimming(self):
        'It trims the low quality edges of the sequences'

        def wrap_record(residues, phred_quals):
            'It wraps a SeqRecord that has the given phred qualities'
            record = SeqRecord(Seq(residues))
            record.letter_annotations['phred_quality'] = phred_quals
            return SeqWrapper(SEQRECORD, record, None)

        trim_or_mask = TrimOrMask()

        # The same wrapped seq goes through the first three trimmings
        edge_quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
        wrapped = wrap_record('ACTGCTGCATAAAA', edge_quals)
        by_quality = TrimByQuality(window=5, threshold=30)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert get_qualities(trimmed[0]) == [20, 30, 30, 30, 40, 40, 30, 30,
                                             20]

        # all bad
        by_quality = TrimByQuality(window=5, threshold=60)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert not trimmed

        # all OK
        by_quality = TrimByQuality(window=5, threshold=5)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert get_qualities(trimmed[0]) == edge_quals

        # a good core flanked by bad edges
        wrapped = wrap_record('ataataataata',
                              [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20,
                               20])
        by_quality = TrimByQuality(window=5, threshold=50)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert get_qualities(trimmed[0]) == [20, 60, 60, 60, 60, 60, 20]

        wrapped = wrap_record('atatatatagatagatagatagatg',
                              [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40,
                               40, 11, 6, 5, 3, 20, 10, 12, 8, 5, 4, 7, 1])
        by_quality = TrimByQuality(window=5, threshold=25)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert get_qualities(trimmed[0]) == [40, 18, 10, 40, 40]

        # The same qualities trimmed in both edges, only in the right one,
        # only in the left one and in none of them. None as the expected
        # result means that the qualities should remain untouched.
        noisy_quals = [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40,
                       40, 10, 10, 21, 3, 40, 9, 9, 12, 10, 9]
        edge_cases = (
            ({}, [40, 4, 27, 38, 40]),
            ({'trim_left': False},
             [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40]),
            ({'trim_right': False},
             [40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12,
              10, 9]),
            ({'trim_right': False, 'trim_left': False}, None),
        )
        for edge_kwargs, expected in edge_cases:
            # every case gets its own record and its own list of qualities
            case_quals = list(noisy_quals)
            wrapped = wrap_record('atatatatatatatatatatatata', case_quals)
            by_quality = TrimByQuality(window=5, threshold=25, **edge_kwargs)
            trimmed = trim_or_mask(by_quality([wrapped]))
            if expected is None:
                expected = case_quals
            assert get_qualities(trimmed[0]) == expected

        # With SeqItems
        item = SeqItem('s', ['@s\n', 'atatatatatatatatatatatata\n', '\n',
                             'II.,I*I%<GI%,II++6$I**-+*\n'])
        wrapped = SeqWrapper(SEQITEM, item, 'fastq')
        by_quality = TrimByQuality(window=5, threshold=25, trim_right=True,
                                   trim_left=False)
        trimmed = trim_or_mask(by_quality([wrapped]))
        assert trimmed[0].object.lines[3] == 'II.,I*I%<GI\n'
Esempio n. 9
0
    def test_quality_trimming(self):
        'It trims the low quality edges of the seqs found in a trim packet'

        def wrap_record(residues, phred_quals):
            'It wraps a SeqRecord that has the given phred qualities'
            record = SeqRecord(Seq(residues))
            record.letter_annotations['phred_quality'] = phred_quals
            return SeqWrapper(SEQRECORD, record, None)

        trim_or_mask = TrimOrMask()

        def passed_after_trim(by_quality, wrapped):
            'It trims a packet with just one seq and returns the passed seqs'
            packet = {SEQS_PASSED: [[wrapped]], ORPHAN_SEQS: []}
            return trim_or_mask(by_quality(packet))[SEQS_PASSED]

        # The same wrapped seq goes through the first three trimmings
        edge_quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
        wrapped = wrap_record('ACTGCTGCATAAAA', edge_quals)
        passed = passed_after_trim(TrimByQuality(window=5, threshold=30),
                                   wrapped)
        assert get_qualities(passed[0][0]) == [20, 30, 30, 30, 40, 40, 30, 30,
                                               20]

        # all bad
        passed = passed_after_trim(TrimByQuality(window=5, threshold=60),
                                   wrapped)
        assert not passed

        # all OK
        passed = passed_after_trim(TrimByQuality(window=5, threshold=5),
                                   wrapped)
        assert get_qualities(passed[0][0]) == edge_quals

        # a good core flanked by bad edges
        wrapped = wrap_record('ACTGCTGCATAA',
                              [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20,
                               20])
        passed = passed_after_trim(TrimByQuality(window=5, threshold=50),
                                   wrapped)
        assert get_qualities(passed[0][0]) == [20, 60, 60, 60, 60, 60, 20]

        wrapped = wrap_record('atatatatagatagatagatagatg',
                              [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40,
                               40, 11, 6, 5, 3, 20, 10, 12, 8, 5, 4, 7, 1])
        passed = passed_after_trim(TrimByQuality(window=5, threshold=25),
                                   wrapped)
        assert get_qualities(passed[0][0]) == [40, 18, 10, 40, 40]

        # The same qualities trimmed in both edges, only in the right one,
        # only in the left one and in none of them. None as the expected
        # result means that the qualities should remain untouched.
        noisy_quals = [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40,
                       40, 10, 10, 21, 3, 40, 9, 9, 12, 10, 9]
        edge_cases = (
            ({}, [40, 4, 27, 38, 40]),
            ({'trim_left': False},
             [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40]),
            ({'trim_right': False},
             [40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12,
              10, 9]),
            ({'trim_right': False, 'trim_left': False}, None),
        )
        for edge_kwargs, expected in edge_cases:
            # every case gets its own record and its own list of qualities
            case_quals = list(noisy_quals)
            wrapped = wrap_record('atatatatatatatatatatatata', case_quals)
            by_quality = TrimByQuality(window=5, threshold=25, **edge_kwargs)
            passed = passed_after_trim(by_quality, wrapped)
            if expected is None:
                expected = case_quals
            assert get_qualities(passed[0][0]) == expected

        # With SeqItems
        item = SeqItem('s', ['@s\n', 'atatatatatatatatatatatata\n', '\n',
                             'II.,I*I%<GI%,II++6$I**-+*\n'])
        wrapped = SeqWrapper(SEQITEM, item, 'fastq')
        by_quality = TrimByQuality(window=5, threshold=25, trim_right=True,
                                   trim_left=False)
        passed = passed_after_trim(by_quality, wrapped)
        assert passed[0][0].object.lines[3] == 'II.,I*I%<GI\n'