def test_slice(self):
    # Fasta: the [1, 5) slice crosses the break between the two
    # sequence lines and comes back as a single-line record.
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
        'fasta')
    wanted = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
        'fasta')
    assert slice_seq(fasta, 1, 5) == wanted

    # Single-line fastq: residues and qualities are cut together.
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
        'fastq')
    fastq_cut = slice_seq(fastq, 1, 3)
    assert list(get_qualities(fastq_cut)) == [30, 0]
    assert get_str_seq(fastq_cut) == 'at'
    assert fastq_cut.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # Multiline illumina fastq: the original wrapper is kept so the
    # slice can be compared against a plain string slice of it.
    multiline = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    multiline_cut = slice_seq(multiline, 1, 5)
    assert list(get_qualities(multiline_cut)) == [1, 1, 1, 2]
    assert get_str_seq(multiline_cut) == get_str_seq(multiline)[1:5]

    # Open-ended slices: either the stop or the start may be None.
    residues = 'aCTG'
    open_ended = SeqWrapper(
        SEQITEM, SeqItem('seq', ['>seq\n', residues]), 'fasta')
    assert get_str_seq(slice_seq(open_ended, 1, None)) == residues[1:]
    assert get_str_seq(slice_seq(open_ended, None, 1)) == residues[:1]
def test_slice(self):
    def wrap(item, file_format):
        # Every scenario here uses a SeqItem-backed wrapper.
        return SeqWrapper(SEQITEM, item, file_format)

    # with fasta: slicing 1..5 spans both sequence lines
    source = wrap(SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
                  'fasta')
    target = wrap(SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']), 'fasta')
    assert slice_seq(source, 1, 5) == target

    # with fastq: qualities must follow the residues
    source = wrap(
        SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
        'fastq')
    piece = slice_seq(source, 1, 3)
    assert list(get_qualities(piece)) == [30, 0]
    assert get_str_seq(piece) == 'at'
    assert piece.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # with multiline fastq
    source = wrap(
        SeqItem(name='seq',
                lines=['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    piece = slice_seq(source, 1, 5)
    assert list(get_qualities(piece)) == [1, 1, 1, 2]
    assert get_str_seq(piece) == get_str_seq(source)[1:5]

    # It tests that either the stop or the start can be None
    raw = 'aCTG'
    source = wrap(SeqItem('seq', ['>seq\n', raw]), 'fasta')
    assert get_str_seq(slice_seq(source, 1, None)) == raw[1:]
    assert get_str_seq(slice_seq(source, None, 1)) == raw[:1]
def test_qualities(self):
    # Fasta records carry no qualities: asking for them must raise
    # AttributeError.
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
        'fasta')
    try:
        assert get_qualities(fasta)
    except AttributeError:
        pass
    else:
        self.fail('AttributeError expected')

    # Single-line fastq.
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    assert list(get_qualities(fastq)) == [0, 30, 30, 30]

    # Multiline illumina fastq: qualities from both lines are joined.
    multiline = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    assert list(get_qualities(multiline)) == [0, 1, 1, 1, 2, 2, 2, 2]
def test_qualities(self):
    def wrap(item, file_format):
        # Every scenario here uses a SeqItem-backed wrapper.
        return SeqWrapper(SEQITEM, item, file_format)

    # with fasta: there are no qualities, so AttributeError is expected
    record = wrap(SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
                  'fasta')
    try:
        assert get_qualities(record)
    except AttributeError:
        pass
    else:
        self.fail('AttributeError expected')

    # with fastq
    record = wrap(
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    observed = list(get_qualities(record))
    assert observed == [0, 30, 30, 30]

    # with multiline fastq
    record = wrap(
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    observed = list(get_qualities(record))
    assert observed == [0, 1, 1, 1, 2, 2, 2, 2]
def _do_trim(self, seq):
    '''It records on seq the low quality segments that should be trimmed.

    The qualities of seq are scanned with the window, threshold, trim_left
    and trim_right settings stored in the instance. When bad quality
    segments are found they are registered on the sequence as QUALITY
    trim segments; nothing is cut here.

    Returns the same seq object that was given.

    Raises ValueError if looking up the qualities of seq fails with a
    KeyError, i.e. the sequence carries no qualities.
    '''
    try:
        quals = list(get_qualities(seq))
    except KeyError:
        # The message used to be built and then dropped, so execution
        # carried on with quals unbound and died with an unrelated
        # UnboundLocalError. Fail here with the intended explanation.
        msg = 'Some of the input sequences do not have qualities: {}'
        raise ValueError(msg.format(get_name(seq)))

    segments = _get_bad_quality_segments(quals, self.window, self.threshold,
                                         self.trim_left, self.trim_right)
    # None means that there is nothing to trim in this sequence.
    if segments is not None:
        _add_trim_segments(segments, seq, kind=QUALITY)
    return seq
def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    '''It calculates some stats for the given seqs.

    Parameters:
        seqs: iterable with the sequences to analyze. It is traversed once.
        kmer_size: when it is truthy the kmers of that size are counted.
        do_dust_stats: when it is true a dust score is calculated for
            every sequence.
        nxs: optional iterable with the integer Nx values to report in
            the length section (e.g. [50, 90]).

    Returns a dict of text reports with the keys 'length', 'quality',
    'nucl_freq', 'qual_boxplot', 'kmer' and 'dustscore'. The sections
    without data are empty strings.
    '''
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_qualities(seq)
        except AttributeError:
            # sequences without qualities add nothing to the quality stats
            quals = []
        for index, qual in enumerate(quals):
            # the quality positions are 1-based
            quals_per_pos.append(index + 1, qual)

        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)

        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    # one sort is enough, the list used to be sorted twice
    for nx in (sorted(nxs) if nxs else []):
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # aggregate quals
    if quals_per_pos:
        all_quals = quals_per_pos.aggregated_array
        all_quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        q30 = all_quals.count_relative_to_value(30, operator.ge)
        q30 = q30 / all_quals.count * 100
        q20 = all_quals.count_relative_to_value(20, operator.ge)
        q20 = q20 / all_quals.count * 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(all_quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer distribution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    # dust score distribution
    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        # The line is labelled as a percentage, so the fraction is scaled
        # in the same way that Q20 and Q30 are.
        dust7 *= 100
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}
def test_quality_trimming(self):
    'It trims the edges'
    trim = TrimOrMask()

    def wrap_record(residues, phred):
        # SeqRecord-backed wrapper carrying the given phred qualities.
        record = SeqRecord(Seq(residues))
        record.letter_annotations['phred_quality'] = phred
        return SeqWrapper(SEQRECORD, record, None)

    def run_trim(wrapped, threshold, **flags):
        # Flag the bad segments with a 5-wide window and then trim them.
        trimmer = TrimByQuality(window=5, threshold=threshold, **flags)
        return trim(trimmer([wrapped]))

    # The same wrapper is deliberately reused by the next three runs.
    quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
    seq = wrap_record('ACTGCTGCATAAAA', quals)
    trimmed = run_trim(seq, 30)
    assert get_qualities(trimmed[0]) == [20, 30, 30, 30, 40, 40, 30, 30, 20]

    # all bad
    assert not run_trim(seq, 60)

    # all OK
    trimmed = run_trim(seq, 5)
    assert get_qualities(trimmed[0]) == quals

    quals = [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20]
    trimmed = run_trim(wrap_record('ataataataata', quals), 50)
    assert get_qualities(trimmed[0]) == [20, 60, 60, 60, 60, 60, 20]

    quals = [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3,
             20, 10, 12, 8, 5, 4, 7, 1]
    trimmed = run_trim(wrap_record('atatatatagatagatagatagatg', quals), 25)
    assert get_qualities(trimmed[0]) == [40, 18, 10, 40, 40]

    # The remaining SeqRecord scenarios share residues and qualities and
    # differ only in which ends may be trimmed. A fresh list is built for
    # every scenario.
    residues = 'atatatatatatatatatatatata'

    def noisy_quals():
        return [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40,
                10, 10, 21, 3, 40, 9, 9, 12, 10, 9]

    trimmed = run_trim(wrap_record(residues, noisy_quals()), 25)
    assert get_qualities(trimmed[0]) == [40, 4, 27, 38, 40]

    trimmed = run_trim(wrap_record(residues, noisy_quals()), 25,
                       trim_left=False)
    assert get_qualities(trimmed[0]) == [40, 40, 13, 11, 40, 9, 40, 4, 27,
                                         38, 40]

    trimmed = run_trim(wrap_record(residues, noisy_quals()), 25,
                       trim_right=False)
    assert get_qualities(trimmed[0]) == [40, 4, 27, 38, 40, 4, 11, 40, 40,
                                         10, 10, 21, 3, 40, 9, 9, 12, 10, 9]

    quals = noisy_quals()
    trimmed = run_trim(wrap_record(residues, quals), 25,
                       trim_right=False, trim_left=False)
    assert get_qualities(trimmed[0]) == quals

    # With SeqItems
    item = SeqItem('s', ['@s\n', 'atatatatatatatatatatatata\n', '\n',
                         'II.,I*I%<GI%,II++6$I**-+*\n'])
    trimmed = run_trim(SeqWrapper(SEQITEM, item, 'fastq'), 25,
                       trim_right=True, trim_left=False)
    assert trimmed[0].object.lines[3] == 'II.,I*I%<GI\n'
def test_quality_trimming(self):
    'It trims the edges'
    trim = TrimOrMask()

    def wrap_record(residues, phred):
        # SeqRecord-backed wrapper carrying the given phred qualities.
        record = SeqRecord(Seq(residues))
        record.letter_annotations['phred_quality'] = phred
        return SeqWrapper(SEQRECORD, record, None)

    def passed_after_trim(wrapped, threshold, **flags):
        # Send a one-read packet through the quality flagging step and
        # the trimmer, handing back the groups of reads that survived.
        trimmer = TrimByQuality(window=5, threshold=threshold, **flags)
        packet = {SEQS_PASSED: [[wrapped]], ORPHAN_SEQS: []}
        return trim(trimmer(packet))[SEQS_PASSED]

    # The same wrapper is deliberately reused by the next three runs.
    quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
    seq = wrap_record('ACTGCTGCATAAAA', quals)
    survivor = passed_after_trim(seq, 30)[0][0]
    assert get_qualities(survivor) == [20, 30, 30, 30, 40, 40, 30, 30, 20]

    # all bad
    assert not passed_after_trim(seq, 60)

    # all OK
    survivor = passed_after_trim(seq, 5)[0][0]
    assert get_qualities(survivor) == quals

    quals = [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20]
    survivor = passed_after_trim(wrap_record('ACTGCTGCATAA', quals),
                                 50)[0][0]
    assert get_qualities(survivor) == [20, 60, 60, 60, 60, 60, 20]

    quals = [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3,
             20, 10, 12, 8, 5, 4, 7, 1]
    survivor = passed_after_trim(
        wrap_record('atatatatagatagatagatagatg', quals), 25)[0][0]
    assert get_qualities(survivor) == [40, 18, 10, 40, 40]

    # The remaining SeqRecord scenarios share residues and qualities and
    # differ only in which ends may be trimmed. A fresh list is built for
    # every scenario.
    residues = 'atatatatatatatatatatatata'

    def noisy_quals():
        return [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40,
                10, 10, 21, 3, 40, 9, 9, 12, 10, 9]

    survivor = passed_after_trim(wrap_record(residues, noisy_quals()),
                                 25)[0][0]
    assert get_qualities(survivor) == [40, 4, 27, 38, 40]

    survivor = passed_after_trim(wrap_record(residues, noisy_quals()), 25,
                                 trim_left=False)[0][0]
    assert get_qualities(survivor) == [40, 40, 13, 11, 40, 9, 40, 4, 27, 38,
                                       40]

    survivor = passed_after_trim(wrap_record(residues, noisy_quals()), 25,
                                 trim_right=False)[0][0]
    assert get_qualities(survivor) == [40, 4, 27, 38, 40, 4, 11, 40, 40, 10,
                                       10, 21, 3, 40, 9, 9, 12, 10, 9]

    quals = noisy_quals()
    survivor = passed_after_trim(wrap_record(residues, quals), 25,
                                 trim_right=False, trim_left=False)[0][0]
    assert get_qualities(survivor) == quals

    # With SeqItems
    item = SeqItem('s', ['@s\n', 'atatatatatatatatatatatata\n', '\n',
                         'II.,I*I%<GI%,II++6$I**-+*\n'])
    survivor = passed_after_trim(SeqWrapper(SEQITEM, item, 'fastq'), 25,
                                 trim_right=True, trim_left=False)[0][0]
    assert survivor.object.lines[3] == 'II.,I*I%<GI\n'