Esempio n. 1
0
    def test_quality_filter(self):
        'FilterByQuality selects the reads according to a quality threshold'
        def build_read(name, phred_quality):
            'It wraps a SeqRecord that carries the given phred qualities'
            record = SeqRecord(
                Seq('AAcTg'), id=name,
                letter_annotations={'phred_quality': phred_quality})
            return SeqWrapper(SEQRECORD, record, None)

        first = build_read('seq1', [42, 42, 40, 42, 40])
        second = build_read('seq2', [40, 40, 42, 40, 42])
        packet = {SEQS_PASSED: [[first], [second]], SEQS_FILTERED_OUT: []}

        # Each case: the filter arguments and the names expected to pass.
        cases = (
            ({'threshold': 41}, ['seq1']),
            ({'threshold': 41, 'reverse': True}, ['seq2']),
            ({'threshold': 41.5, 'ignore_masked': True}, ['seq1']),
        )
        for filter_kwargs, expected_names in cases:
            quality_filter = FilterByQuality(**filter_kwargs)
            surviving = _seqs_to_names(quality_filter(packet)[SEQS_PASSED])
            assert surviving == expected_names
Esempio n. 2
0
    def test_dust_filter():
        'FilterDustComplexity splits the reads; reverse swaps the outcome'
        repetitive = SeqWrapper(
            SEQRECORD,
            SeqRecord(
                Seq('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAA'),
                id='seq1'),
            None)
        varied = SeqWrapper(
            SEQRECORD,
            SeqRecord(
                Seq('CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
                id='seq2'),
            None)
        packet = {SEQS_PASSED: [[repetitive], [varied]],
                  SEQS_FILTERED_OUT: []}

        # (filter arguments, name expected to pass, name expected to fail);
        # the second case is the reversed filter.
        cases = (({}, 'seq2', 'seq1'),
                 ({'reverse': True}, 'seq1', 'seq2'))
        for filter_kwargs, kept, dropped in cases:
            result = FilterDustComplexity(**filter_kwargs)(packet)
            assert len(result[SEQS_PASSED]) == 1
            assert len(result[SEQS_FILTERED_OUT]) == 1

            assert _seqs_to_names(result[SEQS_PASSED])[0] == kept
            assert _seqs_to_names(result[SEQS_FILTERED_OUT])[0] == dropped
Esempio n. 3
0
    def test_filter_by_read_count(self):
        'FilterByRpkm filters the seqs using their mapped read counts'
        first = SeqWrapper(SEQRECORD,
                           SeqRecord(Seq('T' * 1000), id='seq1'), None)
        second = SeqWrapper(SEQRECORD,
                            SeqRecord(Seq('A' * 1000), id='seq2'), None)
        packet = {SEQS_PASSED: [[first], [second]], SEQS_FILTERED_OUT: []}

        counts = {
            'seq1': {'mapped_reads': 10,
                     'unmapped_reads': 999989,
                     'length': len(first.object)},
            'seq2': {'mapped_reads': 1,
                     'unmapped_reads': 0,
                     'length': len(second.object)},
        }

        # with a threshold of 2 only seq1 is expected to pass
        result = FilterByRpkm(counts, 2)(packet)
        assert _seqs_to_names(result[SEQS_FILTERED_OUT]) == ['seq2']
        assert _seqs_to_names(result[SEQS_PASSED]) == ['seq1']

        # with a threshold of 1 nothing is expected to be filtered out
        result = FilterByRpkm(counts, 1)(packet)
        assert not result[SEQS_FILTERED_OUT]

        # the reversed filter swaps the two outcomes
        result = FilterByRpkm(counts, 2, reverse=True)(packet)
        assert _seqs_to_names(result[SEQS_FILTERED_OUT]) == ['seq1']
        assert _seqs_to_names(result[SEQS_PASSED]) == ['seq2']
Esempio n. 4
0
    def test_ns_filter():
        'FilterAllNs filters out the read without any informative letter'
        blank = SeqWrapper(
            SEQRECORD,
            SeqRecord(Seq('N' * 50 + 'n' * 50 + '-' * 50 + '*' * 50),
                      id='seq1'),
            None)
        informative = SeqWrapper(
            SEQRECORD,
            SeqRecord(
                Seq('CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
                id='seq2'),
            None)
        packet = {SEQS_PASSED: [[blank], [informative]],
                  SEQS_FILTERED_OUT: []}

        # (filter arguments, name expected to pass, name expected to fail);
        # the second case is the reversed filter.
        cases = (({}, 'seq2', 'seq1'),
                 ({'reverse': True}, 'seq1', 'seq2'))
        for filter_kwargs, kept, dropped in cases:
            result = FilterAllNs(**filter_kwargs)(packet)
            assert len(result[SEQS_PASSED]) == 1
            assert len(result[SEQS_FILTERED_OUT]) == 1

            assert _seqs_to_names(result[SEQS_PASSED])[0] == kept
            assert _seqs_to_names(result[SEQS_FILTERED_OUT])[0] == dropped
Esempio n. 5
0
    def test_str_qualities(self):
        'get_str_qualities returns the quality string in the asked encoding'
        # with fasta: the record has no quality line, ValueError is expected.
        # assertRaises fails the test by itself when nothing is raised.
        seq = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fasta')
        self.assertRaises(ValueError, get_str_qualities, seq, 'fasta')

        # with fastq, no output encoding requested
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert get_str_qualities(seq) == '!???'

        # with fastq to fastq-illumina
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert get_str_qualities(seq, ILLUMINA_QUALITY) == '@^^^'

        # with fastq-illumina (single quality line), same encoding requested
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert get_str_qualities(seq, ILLUMINA_QUALITY) == '@AAABBBB'

        # with fastq-illumina (single quality line) to fastq
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert get_str_qualities(seq, 'fastq') == '!"""####'
Esempio n. 6
0
    def test_copy(self):
        'copy_seq returns a new SeqItem that holds the replaced sequence'
        # fasta: the two original sequence lines become a single new one
        original = SeqWrapper(
            SEQITEM,
            SeqItem('s1', ['>s1\n', 'ACTG\n', 'GTAC\n'],
                    annotations={'a': 'b'}),
            'fasta')
        duplicate = copy_seq(original, seq='ACTG')
        expected = SeqItem('s1', ['>s1\n', 'ACTG\n'], annotations={'a': 'b'})
        assert duplicate.object == expected
        # neither the item nor its lines are shared with the original
        assert original.object is not duplicate.object
        assert original.object.lines is not duplicate.object.lines

        # fastq: only the sequence line changes
        original = SeqWrapper(
            SEQITEM, SeqItem('seq', ['@seq\n', 'aaaa\n', '+\n', '!???\n']),
            'fastq')
        duplicate = copy_seq(original, seq='ACTG')
        expected = SeqItem('seq', ['@seq\n', 'ACTG\n', '+\n', '!???\n'])
        assert duplicate.object == expected

        # multiline fastq: the copy is expected to be single-line
        multiline = ['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n', 'BBBB\n']
        original = SeqWrapper(SEQITEM, SeqItem('seq', multiline),
                              'fastq-illumina-multiline')
        duplicate = copy_seq(original, seq='ACTGactg')
        expected = SeqItem('seq',
                           ['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
        assert duplicate.object == expected
Esempio n. 7
0
    def test_slice(self):
        'slice_seq cuts fasta and fastq SeqItems between two positions'
        # fasta: the slice spans the two sequence lines
        fasta = SeqWrapper(
            SEQITEM, SeqItem('s1', ['>s1\n', 'ACTG\n', 'GTAC\n']), 'fasta')
        wanted = SeqWrapper(SEQITEM, SeqItem('s1', ['>s1\n', 'CTGG\n']),
                            'fasta')
        assert slice_seq(fasta, 1, 5) == wanted

        # fastq: the sequence and the qualities are cut together
        fastq = SeqWrapper(
            SEQITEM, SeqItem('seq', ['@seq\n', 'aata\n', '+\n', '!?!?\n']),
            'fastq')
        piece = slice_seq(fastq, 1, 3)
        assert list(get_qualities(piece)) == [30, 0]
        assert get_str_seq(piece) == 'at'
        assert piece.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline fastq
        lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n', 'BBBB\n']
        multiline = SeqWrapper(SEQITEM, SeqItem('seq', lines),
                               'fastq-illumina-multiline')
        piece = slice_seq(multiline, 1, 5)
        assert list(get_qualities(piece)) == [1, 1, 1, 2]
        assert get_str_seq(piece) == get_str_seq(multiline)[1:5]

        # open-ended slices: None as the stop and None as the start
        open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                                'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == 'CTG'

        assert get_str_seq(slice_seq(open_ended, None, 1)) == 'a'
Esempio n. 8
0
def _build_some_paired_seqs():
    'It returns a tuple with the forward and reverse fasta reads of s1 and s2'
    # (item name, title line, sequence line) for each read, in output order
    read_specs = (
        ('s1', '>s1.f\n', 'A\n'),
        ('s1', '>s1.r\n', 'C\n'),
        ('s2', '>s2.f\n', 'T\n'),
        ('s2', '>s2.r\n', 'G\n'),
    )
    return tuple(
        SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
        for name, title, bases in read_specs)
Esempio n. 9
0
    def test_seqitem_pairs_equal(self):
        '_seqitem_pairs_equal compares two groups of fastq SeqItems'
        def fastq_read(name, bases, quals):
            'It builds a wrapped single-line fastq SeqItem'
            lines = ['@' + name + '\n', bases + '\n', '+\n', quals + '\n']
            return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fastq')

        # read1 and read3 only differ in their names
        read1 = fastq_read('seq1', 'TAATAC', 'TTTDFG')
        read2 = fastq_read('seq2', 'TCATTA', 'ABCBEG')
        read3 = fastq_read('seq3', 'TAATAC', 'TTTDFG')
        read4 = fastq_read('seq4', 'ACGCGT', 'ABCBEG')

        # (left group, right group, whether they should compare as equal)
        comparisons = (
            ((read1, read2), (read3, read2), True),
            ((read1, read2), (read2, read4), False),
            ((read1, read2), (read2, read1), False),
            ([read1], [read3], True),
            ([read1], [read2], False),
            ([read1], (read1, read2), False),
            ((read1, read2), read2, False),
        )
        for left, right, should_match in comparisons:
            assert bool(_seqitem_pairs_equal(left, right)) == should_match
Esempio n. 10
0
    def test_trim_seqs():
        '''It tests the trim seq function.

        Every section builds a trim packet, annotates it with
        TrimLowercasedLetters and applies the result with TrimOrMask. The
        last two sections reuse the same `seqs` list, so their order matters.
        '''
        # Five single-read groups of SeqRecords. Only four sequences are
        # expected back: the all-lowercase 'actg' read is missing from the
        # asserted result.
        seqs = []
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTTTC')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('CTTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('actg')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('AC')), None)])

        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        # flatten the groups into a plain list of sequence strings
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC', 'CTTC', 'CTC', 'AC']

        # The same trimmers applied to a single fasta SeqItem
        seqs = []
        seq = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC']

        # with pairs
        # The forward read of the second pair ('aa') is all lowercase; its
        # mate is expected to end up in ORPHAN_SEQS.
        seq = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
        seq1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
        seq2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
        seq3 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
        seqs = []
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta'),
                     SeqWrapper(SEQITEM, seq1, 'fasta')])
        seqs.append([SeqWrapper(SEQITEM, seq2, 'fasta'),
                     SeqWrapper(SEQITEM, seq3, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_str_seq(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == res

        # no drag
        # NOTE(review): this section builds the packet from the same `seqs`
        # list and the same default-constructed trimmers as the section
        # above; the only difference is that the orphan is checked by name.
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_name(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == res
Esempio n. 11
0
    def test_str_seq(self):
        'get_str_seq returns the sequence of fasta and fastq SeqItems'
        # fasta
        fasta = SeqWrapper(SEQITEM, SeqItem('s1', ['>s1\n', 'ACTGGTAC\n']),
                           'fasta')
        assert get_str_seq(fasta) == 'ACTGGTAC'

        # fastq: the quality line is not part of the sequence
        fastq = SeqWrapper(
            SEQITEM, SeqItem('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n']),
            'fastq')
        assert get_str_seq(fastq) == 'aaaa'
Esempio n. 12
0
 def test_pair_grouper():
     'group_seqs_in_pairs groups the forward and reverse reads of s1 and s2'
     # (item name, title line, sequence line) for each read, in input order
     read_specs = (
         ('s1', '>s1.f\n', 'A\n'),
         ('s1', '>s1.r\n', 'C\n'),
         ('s2', '>s2.f\n', 'T\n'),
         ('s2', '>s2.r\n', 'G\n'),
     )
     reads = tuple(
         SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
         for name, title, bases in read_specs)
     groups = list(group_seqs_in_pairs(reads))
     assert [get_str_seq(read) for read in groups[0]] == ['A', 'C']
     assert [get_str_seq(read) for read in groups[1]] == ['T', 'G']
     assert len(groups) == 2
Esempio n. 13
0
    def test_len(self):
        'get_length counts the sequence letters of fasta and fastq SeqItems'
        # fasta: the two sequence lines add up
        fasta = SeqWrapper(
            SEQITEM, SeqItem('s1', ['>s1\n', 'ACTG\n', 'GTAC\n']), 'fasta')
        assert get_length(fasta) == 8

        # fastq: only the sequence line counts
        fastq = SeqWrapper(
            SEQITEM, SeqItem('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n']),
            'fastq')
        assert get_length(fastq) == 4
Esempio n. 14
0
 def _some_seqs(self):
     'It returns a trim packet with two single-read SeqRecord groups.'
     # the first record also carries a dummy per-letter annotation
     annotated = SeqRecord(Seq('ACCG'), letter_annotations={'dummy': 'dddd'})
     plain = SeqRecord(Seq('AAACCCGGG'))
     groups = [[SeqWrapper(SEQRECORD, record, None)]
               for record in (annotated, plain)]
     return {SEQS_PASSED: groups, ORPHAN_SEQS: []}
Esempio n. 15
0
    def test_change_name(self):
        'copy_seq with a new name rewrites the title line of the SeqItem'
        # fastq: the expected '+' line no longer repeats the name
        fastq = SeqWrapper(
            SEQITEM,
            SeqItem('seq', ['@seq\n', 'aaaa\n', '+seq\n', '!???\n']),
            'fastq')
        renamed = copy_seq(fastq, name='seq2')
        expected_lines = ['@seq2\n', 'aaaa\n', '+\n', '!???\n']
        assert renamed.object == ('seq2', expected_lines, {})

        # fasta
        fasta = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aaaa\n']),
                           'fasta')
        renamed = copy_seq(fasta, name='seq2')
        assert renamed.object == ('seq2', ['>seq2\n', 'aaaa\n'], {})
Esempio n. 16
0
    def test_filter_by_feat_type(self):
        'FilterByFeatureTypes splits the reads using their feature types'
        orf_feature = SeqFeature(FeatureLocation(3, 4), type='ORF')
        with_orf = SeqWrapper(
            SEQRECORD,
            SeqRecord(Seq('aaaa'), id='seq1', features=[orf_feature]),
            None)
        without_orf = SeqWrapper(SEQRECORD,
                                 SeqRecord(Seq('aaaa'), id='seq2'), None)
        packet = {SEQS_PASSED: [[with_orf], [without_orf]],
                  SEQS_FILTERED_OUT: []}

        # one read is expected on each side of the filter
        result = FilterByFeatureTypes(['ORF'])(packet)
        assert len(result[SEQS_FILTERED_OUT]) == 1
        assert len(result[SEQS_PASSED]) == 1
Esempio n. 17
0
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    '''It writes the paired seqs and the orphan seqs of a file separately.

    The file in seq_fpath is indexed, its titles are split into paired and
    orphan ones, and each set is written with write_seqs in out_format: the
    paired seqs to out_fhand and the orphans to orphan_out_fhand.
    '''
    seq_index = _index_seq_file(seq_fpath)
    paired_titles, orphan_titles = _get_paired_and_orphan(seq_index)

    def wrapped(titles):
        'It lazily wraps the indexed records that belong to the titles'
        return (SeqWrapper(SEQRECORD, seq_index[title], None)
                for title in titles)

    # the paired seqs are written first, then the orphans
    destinations = ((paired_titles, out_fhand),
                    (orphan_titles, orphan_out_fhand))
    for titles, fhand in destinations:
        write_seqs(wrapped(titles), fhand, out_format)
Esempio n. 18
0
    def test_with_pairs(self):
        'FilterById treats a pair according to failed_drags_pair'
        pair = [SeqWrapper(SEQRECORD, SeqRecord(Seq('ACTG'), id=name), None)
                for name in ('seq1', 'seq2')]
        packet = {SEQS_PASSED: [pair], SEQS_FILTERED_OUT: []}
        wanted_ids = ['seq1']

        # dragging: no read of the pair is expected to pass
        dragging = FilterById(wanted_ids, failed_drags_pair=True)
        assert not _seqs_to_names(dragging(packet)[SEQS_PASSED])

        # no dragging: both reads of the pair are expected to pass
        lenient = FilterById(wanted_ids, failed_drags_pair=False)
        surviving = _seqs_to_names(lenient(packet)[SEQS_PASSED])
        assert surviving == ['seq1', 'seq2']
Esempio n. 19
0
    def test_no_name(self):
        'group_pairs_by_name copes with an unpaired read at any position'
        lonely = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

        def check_groups(reads, expected_groups):
            'It compares the first groups with the expected seq strings'
            groups = list(group_pairs_by_name(reads))
            for position, expected in enumerate(expected_groups):
                found = [get_str_seq(read) for read in groups[position]]
                assert found == expected

        # the unpaired read splits the second pair
        reads = _build_some_paired_seqs()
        reads = reads[0], reads[1], reads[2], lonely, reads[3]
        check_groups(reads, [['A', 'C'], ['T'], ['N'], ['G']])

        # the unpaired read splits the first pair
        reads = _build_some_paired_seqs()
        reads = reads[0], lonely, reads[1], reads[2], reads[3]
        check_groups(reads, [['A'], ['N'], ['C'], ['T', 'G']])

        # the unpaired read comes first, so both pairs stay together
        reads = _build_some_paired_seqs()
        reads = lonely, reads[0], reads[1], reads[2], reads[3]
        check_groups(reads, [['N'], ['A', 'C'], ['T', 'G']])
Esempio n. 20
0
    def test_blastmatch_filter():
        'FilterBlastMatch filters the reads that match a blast database'
        db_path = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')

        matching = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
        query = 'ATCATGTAGTTACACATGAACACACACATG' + matching
        read = SeqWrapper(SEQRECORD, SeqRecord(Seq(query), id='seq'), None)
        packet = {SEQS_PASSED: [[read]], SEQS_FILTERED_OUT: []}

        def expect_only_filters():
            'It returns a fresh filter list that only limits the expect'
            return [{'kind': 'score_threshold', 'score_key': 'expect',
                     'max_score': 1e-28}]

        # expect, identity and length filters: the read should not pass
        match_filters = [
            {'kind': 'score_threshold', 'score_key': 'expect',
             'max_score': 0.001},
            {'kind': 'score_threshold', 'score_key': 'identity',
             'min_score': 80},
            {'kind': 'min_length', 'min_percentage': 60,
             'length_in_query': True},
        ]
        blast_filter = FilterBlastMatch(db_path, 'blastn',
                                        filters=match_filters, dbtype=NUCL)
        assert blast_filter(packet)[SEQS_PASSED] == []

        # only a very strict expect: the read should pass
        blast_filter = FilterBlastMatch(db_path, 'blastn',
                                        expect_only_filters())
        assert len(blast_filter(packet)[SEQS_PASSED]) == 1

        # the same strict expect, reversed: the read should be filtered out
        blast_filter = FilterBlastMatch(db_path, 'blastn',
                                        expect_only_filters(), reverse=True)
        result = blast_filter(packet)
        assert result[SEQS_PASSED] == []
        assert len(result[SEQS_FILTERED_OUT]) == 1
Esempio n. 21
0
    def test_seq_list_filter(self):
        'FilterById selects the reads whose ids are in a given collection'
        groups = [[SeqWrapper(SEQRECORD, SeqRecord(Seq('ACTG'), id=name),
                              None)]
                  for name in ('seq1', 'seq2')]
        packet = {SEQS_PASSED: groups, SEQS_FILTERED_OUT: []}
        wanted_ids = ['seq1']

        # ids given as a list
        keep_listed = FilterById(wanted_ids)
        assert _seqs_to_names(keep_listed(packet)[SEQS_PASSED]) == ['seq1']

        # ids given as a set, with the filter reversed
        drop_listed = FilterById(set(wanted_ids), reverse=True)
        assert _seqs_to_names(drop_listed(packet)[SEQS_PASSED]) == ['seq2']
Esempio n. 22
0
    def test_blast_short_trimming(self):
        'TrimWithBlastShort annotates where the oligos match in each read'
        oligo_bases = ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                       'AAGCAGTGGTATCAACGCAGAGTACTTTTT')

        def vector_recommendations(trim_packet):
            'It lists the VECTOR trimming recommendations of every read'
            return [
                get_annotations(read).get(TRIMMING_RECOMMENDATIONS,
                                          {}).get(VECTOR, [])
                for group in trim_packet[SEQS_PASSED] for read in group
            ]

        # With SeqRecords
        adaptors = [SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)
                    for bases in oligo_bases]
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        seq_packets = read_seq_packets([StringIO(FASTQ4)],
                                       prefered_seq_classes=[SEQRECORD])
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        # It should trim the first and the second reads.
        trimmed = blast_trim(first_packet)
        assert vector_recommendations(trimmed) == [[(0, 29)], [(0, 29)], []]

        # With SeqItems
        adaptors = []
        for name, bases in zip(('oligo1', 'oligo2'), oligo_bases):
            item = SeqItem(name, ['>' + name + '\n', bases + '\n'])
            adaptors.append(SeqWrapper(SEQITEM, item, 'fasta'))
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        seq_packets = list(
            read_seq_packets([StringIO(FASTQ4)],
                             prefered_seq_classes=[SEQITEM]))
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        # It should trim the first and the second reads.
        trimmed = blast_trim(first_packet)
        assert vector_recommendations(trimmed) == [[(0, 29)], [(0, 29)], []]
Esempio n. 23
0
 def create_seq(index):
     'It returns a wrapped SeqRecord: two random 100 nt flanks and a linker'
     # the left flank is drawn before the right one
     left_flank = ''.join(choice('ACTG') for _ in range(100))
     right_flank = ''.join(choice('ACTG') for _ in range(100))
     record = SeqRecord(id='seq_' + str(index),
                        seq=Seq(left_flank + linker + right_flank))
     return SeqWrapper(SEQRECORD, record, None)
Esempio n. 24
0
 def xtest_blaster(self):
     'It checks the segments that Blaster matches for a remote blastn on nr'
     # NOTE(review): the x prefix presumably keeps test collectors from
     # running this remote query by default -- confirm before renaming.
     seq = 'GAGAAATTCCTTTGGAAGTTATTCCGTAGCATAAGAGCTGAAACTTCAGAGCAAGTTT'
     seq += 'TCATTGGGCAAAATGGGGGAACAACCTATCTTCAGCACTCGAGCTCATGTCTTCCAAATTGA'
     seq += 'CCCAAACACAAAGAAGAACTGGGTACCCACCAGCAAGCATGCAGTTACTGTGTCTTATTTCT'
     seq += 'ATGACAGCACAAGAAATGTGTATAGGATAATCAGTTTAGATGGCTCAAAGGCAATAATAAAT'
     seq += 'AGTACCATCACCCCAAACATGACA'
     seqrec = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
     blaster = Blaster([seqrec], 'nr', 'blastn', remote=True)
     # The segments are fetched once; the former debug print repeated the
     # lookup and wrote to stdout on every run.
     matched_segments = blaster.get_matched_segments('seq')
     assert matched_segments == [(1, 1740)]
Esempio n. 25
0
    def test_int_qualities(self):
        'get_int_qualities returns the qualities of a SeqItem as integers'
        # with fasta: there is no quality line, AttributeError is expected.
        # assertRaises fails the test by itself when nothing is raised.
        seq = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fasta')
        self.assertRaises(AttributeError, get_int_qualities, seq)

        # with fastq
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert list(get_int_qualities(seq)) == [0, 30, 30, 30]

        # with fastq-illumina
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert list(get_int_qualities(seq)) == [0, 1, 1, 1, 2, 2, 2, 2]
Esempio n. 26
0
    def test_dustscore_calculation():
        '''It calculates the dust score.

        Every sequence is scored as it is and also repeated three and four
        times; each score has to be within 0.01 of the expected value.
        '''
        seqs = ['TTTTTTTTTTTTTTTTTTTTTTTTTTTT', 'TATATATATATATATATATATATATATA',
                'GAAGAAGAAGAAGAAGAAGAAGAAGAAG', 'AACTGCAGTCGATGCTGATTCGATCGAT',
                'AACTGAAAAAAAATTTTTTTAAAAAAAA']

        # expected scores for the plain, the x3 and the x4 sequences
        scores = [100, 48, 30.76, 4.31, 23.38]
        scoresx3 = [100, 48.68, 28.65, 5.62, 27.53]
        scoresx4 = [100, 48.55, 28.25, 5.79, 28.00]
        for seq, score, scorex3, scorex4 in zip(seqs, scores, scoresx3,
                                                scoresx4):
            for repeats, expected in ((1, score), (3, scorex3), (4, scorex4)):
                seqrec = SeqRecord(Seq(seq * repeats))
                seqrec = SeqWrapper(SEQRECORD, seqrec, None)
                # abs() makes the tolerance two-sided: without it any score
                # below the expected one passed, however far off it was.
                assert abs(calculate_dust_score(seqrec) - expected) < 0.01
Esempio n. 27
0
 def test_bam_filter():
     'FilterByBam splits the reads using the test seqs.bam file'
     # seq16 .. seq22, one read per group
     read_groups = []
     for number in range(16, 23):
         record = SeqRecord(seq=Seq('aaa'), id='seq{}'.format(number))
         read_groups.append([SeqWrapper(SEQRECORD, record, None)])

     bam_filter = FilterByBam([os.path.join(TEST_DATA_DIR, 'seqs.bam')])
     result = bam_filter({SEQS_PASSED: read_groups, SEQS_FILTERED_OUT: []})

     kept_names = _seqs_to_names(result[SEQS_PASSED])
     assert kept_names == ['seq16', 'seq17', 'seq18']
     dropped_names = _seqs_to_names(result[SEQS_FILTERED_OUT])
     assert dropped_names == ['seq19', 'seq20', 'seq21', 'seq22']
Esempio n. 28
0
    def test_blastmatch_filter():
        'FilterBlastShort filters out the read that contains the oligo'
        insert_bases = ('CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTT'
                        'CACCCTCT')
        oligo_bases = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'

        oligo = SeqWrapper(SEQRECORD, SeqRecord(Seq(oligo_bases)), None)
        clean_read = SeqWrapper(SEQRECORD,
                                SeqRecord(Seq(insert_bases), id='seq'), None)
        # the same read followed by the oligo
        tainted_read = SeqWrapper(
            SEQRECORD,
            SeqRecord(Seq(insert_bases + oligo_bases), id='seq_oligo'),
            None)

        packet = {SEQS_PASSED: [[clean_read], [tainted_read]],
                  SEQS_FILTERED_OUT: []}

        result = FilterBlastShort([oligo])(packet)
        # every group holds a single read, so its first read names it
        kept_names = [get_name(group[0]) for group in result[SEQS_PASSED]]
        dropped_names = [get_name(group[0])
                         for group in result[SEQS_FILTERED_OUT]]
        assert kept_names == ['seq']
        assert dropped_names == ['seq_oligo']
Esempio n. 29
0
    def __call__(self, seqs):
        '''It orientates seqs, that should have a SeqRecord in it.

        The annotators configured in self._annotators are run in order. Each
        one only analyzes the seqs whose orientation is still undecided. The
        seqs guessed as reversed (orientation -1) are replaced by their
        reverse complement, with the reason appended to the description.

        seqs has to support len() and each item has to expose the SeqRecord
        as .object. It returns a list with one seq per input seq, in the
        same order.
        '''
        orientations = None
        # name of the annotator that marked each seq as reversed, if any
        orientation_log = [None] * len(seqs)

        for annotator in self._annotators:
            if orientations:
                # keep only the seqs without a truthy orientation so far
                # NOTE(review): `not o` would also select an orientation of
                # 0, while the loop below only fills the entries that are
                # None -- confirm _guess_orientations never returns 0.
                to_annalyze = [not o for o in orientations]
                seqs_to_analyze = list(compress(seqs, to_annalyze))
            else:
                # first annotator (or empty seqs): everything is analyzed
                orientations = [None] * len(seqs)
                seqs_to_analyze = seqs

            # each annotator configuration is a dict with a 'name' and an
            # optional 'blastdb'; the loop variable is then rebound to the
            # annotator built from it
            annotator_name = annotator['name']
            blastdb = annotator.get('blastdb', None)
            annotator = self._get_annotator(annotator_name, blastdb)

            annot_seqrecords = annotator(seqs_to_analyze)
            annot_strands = self._guess_orientations(annot_seqrecords,
                                                     annotator_name,
                                                     blastdb=blastdb)

            if blastdb:
                # the logged reason also names the blast database
                annotator_name += ' ' + os.path.basename(blastdb)

            # annot_strands only covers the analyzed seqs, so a separate
            # index walks it while the loop walks every seq
            analyzed_seqs_index = 0
            for index, orientation in enumerate(orientations):
                if orientation is None:
                    orientations[index] = annot_strands[analyzed_seqs_index]
                    if annot_strands[analyzed_seqs_index] == -1:  # reverse
                        orientation_log[index] = annotator_name
                    analyzed_seqs_index += 1
        # Now we reverse the seqs that we have guess that are reversed
        reorientated_seqrecords = []
        for orientation, seq, reason in zip(orientations, seqs,
                                            orientation_log):
            if orientation == -1:
                # the True flags ask reverse_complement to keep the id,
                # description, annotations, features, dbxrefs and name
                rev_seqrecord = seq.object.reverse_complement(id=True,
                                                              description=True,
                                                              annotations=True,
                                                              features=True,
                                                              dbxrefs=True,
                                                              name=True)
                seq = SeqWrapper(SEQRECORD, rev_seqrecord, None)
                # we mark the reason why it has been reversed
                text = '(reversed because of: {})'.format(reason)
                append_to_description(seq, text)

            reorientated_seqrecords.append(seq)
        return reorientated_seqrecords
Esempio n. 30
0
def alignedread_to_seqitem(aligned_read, start_pos=0, end_pos=None):
    '''It converts an aligned read into a SeqItem inside a SeqWrapper.

    The read sequence, and its qualities if there are any, are sliced with
    start_pos and end_pos. For reads flagged as is_reverse the sliced
    sequence is reverse complemented and the sliced qualities are reversed.
    Reads without qualities become fasta items, the others fastq items.

    It returns None when the read, or its sequence, is None.
    '''
    if aligned_read is None or aligned_read.seq is None:
        return None

    read_name = aligned_read.qname
    on_reverse_strand = aligned_read.is_reverse

    bases = aligned_read.seq[start_pos: end_pos]
    if on_reverse_strand:
        bases = _reverse(_complementary(bases))

    qualities = aligned_read.qual
    if qualities is None:
        # no qualities available: a two-line fasta record
        fasta_lines = ['>' + read_name + '\n', bases + '\n']
        return SeqWrapper(SEQITEM, SeqItem(read_name, fasta_lines), 'fasta')

    # the qualities follow the sequence: same slice, same orientation
    qualities = qualities[start_pos: end_pos]
    if on_reverse_strand:
        qualities = _reverse(qualities)
    fastq_lines = ['@' + read_name + '\n', bases + '\n', '+\n',
                   qualities + '\n']
    return SeqWrapper(SEQITEM, SeqItem(read_name, fastq_lines), 'fastq')