Example #1
0
    def test_ns_filter():
        'It checks that FilterAllNs separates the seq made only of N, n, - and *'
        only_ns = SeqRecord(Seq('N' * 50 + 'n' * 50 + '-' * 50 + '*' * 50),
                            id='seq1')
        regular = SeqRecord(
            Seq('CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
            id='seq2')
        packet = {SEQS_PASSED: [[SeqWrapper(SEQRECORD, only_ns, None)],
                                [SeqWrapper(SEQRECORD, regular, None)]],
                  SEQS_FILTERED_OUT: []}

        # Each run: (filter options, name expected to pass, name expected to
        # be filtered out). The second run is the reversed filter.
        runs = (({}, 'seq2', 'seq1'),
                ({'reverse': True}, 'seq1', 'seq2'))
        for options, passed_name, failed_name in runs:
            result = FilterAllNs(**options)(packet)
            assert len(result[SEQS_PASSED]) == 1
            assert len(result[SEQS_FILTERED_OUT]) == 1

            assert _seqs_to_names(result[SEQS_PASSED])[0] == passed_name
            assert _seqs_to_names(result[SEQS_FILTERED_OUT])[0] == failed_name
Example #2
0
    def test_filter_by_read_count(self):
        'It checks FilterByRpkm with two thresholds and with the reversed filter'
        t_seq = SeqWrapper(SEQRECORD, SeqRecord(Seq('T' * 1000), id='seq1'),
                           None)
        a_seq = SeqWrapper(SEQRECORD, SeqRecord(Seq('A' * 1000), id='seq2'),
                           None)
        packet = {SEQS_PASSED: [[t_seq], [a_seq]], SEQS_FILTERED_OUT: []}

        # Per-seq read counts handed to the filter, keyed by seq name
        counts = {
            'seq1': {'mapped_reads': 10,
                     'unmapped_reads': 999989,
                     'length': len(t_seq.object)},
            'seq2': {'mapped_reads': 1,
                     'unmapped_reads': 0,
                     'length': len(a_seq.object)},
        }

        # threshold 2: only seq1 passes
        result = FilterByRpkm(counts, 2)(packet)
        assert _seqs_to_names(result[SEQS_FILTERED_OUT]) == ['seq2']
        assert _seqs_to_names(result[SEQS_PASSED]) == ['seq1']

        # threshold 1: nothing is filtered out
        result = FilterByRpkm(counts, 1)(packet)
        assert not result[SEQS_FILTERED_OUT]

        # threshold 2, reversed: the outcome is swapped
        result = FilterByRpkm(counts, 2, reverse=True)(packet)
        assert _seqs_to_names(result[SEQS_FILTERED_OUT]) == ['seq1']
        assert _seqs_to_names(result[SEQS_PASSED]) == ['seq2']
Example #3
0
def _build_some_paired_seqs():
    'It returns a tuple with four one-base fasta SeqItems: s1.f, s1.r, s2.f, s2.r'
    # (name, title line, sequence line) for every seq, in output order
    specs = (
        ('s1', '>s1.f\n', 'A\n'),
        ('s1', '>s1.r\n', 'C\n'),
        ('s2', '>s2.f\n', 'T\n'),
        ('s2', '>s2.r\n', 'G\n'),
    )
    return tuple(
        SeqWrapper(SEQITEM, SeqItem(name, [title, residues]), 'fasta')
        for name, title, residues in specs
    )
Example #4
0
    def test_copy(self):
        'It checks that copy_seq builds a new seq with the sequence replaced'
        # with fasta: the two sequence lines become a single one and the
        # annotations are kept
        original = SeqWrapper(SEQITEM,
                              SeqItem(name='s1',
                                      lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                                      annotations={'a': 'b'}),
                              'fasta')
        copied = copy_seq(original, seq='ACTG')
        expected = SeqItem(name='s1',
                           lines=['>s1\n', 'ACTG\n'],
                           annotations={'a': 'b'})
        assert copied.object == expected
        # neither the SeqItem nor its lines are shared with the original
        assert original.object is not copied.object
        assert original.object.lines is not copied.object.lines

        # with fastq and fastq-illumina: only the sequence line changes
        quality_cases = (
            ('fastq', ['@seq\n', 'aaaa\n', '+\n', '!???\n'],
             'ACTG', ['@seq\n', 'ACTG\n', '+\n', '!???\n']),
            ('fastq-illumina', ['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'],
             'ACTGactg', ['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n']),
        )
        for file_format, lines, new_seq, new_lines in quality_cases:
            original = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                                  file_format)
            copied = copy_seq(original, seq=new_seq)
            assert copied.object == SeqItem(name='seq', lines=new_lines)
Example #5
0
    def test_seqitem_pairs_equal(self):
        'It checks _seqitem_pairs_equal with pairs, single seqs and mixed input'

        def fastq_seq(name, residues, quals):
            'It wraps a four line fastq SeqItem'
            lines = ['@' + name + '\n', residues + '\n', '+\n', quals + '\n']
            return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fastq')

        read1 = fastq_seq('seq1', 'TAATAC', 'TTTDFG')
        read2 = fastq_seq('seq2', 'TCATTA', 'ABCBEG')
        # same sequence and qualities as read1, only the name differs
        read3 = fastq_seq('seq3', 'TAATAC', 'TTTDFG')
        read4 = fastq_seq('seq4', 'ACGCGT', 'ABCBEG')

        reference = (read1, read2)
        unrelated = (read2, read4)
        renamed_twin = (read3, read2)
        swapped = (read2, read1)

        assert _seqitem_pairs_equal(reference, renamed_twin)
        assert not _seqitem_pairs_equal(reference, unrelated)
        assert not _seqitem_pairs_equal(reference, swapped)
        # single seqs
        assert _seqitem_pairs_equal([read1], [read3])
        assert not _seqitem_pairs_equal([read1], [read2])
        # a different number of seqs, or a lone seq instead of a pair
        assert not _seqitem_pairs_equal([read1], reference)
        assert not _seqitem_pairs_equal(reference, read2)
Example #6
0
    def test_dust_filter():
        'It tests the complexity filter'
        records = [
            # a run of Ts followed by a run of As
            SeqRecord(
                Seq('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAA'),
                id='seq1'),
            SeqRecord(
                Seq('CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
                id='seq2'),
        ]
        packet = {SEQS_PASSED: [[SeqWrapper(SEQRECORD, record, None)]
                                for record in records],
                  SEQS_FILTERED_OUT: []}

        def check(dust_filter, passed_name, failed_name):
            'It runs the filter and checks which seq ends up on each side'
            result = dust_filter(packet)
            assert len(result[SEQS_PASSED]) == 1
            assert len(result[SEQS_FILTERED_OUT]) == 1

            assert _seqs_to_names(result[SEQS_PASSED])[0] == passed_name
            assert _seqs_to_names(result[SEQS_FILTERED_OUT])[0] == failed_name

        check(FilterDustComplexity(), 'seq2', 'seq1')

        # reverse
        check(FilterDustComplexity(reverse=True), 'seq1', 'seq2')
Example #7
0
    def test_str_qualities(self):
        'It checks get_str_qualities with fasta, fastq and fastq-illumina seqs'
        # with fasta: the SeqItem has no quality line and the call is
        # expected to raise a ValueError
        seq = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fasta')
        with self.assertRaises(ValueError):
            get_str_qualities(seq, 'fasta')

        # with fastq: without a requested format the quality line comes back
        # as it is stored
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert get_str_qualities(seq) == '!???'

        # with fastq to fastq-illumina
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert get_str_qualities(seq, ILLUMINA_QUALITY) == '@^^^'

        # with fastq-illumina, asking for the same encoding
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert get_str_qualities(seq, ILLUMINA_QUALITY) == '@AAABBBB'

        # with fastq-illumina to fastq
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert get_str_qualities(seq, 'fastq') == '!"""####'
Example #8
0
    def test_slice(self):
        'It checks slice_seq with fasta and fastq SeqItems and open limits'
        # with fasta
        fasta = SeqWrapper(SEQITEM,
                           SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']),
                           'fasta')
        expected = SeqWrapper(SEQITEM,
                              SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
                              'fasta')
        assert slice_seq(fasta, 1, 5) == expected

        # with fastq: the sequence and the quality lines are sliced together
        fastq = SeqWrapper(SEQITEM,
                           SeqItem(name='seq',
                                   lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
                           'fastq')
        sliced = slice_seq(fastq, 1, 3)
        assert list(get_int_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # with fastq-illumina
        illumina = SeqWrapper(SEQITEM,
                              SeqItem(name='seq',
                                      lines=['@seq\n', 'aaatcaaa\n', '+\n',
                                             '@AAABBBB\n']),
                              'fastq-illumina')
        sliced = slice_seq(illumina, 1, 5)
        assert list(get_int_qualities(sliced)) == [1, 1, 1, 2]
        assert get_str_seq(sliced) == get_str_seq(illumina)[1:5]

        # A None start or stop leaves that end of the seq open
        open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                                'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == 'CTG'

        assert get_str_seq(slice_seq(open_ended, None, 1)) == 'a'
Example #9
0
    def test_str_seq(self):
        'It checks that get_str_seq returns the sequence of fasta and fastq seqs'
        cases = (
            # with fasta
            ('fasta',
             SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']),
             'ACTGGTAC'),
            # with fastq
            ('fastq',
             SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '????\n']),
             'aaaa'),
        )
        for file_format, item, residues in cases:
            wrapped = SeqWrapper(SEQITEM, item, file_format)
            assert get_str_seq(wrapped) == residues
Example #10
0
 def _some_seqs(self):
     'It returns a trim packet with two single-seq groups of SeqRecords.'
     # the first record carries a dummy per-letter annotation
     annotated = SeqRecord(Seq('ACCG'), letter_annotations={'dummy': 'dddd'})
     plain = SeqRecord(Seq('AAACCCGGG'))
     groups = [[SeqWrapper(SEQRECORD, record, None)]
               for record in (annotated, plain)]
     return {SEQS_PASSED: groups, ORPHAN_SEQS: []}
Example #11
0
    def test_change_name(self):
        'It checks that copy_seq with a new name rewrites the title line'
        # fastq: the name repeated in the "+" line is gone in the copy
        fastq = SeqWrapper(SEQITEM,
                           SeqItem(name='seq',
                                   lines=['@seq\n', 'aaaa\n', '+seq\n',
                                          '!???\n']),
                           'fastq')
        renamed = copy_seq(fastq, name='seq2')
        expected = ('seq2', ['@seq2\n', 'aaaa\n', '+\n', '!???\n'], {})
        assert renamed.object == expected

        # fasta
        fasta = SeqWrapper(SEQITEM,
                           SeqItem(name='seq', lines=['>seq\n', 'aaaa\n']),
                           'fasta')
        renamed = copy_seq(fasta, name='seq2')
        assert renamed.object == ('seq2', ['>seq2\n', 'aaaa\n'], {})
Example #12
0
    def test_filter_by_feat_type(self):
        'It checks FilterByFeatureTypes with one seq with an ORF and one without'
        orf_feature = SeqFeature(FeatureLocation(3, 4), type='ORF')
        records = (SeqRecord(Seq('aaaa'), id='seq1', features=[orf_feature]),
                   SeqRecord(Seq('aaaa'), id='seq2'))
        packet = {SEQS_PASSED: [[SeqWrapper(SEQRECORD, record, None)]
                                for record in records],
                  SEQS_FILTERED_OUT: []}

        # one seq is expected on each side of the filter
        filtered_packet = FilterByFeatureTypes(['ORF'])(packet)
        assert len(filtered_packet[SEQS_FILTERED_OUT]) == 1
        assert len(filtered_packet[SEQS_PASSED]) == 1
Example #13
0
    def test_with_pairs(self):
        'It checks how FilterById treats a pair when only one mate is listed'
        mates = [SeqWrapper(object=SeqRecord(Seq('ACTG'), id=name),
                            kind=SEQRECORD, file_format=None)
                 for name in ('seq1', 'seq2')]
        packet = {SEQS_PASSED: [mates], SEQS_FILTERED_OUT: []}
        wanted_ids = ['seq1']

        # the mate that fails drags the whole pair out
        dragging_filter = FilterById(wanted_ids, failed_drags_pair=True)
        assert not _seqs_to_names(dragging_filter(packet)[SEQS_PASSED])

        # without dragging, both mates stay
        lenient_filter = FilterById(wanted_ids, failed_drags_pair=False)
        kept = _seqs_to_names(lenient_filter(packet)[SEQS_PASSED])
        assert kept == ['seq1', 'seq2']
Example #14
0
    def test_no_name(self):
        'It groups pairs when an extra seq without mate sits among them'
        # seq with no .f/.r suffix in its title
        loner = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

        def check_groups(seqs, expected):
            'It checks the sequences found in the first len(expected) groups'
            groups = list(group_pairs_by_name(seqs))
            for index, residues in enumerate(expected):
                assert [get_str_seq(s) for s in groups[index]] == residues

        # the loner between the mates of the second pair
        fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
        check_groups((fwd1, rev1, fwd2, loner, rev2),
                     [['A', 'C'], ['T'], ['N'], ['G']])

        # the loner between the mates of the first pair
        fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
        check_groups((fwd1, loner, rev1, fwd2, rev2),
                     [['A'], ['N'], ['C'], ['T', 'G']])

        # the loner before both pairs
        fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
        check_groups((loner, fwd1, rev1, fwd2, rev2),
                     [['N'], ['A', 'C'], ['T', 'G']])
Example #15
0
    def test_blastmatch_filter():
        'it test filter by blast'
        blastdb = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')

        match = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
        query = 'ATCATGTAGTTACACATGAACACACACATG' + match
        record = SeqRecord(Seq(query), id='seq')
        wrapped = SeqWrapper(object=record, kind=SEQRECORD, file_format=None)
        packet = {SEQS_PASSED: [[wrapped]], SEQS_FILTERED_OUT: []}

        def expect_only():
            'It returns a fresh filter list with just the 1e-28 expect limit'
            return [dict(kind='score_threshold', score_key='expect',
                         max_score=1e-28)]

        # expect, identity and length filters together: the seq does not pass
        combined = [
            dict(kind='score_threshold', score_key='expect', max_score=0.001),
            dict(kind='score_threshold', score_key='identity', min_score=80),
            dict(kind='min_length', min_percentage=60, length_in_query=True),
        ]
        blast_filter = FilterBlastMatch(blastdb, 'blastn', filters=combined,
                                        dbtype=NUCL)
        assert blast_filter(packet)[SEQS_PASSED] == []

        # with only the 1e-28 expect limit the seq passes
        blast_filter = FilterBlastMatch(blastdb, 'blastn', expect_only())
        assert len(blast_filter(packet)[SEQS_PASSED]) == 1

        # and reversing that filter sends it to the filtered out side
        blast_filter = FilterBlastMatch(blastdb, 'blastn', expect_only(),
                                        reverse=True)
        result = blast_filter(packet)
        assert result[SEQS_PASSED] == []
        assert len(result[SEQS_FILTERED_OUT]) == 1
Example #16
0
    def test_seq_list_filter(self):
        'It filters the reads given a list of ids'
        wrapped = [SeqWrapper(object=SeqRecord(Seq('ACTG'), id=name),
                              kind=SEQRECORD, file_format=None)
                   for name in ('seq1', 'seq2')]
        packet = {SEQS_PASSED: [[seq] for seq in wrapped],
                  SEQS_FILTERED_OUT: []}
        ids = ['seq1']

        # only the listed seq passes
        keep_listed = FilterById(ids)
        assert _seqs_to_names(keep_listed(packet)[SEQS_PASSED]) == ['seq1']

        # reversed, and with the ids given as a set
        drop_listed = FilterById(set(ids), reverse=True)
        assert _seqs_to_names(drop_listed(packet)[SEQS_PASSED]) == ['seq2']
Example #17
0
    def test_blast_short_trimming(self):
        'It trims oligos using blast-short'

        def vector_recommendations(trim_packet):
            'It lists the vector trimming recommendations of every passed seq'
            found = []
            for group in trim_packet[SEQS_PASSED]:
                for seq in group:
                    by_kind = get_annotations(seq).get(
                        TRIMMING_RECOMMENDATIONS, {})
                    found.append(by_kind.get(VECTOR, []))
            return found

        oligo_seqs = ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                      'AAGCAGTGGTATCAACGCAGAGTACTTTTT')

        # With SeqRecords
        adaptors = [SeqWrapper(SEQRECORD, SeqRecord(Seq(oligo)), None)
                    for oligo in oligo_seqs]
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        seq_packets = read_seq_packets([StringIO(FASTQ4)],
                                       prefered_seq_classes=[SEQRECORD])
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        trimmed = blast_trim(first_packet)
        # It should trim the first and the second reads.
        assert vector_recommendations(trimmed) == [[(0, 29)], [(0, 29)], []]

        # With SeqItems
        adaptors = [
            SeqWrapper(SEQITEM,
                       SeqItem(name, ['>' + name + '\n', oligo + '\n']),
                       'fasta')
            for name, oligo in zip(('oligo1', 'oligo2'), oligo_seqs)
        ]
        blast_trim = TrimWithBlastShort(oligos=adaptors)
        seq_packets = list(
            read_seq_packets([StringIO(FASTQ4)],
                             prefered_seq_classes=[SEQITEM]))
        first_packet = list(seq_to_trim_packets(seq_packets))[0]
        trimmed = blast_trim(first_packet)
        # It should trim the first and the second reads.
        assert vector_recommendations(trimmed) == [[(0, 29)], [(0, 29)], []]
Example #18
0
 def create_seq(index):
     'It creates a random seq with a linker'
     # two random 100 base flanks with the linker from the enclosing scope
     # placed between them
     left_flank = ''.join(choice('ACTG') for _ in range(100))
     right_flank = ''.join(choice('ACTG') for _ in range(100))
     record = SeqRecord(id='seq_' + str(index),
                        seq=Seq(left_flank + linker + right_flank))
     return SeqWrapper(SEQRECORD, record, None)
Example #19
0
 def xtest_blaster(self):
     'It checks the segments matched by a remote blastn search against nr'
     # NOTE(review): the "x" prefix presumably keeps the test runner from
     # collecting this test, since it needs a remote search -- confirm.
     seq = 'GAGAAATTCCTTTGGAAGTTATTCCGTAGCATAAGAGCTGAAACTTCAGAGCAAGTTT'
     seq += 'TCATTGGGCAAAATGGGGGAACAACCTATCTTCAGCACTCGAGCTCATGTCTTCCAAATTGA'
     seq += 'CCCAAACACAAAGAAGAACTGGGTACCCACCAGCAAGCATGCAGTTACTGTGTCTTATTTCT'
     seq += 'ATGACAGCACAAGAAATGTGTATAGGATAATCAGTTTAGATGGCTCAAAGGCAATAATAAAT'
     seq += 'AGTACCATCACCCCAAACATGACA'
     seqrec = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
     blaster = Blaster([seqrec], 'nr', 'blastn', remote=True)
     # The segments are requested once and reused for the debug output and
     # for the check. The parenthesised print of a single value gives the
     # same output with the print statement and with the print function.
     segments = blaster.get_matched_segments('seq')
     print(segments)
     assert segments == [(1, 1740)]
Example #20
0
    def test_int_qualities(self):
        'It checks get_int_qualities with fasta, fastq and fastq-illumina seqs'
        # with fasta: the SeqItem has no quality line and the call is
        # expected to raise an AttributeError
        seq = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fasta')
        with self.assertRaises(AttributeError):
            get_int_qualities(seq)

        # with fastq
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert list(get_int_qualities(seq)) == [0, 30, 30, 30]

        # with fastq-illumina
        seq = SeqItem(name='seq',
                      lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq-illumina')
        assert list(get_int_qualities(seq)) == [0, 1, 1, 1, 2, 2, 2, 2]
Example #21
0
 def test_bam_filter():
     'it test filter by being mapped in a BAM file'
     # one single-seq group per read, named seq16 to seq22
     groups = [
         [SeqWrapper(SEQRECORD,
                     SeqRecord(seq=Seq('aaa'), id='seq{}'.format(number)),
                     None)]
         for number in range(16, 23)
     ]
     bam_filter = FilterByBam([os.path.join(TEST_DATA_DIR, 'seqs.bam')])
     result = bam_filter({SEQS_PASSED: groups, SEQS_FILTERED_OUT: []})

     passed_names = _seqs_to_names(result[SEQS_PASSED])
     assert passed_names == ['seq16', 'seq17', 'seq18']
     rejected_names = _seqs_to_names(result[SEQS_FILTERED_OUT])
     assert rejected_names == ['seq19', 'seq20', 'seq21', 'seq22']
Example #22
0
    def __call__(self, seqs):
        """It orientates seqs, that should have a SeqRecord in it.

        Every annotator configured in self._annotators is run in turn over
        the seqs that do not have an orientation yet. The seqs guessed to be
        reversed (orientation -1) are replaced by their reverse complement
        and the name of the annotator that decided it is appended to their
        description.

        seqs has to support len(). A list with one seq per input seq, in the
        same order, is returned.
        """
        # orientations stays None until the first annotator is processed
        # NOTE(review): if self._annotators is empty it is still None when
        # the final zip() runs, which would raise a TypeError -- confirm that
        # at least one annotator is always configured.
        orientations = None
        # name of the annotator that marked each seq as reversed, if any
        orientation_log = [None] * len(seqs)

        for annotator in self._annotators:
            if orientations:
                # only the seqs without a guessed orientation are analyzed
                # NOTE(review): the selection uses "not o" while the
                # assignment loop below uses "is None"; both agree only if a
                # guessed orientation is never 0 -- confirm against
                # _guess_orientations.
                to_annalyze = [not o for o in orientations]
                seqs_to_analyze = list(compress(seqs, to_annalyze))
            else:
                # first annotator (or no seqs): everything is analyzed
                orientations = [None] * len(seqs)
                seqs_to_analyze = seqs

            annotator_name = annotator['name']
            blastdb = annotator.get('blastdb', None)
            # from here on annotator is the callable, not its configuration
            annotator = self._get_annotator(annotator_name, blastdb)

            annot_seqrecords = annotator(seqs_to_analyze)
            annot_strands = self._guess_orientations(annot_seqrecords,
                                                     annotator_name,
                                                     blastdb=blastdb)

            # the database name is added only after the plain name has been
            # used above, so it just affects the text written in the log
            if blastdb:
                annotator_name += ' ' + os.path.basename(blastdb)

            # annot_strands only has items for the analyzed seqs, so it needs
            # its own index while walking all the orientations
            analyzed_seqs_index = 0
            for index, orientation in enumerate(orientations):
                if orientation is None:
                    orientations[index] = annot_strands[analyzed_seqs_index]
                    if annot_strands[analyzed_seqs_index] == -1:  # reverse
                        orientation_log[index] = annotator_name
                    analyzed_seqs_index += 1
        # Now we reverse the seqs that we have guess that are reversed
        reorientated_seqrecords = []
        for orientation, seq, reason in zip(orientations, seqs,
                                            orientation_log):
            if orientation == -1:
                # presumably Biopython's SeqRecord.reverse_complement, where
                # True asks to keep the value of the original record --
                # verify against the Biopython documentation
                rev_seqrecord = seq.object.reverse_complement(id=True,
                                                              description=True,
                                                              annotations=True,
                                                              features=True,
                                                              dbxrefs=True,
                                                              name=True)
                seq = SeqWrapper(SEQRECORD, rev_seqrecord, None)
                # we mark the reason why it has been reversed
                text = '(reversed because of: {})'.format(reason)
                append_to_description(seq, text)

            # seqs that were not reversed are returned untouched
            reorientated_seqrecords.append(seq)
        return reorientated_seqrecords
Example #23
0
    def test_blastmatch_filter():
        'it test filter by blast'
        insert = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
        oligo_residues = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'

        oligo = SeqWrapper(SEQRECORD, SeqRecord(Seq(oligo_residues)), None)
        # one seq without the oligo and one with the oligo appended
        clean = SeqWrapper(object=SeqRecord(Seq(insert), id='seq'),
                           kind=SEQRECORD, file_format=None)
        with_oligo = SeqWrapper(
            object=SeqRecord(Seq(insert + oligo_residues), id='seq_oligo'),
            kind=SEQRECORD, file_format=None)

        packet = {SEQS_PASSED: [[clean], [with_oligo]], SEQS_FILTERED_OUT: []}

        def first_names(groups):
            'It returns the name of the first seq of every group'
            return [get_name(group[0]) for group in groups]

        result = FilterBlastShort([oligo])(packet)
        passed = first_names(result[SEQS_PASSED])
        failed = first_names(result[SEQS_FILTERED_OUT])
        assert passed == ['seq']
        assert failed == ['seq_oligo']
Example #24
0
    def test_quality_filter(self):
        'It filters the reads given a quality threshold'
        quals_by_name = (('seq1', [42, 42, 40, 42, 40]),
                         ('seq2', [40, 40, 42, 40, 42]))
        groups = []
        for name, quals in quals_by_name:
            record = SeqRecord(Seq('AAcTg'), id=name,
                               letter_annotations={'phred_quality': quals})
            groups.append([SeqWrapper(object=record, kind=SEQRECORD,
                                      file_format=None)])
        packet = {SEQS_PASSED: groups, SEQS_FILTERED_OUT: []}

        def passed_names(quality_filter):
            'It returns the names of the seqs that pass the given filter'
            return _seqs_to_names(quality_filter(packet)[SEQS_PASSED])

        assert passed_names(FilterByQuality(threshold=41)) == ['seq1']

        reversed_filter = FilterByQuality(threshold=41, reverse=True)
        assert passed_names(reversed_filter) == ['seq2']

        # ignoring the masked (lowercase in these seqs, presumably) positions
        unmasked_filter = FilterByQuality(threshold=41.5, ignore_masked=True)
        assert passed_names(unmasked_filter) == ['seq1']
Example #25
0
def alignedread_to_seqitem(aligned_read, start_pos=0, end_pos=None):
    """It builds a SeqItem based seq out of an aligned read.

    The read sequence (and its qualities, if any) are sliced with
    start_pos and end_pos before being reversed, so the limits refer to the
    sequence as it is stored in the read. Reads flagged as reverse are
    reverse complemented and their qualities reversed.

    It returns None when there is no read or the read has no sequence,
    otherwise a fasta SeqWrapper when the read has no qualities and a fastq
    one when it has them.
    """
    if aligned_read is None:
        return None
    if aligned_read.seq is None:
        return None

    name = aligned_read.qname
    residues = aligned_read.seq[start_pos: end_pos]
    quals = aligned_read.qual
    is_reverse = aligned_read.is_reverse

    if is_reverse:
        residues = _reverse(_complementary(residues))

    if quals is None:
        file_format = 'fasta'
        lines = ['>' + name + '\n', residues + '\n']
    else:
        file_format = 'fastq'
        quals = quals[start_pos: end_pos]
        if is_reverse:
            quals = _reverse(quals)
        lines = ['@' + name + '\n', residues + '\n', '+\n', quals + '\n']

    return SeqWrapper(SEQITEM, SeqItem(name, lines), file_format)
Example #26
0
    def test_split_mate(self):
        'It tests the function that splits seqs using segments'
        # pylint: disable=W0212
        read = SeqWrapper(SEQRECORD,
                          SeqRecord(Seq('aaatttccctt'), id='seq'), None)
        # fake class to test
        splitter = MatePairSplitter([read])
        split = splitter._split_by_mate_linker

        # segment beginning
        parts = split(read, ([(0, 3)], False))
        assert get_str_seq(parts[0]) == 'ttccctt'
        assert get_name(parts[0]) == 'seq'

        # segment at end
        parts = split(read, ([(7, 10)], False))
        assert get_str_seq(parts[0]) == 'aaatttc'
        assert get_name(parts[0]) == 'seq'

        # segment in the middle, with the flag set: names carry a _pl tag
        parts = split(read, ([(4, 7)], True))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'ctt'
        assert get_name(parts[0]) == r'seq_pl\1'
        assert get_name(parts[1]) == r'seq_pl\2'

        # segment in the middle, without the flag
        parts = split(read, ([(4, 7)], False))
        assert get_name(parts[0]) == r'seq\1'
        assert get_name(parts[1]) == r'seq\2'

        # two segments: three parts are expected
        parts = split(read, ([(4, 6), (8, 9)], False))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'c'
        assert get_str_seq(parts[2]) == 't'
        assert get_name(parts[0]) == 'seq_mlc.part1'

        # all sequence is linker
        parts = split(read, ([(0, 10)], False))
        assert not get_str_seq(parts[0])

        # there's no segments
        parts = split(read, ([], False))
        assert get_name(read) == get_name(parts[0])
        assert get_str_seq(read) == get_str_seq(parts[0])
Example #27
0
    def __call__(self, snv):
        """It flags the snv when the dust score around it is over the threshold.

        The score of the reference window around the snv is calculated with
        calculate_dust_score and stored in self._scores. When it is greater
        than self.threshold the filter name is added to the snv. The snv is
        returned only if self.return_modified_snv is set, otherwise None.
        """
        self._clean_filter(snv)

        # The last reference used is kept with its chrom name, so it is
        # only looked up in the index again when the chrom changes
        chrom = snv.chrom
        cached = self._last_chrom
        same_chrom = cached is not None and chrom == cached[0]
        if same_chrom:
            ref = cached[1]
        else:
            ref = self.ref_index[snv.chrom]
            self._last_chrom = chrom, ref

        start, end = calculate_window(snv.pos, snv.end, self.window, len(ref))
        window_seq = SeqWrapper(SEQRECORD, ref[start:end], None)

        score = calculate_dust_score(window_seq)
        if score > self.threshold:
            snv.add_filter(self.name)
        self._scores.append(score)

        return snv if self.return_modified_snv else None
Example #28
0
    def test_trimming(self):
        'The sequences are trimmed according to the recommendations.'
        seq1 = 'gggtctcatcatcaggg'.upper()
        seq = SeqRecord(Seq(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}})
        seq = SeqWrapper(SEQRECORD, seq, None)
        seqs = [seq]
        trim_packet = {SEQS_PASSED: [seqs], ORPHAN_SEQS: []}

        # The same recommendation dict is edited in place for every case
        trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
        seq_trimmer = TrimOrMask()

        # NOTE(review): the recommendations are attached to the seq again
        # before every run, presumably because TrimOrMask removes them from
        # the annotations (see the last assert) -- confirm.

        # two vector segments
        trim_rec['vector'] = [(0, 3), (8, 13)]
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        trim_packet2 = seq_trimmer(trim_packet)
        res = [get_str_seq(s) for l in trim_packet2[SEQS_PASSED] for s in l]
        assert res == ['CTCA']

        # a one base segment at the start
        trim_rec['vector'] = [(0, 0), (8, 13)]
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        trim_packet2 = seq_trimmer(trim_packet)
        res = [get_str_seq(s) for l in trim_packet2[SEQS_PASSED] for s in l]
        assert res == ['GGTCTCA']

        # vector and quality segments that together leave no seq to pass
        trim_rec['vector'] = [(0, 1), (8, 12)]
        trim_rec['quality'] = [(1, 8), (13, 17)]
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        trim_packet2 = seq_trimmer(trim_packet)
        assert not trim_packet2[SEQS_PASSED]

        # an empty quality recommendation changes nothing
        trim_rec['vector'] = [(0, 0), (8, 13)]
        trim_rec['quality'] = []
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        trim_packet2 = seq_trimmer(trim_packet)
        res = [get_str_seq(s) for l in trim_packet2[SEQS_PASSED] for s in l]
        assert res == ['GGTCTCA']
        # the trimmed seq does not carry the recommendations any more
        trimmed_seq = trim_packet2[SEQS_PASSED][0][0]
        assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed_seq)
Example #29
0
    def test_pair_direction_and_name(self):
        'it test the pair_name parser'
        # Illumina style title: the direction comes after the space
        title = 'seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG'
        name, dir_ = _parse_pair_direction_and_name_from_title(title)
        assert name == 'seq8:136:FC706VJ:2:2104:15343:197393'
        assert dir_ == FWD

        # /1 suffix
        title = 'seq8:136:FC706VJ:2:2104:15343:197393/1'
        name, dir_ = _parse_pair_direction_and_name_from_title(title)
        assert name == 'seq8:136:FC706VJ:2:2104:15343:197393'
        assert dir_ == FWD

        # .f suffix
        title = 'seq8:136:FC706VJ:2:2104:15343:197393.f'
        name, dir_ = _parse_pair_direction_and_name_from_title(title)
        assert name == 'seq8:136:FC706VJ:2:2104:15343:197393'
        assert dir_ == FWD

        # a suffix that is not a pair direction has to be rejected
        title = 'seq8:136:FC706VJ:2:2104:15343:197393.mp12'
        with self.assertRaises(PairDirectionError):
            _parse_pair_direction_and_name_from_title(title)

        # \1 suffix
        title = r'seq8:136:FC706VJ:2:2104:15343:197393\1'
        name, dir_ = _parse_pair_direction_and_name_from_title(title)
        assert name == 'seq8:136:FC706VJ:2:2104:15343:197393'
        assert dir_ == FWD

        # With SeqRecord
        seq = SeqRecord(id=r'seq8:136:FC706VJ:2:2104:15343:197393\1',
                        seq=Seq('ACT'))
        name, dir_ = _parse_pair_direction_and_name(
            SeqWrapper(SEQRECORD, seq, None))
        assert name == 'seq8:136:FC706VJ:2:2104:15343:197393'
        assert dir_ == FWD
Example #30
0
def _create_seqrecord(string):
    'It wraps the given string in a SeqRecord with a random six letter id'
    random_id = ''.join(choice(ascii_lowercase) for _ in range(6))
    record = SeqRecord(Seq(string), id=random_id)
    return SeqWrapper(kind=SEQRECORD, object=record, file_format=None)