Example #1
    def test_classify_paired_reads(self):
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        #Non chimeric
        query1 = '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
        query2 = '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n'
        #Chimeric
        query3 = '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
        query4 = '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n'
        #unknown
        query5 = '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
        query6 = '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n'

        query = query1 + query2 + query5 + query6 + query3 + query4
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        extra_params = ['-a', '-M']
        bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                              extra_params=extra_params)
        map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname')
        result = classify_mapped_reads(bam_fhand, mate_distance=2000)
        for pair, kind in result:
            if kind == NON_CHIMERIC:
                assert 'seq1' in get_name(pair[0])
            elif kind == UNKNOWN:
                assert 'seq3' in get_name(pair[0])
            elif kind == CHIMERA:
                assert 'seq2' in get_name(pair[0])
            else:
                self.fail()
    def test_giuseppe_reads(self):
        'It splits some real reads'
        seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq')
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        linkers = list(read_seqs([open(linker_fpath)]))

        splitter = MatePairSplitter(linkers=linkers)
        new_seqs = []
        for packet in read_seq_packets([open(seq_fpath)], 2):
            new_seqs.extend(splitter(packet))
        seq_names = [get_name(seq) for seq in new_seqs]
        assert 'G109AZL01BJHT8\\1' in seq_names
        assert 'G109AZL01BJHT8\\2' in seq_names
        assert len(new_seqs) == 20

        # test with process_seq_packet
        seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq')
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        linkers = list(read_seqs([open(linker_fpath)]))

        splitter = MatePairSplitter(linkers=linkers)
        seq_packets = read_seq_packets([open(seq_fpath)], 2)
        seq_packets = process_seq_packets(seq_packets, [splitter])[0]

        new_seqs = [seq for l in list(seq_packets) for seq in l]
        seq_names = [get_name(seq) for seq in new_seqs]
        assert 'G109AZL01BJHT8\\1' in seq_names
        assert 'G109AZL01BJHT8\\2' in seq_names
        assert len(new_seqs) == 20

        # reads 2
        seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq')
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        linkers = list(read_seqs([open(linker_fpath)]))

        splitter = MatePairSplitter(linkers=linkers)
        new_seqs = []
        for packet in read_seq_packets([open(seq_fpath)], 2):
            new_seqs.extend(splitter(packet))
        seq_names = [get_name(seq) for seq in new_seqs]
        assert 'G109AZL01D8U3X\\1' in seq_names
        assert 'G109AZL01D8U3X\\2' in seq_names
        assert len(new_seqs) == 20

        # test with process_seq_packet
        seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq')
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        linkers = list(read_seqs([open(linker_fpath)]))

        splitter = MatePairSplitter(linkers=linkers)
        seq_packets = read_seq_packets([open(seq_fpath)], 2)
        seq_packets = process_seq_packets(seq_packets, [splitter])[0]

        new_seqs = [seq for l in list(seq_packets) for seq in l]
        seq_names = [get_name(seq) for seq in new_seqs]
        assert 'G109AZL01D8U3X\\1' in seq_names
        assert 'G109AZL01D8U3X\\2' in seq_names
        assert len(new_seqs) == 20
Example #4
    def test_sort_by_position_in_ref(self):
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        #with fasta format
        query1 = '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n'
        query2 = '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n'
        query3 = '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n'
        query4 = '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n'
        query5 = '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n'
        query6 = '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names
        #it fails because bwa somehow gives a position to an unmapped seq

        #with fastq format
        query1 += '+\n??????????????????????????????????????????????????\n'
        query2 += '+\n??????????????????????????????????????????????????\n'
        query3 += '+\n??????????????????????????????????????????????????\n'
        query4 += '+\n??????????????????????????????????????????????????\n'
        query5 += '+\n??????????????????????????????????????????????????\n'
        query6 += '+\n??????????????????????????????????????????????????\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names

        #sort by sequence
        sorted_names = []
        for seq in sort_fastx_files([in_fhand],
                                    key='seq',
                                    directory=None,
                                    max_items_in_memory=None,
                                    tempdir=None):
            sorted_names.append(get_name(seq))
        expected_names = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
        assert sorted_names == expected_names
Example #5
    def test_sort_by_position_in_ref(self):
        reference = GENOME
        ref_fhand = NamedTemporaryFile()
        ref_fhand.write(reference)
        ref_fhand.flush()

        #with fasta format
        query1 = '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n'
        query2 = '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n'
        query3 = '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n'
        query4 = '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n'
        query5 = '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n'
        query6 = '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files([in_fhand], 'coordinate', ref_fhand.name):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names

        #with fastq format
        query1 += '+\n??????????????????????????????????????????????????\n'
        query2 += '+\n??????????????????????????????????????????????????\n'
        query3 += '+\n??????????????????????????????????????????????????\n'
        query4 += '+\n??????????????????????????????????????????????????\n'
        query5 += '+\n??????????????????????????????????????????????????\n'
        query6 += '+\n??????????????????????????????????????????????????\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files([in_fhand], 'coordinate', ref_fhand.name):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names

        #sort by sequence
        sorted_names = []
        for seq in sort_fastx_files([in_fhand], key='seq', directory=None,
                     max_items_in_memory=None, tempdir=None):
            sorted_names.append(get_name(seq))
        expected_names = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
        assert sorted_names == expected_names
Example #6
    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                   'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            for seq in split_seqs:
                new_seqs.append(seq)
        return new_seqs
    def test_many_reads(self):
        'It splits lots of reads to check that blast finds everything'

        linker = TITANIUM_LINKER

        def create_seq(index):
            'It creates a random seq with a linker'
            seq1 = ''.join(choice('ACTG') for i in range(100))
            seq2 = ''.join(choice('ACTG') for i in range(100))
            seq = seq1 + linker + seq2
            seq = SeqRecord(id='seq_' + str(index), seq=Seq(seq))
            seq = SeqWrapper(SEQRECORD, seq, None)
            return seq

        # We want to test that blast reports all reads
        packet_size = get_setting('PACKET_SIZE')
        default_blast_max_target_size = 500
        assert packet_size > default_blast_max_target_size
        seqs = [create_seq(i) for i in range(1000)]
        splitter = MatePairSplitter()

        for index, seq in enumerate(splitter(seqs)):
            seq_index = index // 2
            pair_index = (index % 2) + 1
            expected_id = 'seq_' + str(seq_index) + '\\' + str(pair_index)
            assert get_name(seq) == expected_id
Example #8
    def __call__(self, seqs):
        'It runs the actual annotations'
        if not seqs:
            return seqs
        pep_fhand = NamedTemporaryFile()
        dna_fhand = NamedTemporaryFile()
        _run_estscan(seqs, pep_fhand.name, dna_fhand.name,
                     self._usage_matrix)
        # now we read the result files
        estscan_result = _read_estcan_results(open(pep_fhand.name),
                                              open(dna_fhand.name))
        for seq in seqs:
            seq_name = get_name(seq)
            orfs = estscan_result.get(seq_name, {})
            feats = []
            for (start, end, strand), str_seqs in orfs.viewitems():
                start -= 1
                # end is fine  -- end[
                feat = SeqFeature(location=FeatureLocation(start, end, strand),
                                  type='ORF', qualifiers=str_seqs)
                feats.append(feat)
            if feats:
                seq.object.features.extend(feats)

        dna_fhand.close()
        pep_fhand.close()
        return seqs
Example #9
def match_pairs(
    reads,
    out_fhand,
    orphan_out_fhand,
    out_format,
    ordered=True,
    check_order_buffer_size=0,
    max_reads_memory=None,
    temp_dir=None,
):
    """It matches the seq pairs in an iterator and splits the orphan seqs."""
    counts = 0
    check_order_buffer = KeyedSet()
    for pair in _get_paired_and_orphan(reads, ordered, max_reads_memory, temp_dir):
        if len(pair) == 1:
            write_seqs(pair, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(pair[0])[0]
            except PairDirectionError:
                name = get_name(pair[0])
            if ordered and counts < check_order_buffer_size:
                counts += 1
                if not check_order_buffer.check_add(name):
                    msg = "Reads are not ordered by pairs. Use unordered option"
                    raise ItemsNotSortedError(msg)
            elif ordered and counts >= check_order_buffer_size:
                if name in check_order_buffer:
                    msg = "Reads are not ordered by pairs. Use unordered option"
                    raise ItemsNotSortedError(msg)
        elif len(pair) == 2:
            write_seqs(pair, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
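A minimal usage sketch of match_pairs (not part of the original listing): it assumes the read_seqs helper and the 'fastq' format used in the other examples on this page, and the file names are hypothetical.

reads = read_seqs([open('interleaved.fastq')])
paired_fhand = open('paired.fastq', 'w')
orphan_fhand = open('orphans.fastq', 'w')
# Mated reads go to paired_fhand, reads without a mate to orphan_fhand.
match_pairs(reads, paired_fhand, orphan_fhand, out_format='fastq')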
Example #10
    def __call__(self, seqs):
        'It runs the actual annotations'
        if not seqs:
            return seqs
        pep_fhand = NamedTemporaryFile()
        dna_fhand = NamedTemporaryFile()
        _run_estscan(seqs, pep_fhand.name, dna_fhand.name,
                     self._usage_matrix)
        # now we read the result files
        estscan_result = _read_estcan_results(open(pep_fhand.name),
                                              open(dna_fhand.name))
        for seq in seqs:
            seq_name = get_name(seq)
            orfs = estscan_result.get(seq_name, {})
            feats = []
            for (start, end, strand), str_seqs in orfs.viewitems():
                start -= 1
                # end is fine  -- end[
                feat = SeqFeature(location=FeatureLocation(start, end, strand),
                                  type='ORF', qualifiers=str_seqs)
                feats.append(feat)
            if feats:
                seq.object.features.extend(feats)

        dna_fhand.close()
        pep_fhand.close()
        return seqs
Example #11
    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                   'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            for seq in split_seqs:
                new_seqs.append(seq)
        return new_seqs
Example #12
def append_to_description(seqrecord, text):
    'It appends the text to the seqrecord description'
    desc = get_description(seqrecord)
    if desc in (None, get_name(seqrecord), '<unknown description>'):
        desc = ''
    desc += text
    seqrecord.object.description = desc
    def test_many_reads(self):
        'It splits lots of reads to check that blast finds everything'

        linker = TITANIUM_LINKER

        def create_seq(index):
            'It creates a random seq with a linker'
            seq1 = ''.join(choice('ACTG') for i in range(100))
            seq2 = ''.join(choice('ACTG') for i in range(100))
            seq = seq1 + linker + seq2
            seq = SeqRecord(id='seq_' + str(index), seq=Seq(seq))
            seq = SeqWrapper(SEQRECORD, seq, None)
            return seq

        # We want to test that blast reports all reads
        packet_size = get_setting('PACKET_SIZE')
        default_blast_max_target_size = 500
        assert packet_size > default_blast_max_target_size
        seqs = [create_seq(i) for i in range(1000)]
        splitter = MatePairSplitter()

        for index, seq in enumerate(splitter(seqs)):
            seq_index = index // 2
            pair_index = (index % 2) + 1
            expected_id = 'seq_' + str(seq_index) + '\\' + str(pair_index)
            assert get_name(seq) == expected_id
Example #14
def append_to_description(seqrecord, text):
    'It appends the text to the seqrecord description'
    desc = get_description(seqrecord)
    if desc in (None, get_name(seqrecord), '<unknown description>'):
        desc = ''
    desc += text
    seqrecord.object.description = desc
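A one-line usage sketch for append_to_description, assuming a SeqWrapper-wrapped SeqRecord built as in the test examples on this page; the appended text is arbitrary.

seq = SeqWrapper(SEQRECORD, SeqRecord(Seq('ACTG'), id='seq1'), None)
append_to_description(seq, ' trimmed with seq_crumbs')
# The description now ends with the appended text.
assert get_description(seq).endswith('trimmed with seq_crumbs')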
Example #15
def match_pairs(reads,
                out_fhand,
                orphan_out_fhand,
                out_format,
                ordered=True,
                check_order_buffer_size=0,
                max_reads_memory=None,
                temp_dir=None):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.'''
    counts = 0
    check_order_buffer = KeyedSet()
    for pair in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(pair) == 1:
            write_seqs(pair, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(pair[0])[0]
            except PairDirectionError:
                name = get_name(pair[0])
            if ordered and counts < check_order_buffer_size:
                counts += 1
                if not check_order_buffer.check_add(name):
                    msg = 'Reads are not ordered by pairs. Use unordered option'
                    raise ItemsNotSortedError(msg)
            elif ordered and counts >= check_order_buffer_size:
                if name in check_order_buffer:
                    msg = 'Reads are not ordered by pairs. Use unordered option'
                    raise ItemsNotSortedError(msg)
        elif len(pair) == 2:
            write_seqs(pair, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Example #16
    def test_sort_by_position_in_ref(self):
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        #with fasta format
        query1 = '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n'
        query2 = '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n'
        query3 = '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n'
        query4 = '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n'
        query5 = '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n'
        query6 = '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names
        #it fails because bwa somehow gives a position to an unmapped seq

        #with fastq format
        query1 += '+\n??????????????????????????????????????????????????\n'
        query2 += '+\n??????????????????????????????????????????????????\n'
        query3 += '+\n??????????????????????????????????????????????????\n'
        query4 += '+\n??????????????????????????????????????????????????\n'
        query5 += '+\n??????????????????????????????????????????????????\n'
        query6 += '+\n??????????????????????????????????????????????????\n'
        query = query6 + query1 + query2 + query3 + query4 + query5
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        sorted_names = []
        for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
            sorted_names.append(get_name(seq))
        expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
        assert sorted_names == expected_names

        #sort by sequence
        sorted_names = []
        for seq in sort_fastx_files([in_fhand], key='seq', directory=None,
                     max_items_in_memory=None, tempdir=None):
            sorted_names.append(get_name(seq))
        expected_names = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
        assert sorted_names == expected_names
Example #17
    def test_filter_chimeras(self):
        reference_seq = GENOME

        #Typical non-chimeric
        query1 = '>seq1 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTA'
        query1 += 'CATTGAACTT\n'
        query2 = '>seq1 2:Y:18:ATCACG\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
        query2 += 'GGTTGTAACG\n'

        #typical chimeric
        query3 = '>seq2 1:Y:18:ATCACG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGT'
        query3 += 'CTGCGATCCCTG'
        query3 += 'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n'
        query4 = '>seq2 2:Y:18:ATCACG\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATT'
        query4 += 'GATGCTGAACTT'
        query4 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'

        #Unknown, 3' end does not map, impossible to know if it is chimeric
        query13 = '>seq7 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCT'
        query13 += 'ACATTGAACTT'
        query13 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query14 = '>seq7 2:Y:18:ATCACG\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
        query14 += 'GGTTGTAACGCC'
        query14 += 'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n'

        query = query1 + query2 + query3 + query4 + query13 + query14
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()
        ref_fhand = NamedTemporaryFile()
        ref_fhand.write(reference_seq)
        ref_fhand.flush()
        out_fhand = NamedTemporaryFile()
        chimeras_fhand = NamedTemporaryFile()
        unknown_fhand = NamedTemporaryFile()
        filter_chimeras(ref_fhand.name, out_fhand, chimeras_fhand, [in_fhand],
                        unknown_fhand)
        result = read_seqs([out_fhand])
        chimeric = read_seqs([chimeras_fhand])
        unknown = read_seqs([unknown_fhand])
        for seq in result:
            assert get_name(seq) in ['seq1.f', 'seq1.r']
        for seq in chimeric:
            assert get_name(seq) in ['seq2.f', 'seq2.r']
        for seq in unknown:
            assert get_name(seq) in ['seq7.f', 'seq7.r']
Example #18
    def _do_check(self, seq):
        count = self._read_counts[get_name(seq)]

        kb_len = count['length'] / 1000
        rpk = count['mapped_reads'] / kb_len  # rpks
        rpkm = rpk / self._million_reads  # rpkms

        return True if rpkm >= self._min_rpkm else False
Example #19
    def _do_check(self, seq):
        count = self._read_counts[get_name(seq)]

        kb_len = count['length'] / 1000
        rpk = count['mapped_reads'] / kb_len  # rpks
        rpkm = rpk / self._million_reads  # rpkms

        return True if rpkm >= self._min_rpkm else False
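The check above is the standard RPKM computation (reads per kilobase per million mapped reads). A worked sketch with made-up numbers, assuming _million_reads holds the total mapped-read count divided by one million:

count = {'length': 2000, 'mapped_reads': 400}  # hypothetical per-sequence counts
million_reads = 10.0                   # a library of 10,000,000 mapped reads
kb_len = count['length'] / 1000.0      # 2.0 kilobases
rpk = count['mapped_reads'] / kb_len   # 200 reads per kilobase
rpkm = rpk / million_reads             # 20 RPKM
assert rpkm == 20.0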
Example #20
    def test_trim_seqs(self):
        'It tests the trim seq function'
        seqs = []
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTTTC')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('CTTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('actg')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('AC')), None)])

        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC', 'CTTC', 'CTC', 'AC']

        seqs = []
        seq = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC']

        # with pairs
        seq = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
        seq1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
        seq2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
        seq3 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
        seqs = []
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta'),
                     SeqWrapper(SEQITEM, seq1, 'fasta')])
        seqs.append([SeqWrapper(SEQITEM, seq2, 'fasta'),
                     SeqWrapper(SEQITEM, seq3, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_str_seq(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == res

        # no drag
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_name(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == res
Example #21
    def test_trim_seqs(self):
        'It tests the trim seq function'
        seqs = []
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTTTC')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('CTTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTCaa')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('actg')), None)])
        seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('AC')), None)])

        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC', 'CTTC', 'CTC', 'AC']

        seqs = []
        seq = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]

        assert res == ['CTTTC']

        # with pairs
        seq = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
        seq1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
        seq2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
        seq3 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
        seqs = []
        seqs.append([SeqWrapper(SEQITEM, seq, 'fasta'),
                     SeqWrapper(SEQITEM, seq1, 'fasta')])
        seqs.append([SeqWrapper(SEQITEM, seq2, 'fasta'),
                     SeqWrapper(SEQITEM, seq3, 'fasta')])
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_str_seq(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == res

        # no drag
        trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        trim_packet = trim(trim_lowercased_seqs(trim_packet))
        res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
        orphan_res = [get_name(s) for s in trim_packet[ORPHAN_SEQS]]
        assert orphan_res == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == res
    def test_split_mate(self):
        'It tests the function that splits seqs using segments'
        # pylint: disable=W0212
        seq = 'aaatttccctt'
        seq = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
        # fake class to test
        splitter = MatePairSplitter([seq])
        # segment beginning
        seqs = splitter._split_by_mate_linker(seq, ([(0, 3)], False))
        assert get_str_seq(seqs[0]) == 'ttccctt'
        assert get_name(seqs[0]) == 'seq'

        # segment at end
        seqs = splitter._split_by_mate_linker(seq, ([(7, 11)], False))
        assert get_str_seq(seqs[0]) == 'aaatttc'
        assert get_name(seqs[0]) == 'seq'

        # segment in the middle
        seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], True))
        assert get_str_seq(seqs[0]) == 'aaat'
        assert get_str_seq(seqs[1]) == 'ctt'
        assert get_name(seqs[0]) == 'seq_pl.part1'
        assert get_name(seqs[1]) == 'seq_pl.part2'

        seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], False))
        assert get_name(seqs[0]) == r'seq\1'
        assert get_name(seqs[1]) == r'seq\2'

        seqs = splitter._split_by_mate_linker(seq, ([(4, 6), (8, 9)],
                                                          False))
        assert get_str_seq(seqs[0]) == 'aaat'
        assert get_str_seq(seqs[1]) == 'c'
        assert get_str_seq(seqs[2]) == 't'
        assert get_name(seqs[0]) == 'seq_mlc.part1'

        # all sequence is linker
        seqs = splitter._split_by_mate_linker(seq, ([(0, 10)], False))
        assert not get_str_seq(seqs[0])

        # there's no segments
        seqs = splitter._split_by_mate_linker(seq, ([], False))
        assert get_name(seq) == get_name(seqs[0])
        assert get_str_seq(seq) == get_str_seq(seqs[0])
Example #23
    def __call__(self, seqrecords):
        'It does the work'
        if not seqrecords:
            return seqrecords
        matcher = Blaster(seqrecords,
                          self.blastdb,
                          self._program,
                          self._dbtype,
                          filters=self._filters,
                          params=self._params,
                          remote=self._remote)
        blasts = matcher.blasts
        blastdb = os.path.basename(self.blastdb)
        for seqrecord in seqrecords:
            align_result = blasts.get(get_name(seqrecord), None)
            if not align_result:
                continue
            match_counter = 0
            for match in align_result['matches']:
                subject = match['subject']['name']
                match_counter += 1
                for match_part in match['match_parts']:
                    if match_part['subject_end'] < match_part['subject_start']:
                        strand = -1
                        subject_start = match_part['subject_end']
                        subject_end = match_part['subject_start']
                    else:
                        strand = 1
                        subject_start = match_part['subject_start']
                        subject_end = match_part['subject_end']

                    query_start = match_part['query_start']
                    query_end = match_part['query_end']
                    qualifiers = {}
                    qualifiers['Target'] = {
                        'start': subject_start,
                        'end': subject_end,
                        'name': subject
                    }
                    qualifiers['score'] = match_part['scores']['expect']
                    qualifiers['identity'] = match_part['scores']['identity']
                    qualifiers['blastdb'] = blastdb
                    location = FeatureLocation(query_start, query_end, strand)
                    feature = SeqFeature(
                        location=location,
                        type='match_part',
                        qualifiers=qualifiers,
                        id='match{0:03d}'.format(match_counter))
                    seqrecord.object.features.append(feature)
        return seqrecords
Example #24
    def test_blastmatch_filter(self):
        'It tests the filter by blast'
        seq = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
        oligo = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'
        seq_oligo = seq + oligo
        oligo = SeqRecord(Seq(oligo))
        oligo = SeqWrapper(SEQRECORD, oligo, None)

        seq = SeqRecord(Seq(seq), id='seq')
        seq = SeqWrapper(object=seq, kind=SEQRECORD, file_format=None)

        seq_oligo = SeqRecord(Seq(seq_oligo), id='seq_oligo')
        seq_oligo = SeqWrapper(object=seq_oligo, kind=SEQRECORD,
                               file_format=None)

        seqs = {SEQS_PASSED: [[seq], [seq_oligo]], SEQS_FILTERED_OUT: []}

        filter_ = FilterBlastShort([oligo])
        filt_packet = filter_(seqs)
        passed = [get_name(pair[0]) for pair in filt_packet[SEQS_PASSED]]
        fail = [get_name(pair[0]) for pair in filt_packet[SEQS_FILTERED_OUT]]
        assert passed == ['seq']
        assert fail == ['seq_oligo']
    def test_split_mate(self):
        'It tests the function that splits seqs using segments'
        # pylint: disable=W0212
        seq = 'aaatttccctt'
        seq = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
        # fake class to test
        splitter = MatePairSplitter([seq])
        # segment beginning
        seqs = splitter._split_by_mate_linker(seq, ([(0, 3)], False))
        assert get_str_seq(seqs[0]) == 'ttccctt'
        assert get_name(seqs[0]) == 'seq'

        # segment at end
        seqs = splitter._split_by_mate_linker(seq, ([(7, 11)], False))
        assert get_str_seq(seqs[0]) == 'aaatttc'
        assert get_name(seqs[0]) == 'seq'

        # segment in the middle
        seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], True))
        assert get_str_seq(seqs[0]) == 'aaat'
        assert get_str_seq(seqs[1]) == 'ctt'
        assert get_name(seqs[0]) == 'seq_pl.part1'
        assert get_name(seqs[1]) == 'seq_pl.part2'

        seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], False))
        assert get_name(seqs[0]) == r'seq\1'
        assert get_name(seqs[1]) == r'seq\2'

        seqs = splitter._split_by_mate_linker(seq, ([(4, 6), (8, 9)], False))
        assert get_str_seq(seqs[0]) == 'aaat'
        assert get_str_seq(seqs[1]) == 'c'
        assert get_str_seq(seqs[2]) == 't'
        assert get_name(seqs[0]) == 'seq_mlc.part1'

        # all sequence is linker
        seqs = splitter._split_by_mate_linker(seq, ([(0, 10)], False))
        assert not get_str_seq(seqs[0])

        # there's no segments
        seqs = splitter._split_by_mate_linker(seq, ([], False))
        assert get_name(seq) == get_name(seqs[0])
        assert get_str_seq(seq) == get_str_seq(seqs[0])
Example #26
 def _do_check(self, seq):
     seq_object = seq.object
     try:
         quals = seq_object.letter_annotations['phred_quality']
     except KeyError:
         msg = 'Some of the input sequences do not have qualities: {}'
         msg = msg.format(get_name(seq))
         raise WrongFormatError(msg)
     if self.ignore_masked:
         str_seq = str(seq_object.seq)
         seg_quals = [quals[segment[0]: segment[1] + 1]
                         for segment in get_uppercase_segments(str_seq)]
         qual = sum(sum(q) * len(q) for q in seg_quals) / len(quals)
     else:
         qual = sum(quals) / len(quals)
     return True if qual >= self.threshold else False
Example #27
 def _do_check(self, seq):
     seq_object = seq.object
     try:
         quals = seq_object.letter_annotations['phred_quality']
     except KeyError:
         msg = 'Some of the input sequences do not have qualities: {}'
         msg = msg.format(get_name(seq))
         raise WrongFormatError(msg)
     if self.ignore_masked:
         str_seq = str(seq_object.seq)
         seg_quals = [quals[segment[0]: segment[1] + 1]
                         for segment in get_uppercase_segments(str_seq)]
         qual = sum(sum(q) * len(q) for q in seg_quals) / len(quals)
     else:
         qual = sum(quals) / len(quals)
     return True if qual >= self.threshold else False
Example #28
    def _do_trim(self, seq):
        "It trims the masked segments of the seqrecords."
        window = self.window
        threshold = self.threshold
        trim_left = self.trim_left
        trim_right = self.trim_right
        try:
            quals = list(get_int_qualities(seq))
        except KeyError:
            msg = "Some of the input sequences do not have qualities: {}"
            msg = msg.format(get_name(seq))
            raise WrongFormatError(msg)
        segments = _get_bad_quality_segments(quals, window, threshold, trim_left, trim_right)
        if segments is not None:
            _add_trim_segments(segments, seq, kind=QUALITY)

        return seq
Example #29
    def _do_trim(self, seq):
        'It trims the masked segments of the seqrecords.'
        window = self.window
        threshold = self.threshold
        trim_left = self.trim_left
        trim_right = self.trim_right
        try:
            quals = list(get_int_qualities(seq))
        except KeyError:
            msg = 'Some of the input sequences do not have qualities: {}'
            msg = msg.format(get_name(seq))
            raise WrongFormatError(msg)
        segments = _get_bad_quality_segments(quals, window, threshold,
                                            trim_left, trim_right)
        if segments is not None:
            _add_trim_segments(segments, seq, kind=QUALITY)

        return seq
Example #30
 def __call__(self, seqs):
     'It trims the masked segments of the SeqWrappers.'
     db_fhand = write_seqs(seqs, file_format='fasta')
     db_fhand.flush()
     params = {'task': 'blastn-short', 'expect': '0.0001'}
     filters = [{'kind': 'score_threshold', 'score_key': 'identity',
                 'min_score': 89},
                {'kind': 'min_length', 'min_num_residues': 13,
                 'length_in_query': False}]
     matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                     program='blastn', filters=filters,
                                     params=params,
                                     elongate_for_global=True)
     for seq in seqs:
         segments = matcher.get_matched_segments_for_read(get_name(seq))
         if segments is not None:
             _add_trim_segments(segments[0], seq, kind=VECTOR)
     return seqs
Example #31
 def __call__(self, seqs):
     'It trims the masked segments of the seqrecords.'
     window = self.window
     threshold = self.threshold
     trim_left = self.trim_left
     trim_right = self.trim_right
     trimmed_seqs = []
     for seq in seqs:
         try:
             quals = list(get_qualities(seq))
         except KeyError:
              msg = 'Some of the input sequences do not have qualities: {}'
              msg = msg.format(get_name(seq))
              raise WrongFormatError(msg)
         segments = _get_bad_quality_segments(quals, window, threshold,
                                             trim_left, trim_right)
         if segments is not None:
             _add_trim_segments(segments, seq, kind=QUALITY)
         trimmed_seqs.append(seq)
     return trimmed_seqs
Example #32
    def __call__(self, seqrecords):
        'It does the work'
        if not seqrecords:
            return seqrecords
        matcher = Blaster(seqrecords, self.blastdb, self._program,
                               self._dbtype, filters=self._filters,
                               params=self._params, remote=self._remote)
        blasts = matcher.blasts
        blastdb = os.path.basename(self.blastdb)
        for seqrecord in seqrecords:
            align_result = blasts.get(get_name(seqrecord), None)
            if not align_result:
                continue
            match_counter = 0
            for match in align_result['matches']:
                subject = match['subject']['name']
                match_counter += 1
                for match_part in match['match_parts']:
                    if match_part['subject_end'] < match_part['subject_start']:
                        strand = -1
                        subject_start = match_part['subject_end']
                        subject_end = match_part['subject_start']
                    else:
                        strand = 1
                        subject_start = match_part['subject_start']
                        subject_end = match_part['subject_end']

                    query_start = match_part['query_start']
                    query_end = match_part['query_end']
                    qualifiers = {}
                    qualifiers['Target'] = {'start': subject_start,
                                            'end': subject_end,
                                            'name': subject}
                    qualifiers['score'] = match_part['scores']['expect']
                    qualifiers['identity'] = match_part['scores']['identity']
                    qualifiers['blastdb'] = blastdb
                    location = FeatureLocation(query_start, query_end, strand)
                    feature = SeqFeature(location=location, type='match_part',
                                         qualifiers=qualifiers,
                                       id='match{0:03d}'.format(match_counter))
                    seqrecord.object.features.append(feature)
        return seqrecords
Example #33
def _read_estcan_result(fhand, result, file_type):
    'It reads a dna or pep ESTscan result file'
    for seq in read_seqs([fhand], file_format='fasta'):
        items = [i.strip() for i in get_description(seq).split(';')]
        strand = -1 if 'minus strand' in items else 1
        start, end = items[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        try:
            seq_orfs = result[seqid]
        except KeyError:
            seq_orfs = {}
            result[seqid] = seq_orfs
        orf_key = (int(start), int(end), strand)
        if orf_key in seq_orfs:
            orf = seq_orfs[orf_key]
        else:
            orf = {}
            seq_orfs[orf_key] = orf
        orf[file_type] = get_str_seq(seq)
Example #34
def _read_estcan_result(fhand, result, file_type):
    'It reads a dna or pep ESTscan result file'
    for seq in read_seqs([fhand], file_format='fasta'):
        items = [i.strip() for i in get_description(seq).split(';')]
        strand = -1 if 'minus strand' in items else 1
        start, end = items[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        try:
            seq_orfs = result[seqid]
        except KeyError:
            seq_orfs = {}
            result[seqid] = seq_orfs
        orf_key = (int(start), int(end), strand)
        if orf_key in seq_orfs:
            orf = seq_orfs[orf_key]
        else:
            orf = {}
            seq_orfs[orf_key] = orf
        orf[file_type] = get_str_seq(seq)
Example #35
    def _get_chrom_lengths(self):
        chrom_lens = OrderedDict()
        if self._ref_fhand is None:
            vcf_fhand = gzip.open(self._reader.fhand.name)
            for line in vcf_fhand:
                line = line.strip()
                if line.startswith('#'):
                    continue
                items = line.split()
                chrom = items[0]
                loc = int(items[1])
                if chrom not in chrom_lens:
                    chrom_lens[chrom] = loc
                else:
                    if loc > chrom_lens[chrom]:
                        chrom_lens[chrom] = loc

        else:
            for read in read_seqs([self._ref_fhand]):
                chrom_lens[get_name(read)] = get_length(read)
        return chrom_lens
Example #36
 def _do_trim(self, seq):
     'It trims the masked segments of the SeqWrappers.'
     segments = self._matcher.get_matched_segments_for_read(get_name(seq))
     if segments is not None:
         _add_trim_segments(segments[0], seq, kind=VECTOR)
     return seq
Example #37
    def test_classify_paired_reads(self):
        reference_seq = GENOME
        #Typical non-chimeric
        query1 = '>seq1 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
        query2 = '>seq1 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'

        #typical chimeric
        query3 = '>seq2 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG'
        query3 += 'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n'
        query4 = '>seq2 r\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATTGATGCTGAACTT'
        query4 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'

        #PE-like chimera. 5' end does not map
        query5 = '>seq3 f\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
        query5 += 'AAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG\n'
        query6 = '>seq3 r\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATTGATGCTGAACTT'
        query6 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'

        #Non chimeric read fragmented into two different sequences
        query7 = '>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
        #first part of f sequence not detected -> unknown instead of mapped
        query7 += 'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n'
        query8 = '>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
        query8 += 'TGAGTAATATTATAGAAAGT\n'

        #Chimeric reads mapping different reference sequence
        query9 = '>seq5 f\nTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGC'
        query9 += 'CTGAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTGTGG\n'
        query10 = '>seq5 r\nACTTATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAA'
        query10 += 'CGTTATCTGCGGTGAAATGATGTTCGCGGAGCTGACTATCGTCGCCTGATGATAAG\n'

        query11 = '>seq6 f\nACGCACTGATTGTGCTAGGGCCACAGTAGCGGAGATGATTAAGCAGCGAC'
        query11 += 'AACTACAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC\n'
        query12 = '>seq6 r\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
        query12 += 'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n'

        #Unknown, 3' end does not map, impossible to know if it is chimeric
        query13 = '>seq7 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query13 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query14 = '>seq7 r\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACGCC'
        query14 += 'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n'

        #chimeric sequences with wrong direction
        query15 = '>seq8 f\nTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGCAACTTCGTCTCTCCA'
        query15 += 'ATCAGCTACCGAATTGGGACCTCTACGGGAGTATGGAACGATTGA\n'
        query16 = '>seq8 r\nAGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query16 += 'GATGATTTGTAGTTCGAGAAGGCCTCAGTCTACCGCGCCGTGGGTGCCCGATCCCT\n'

        #chimeric sequences with wrong direction
        query18 = '>seq9 r\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCT'
        query18 += 'GTGGACTTTCTATAATATTACTCAGAATTGGCAGTTACTCAGATTAAATTCG\n'
        query17 = '>seq9 f\nGCACACCTTGGAAAAGACTCCCGGGATCGGACATGGTCTGGTGATGATTT'
        query17 += 'GTAGTTCGAGAAGGCCTCAGTCTACCGCGCCGTGGGTGCCCGATCCCTCCTCTAGC\n'

        #Unknown, wrong relative positions <== =    =>
        query19 = '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query19 += '\n'
        query20 = '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        query20 += 'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'

        #Unknown, wrong relative positions ==> <=    =
        query21 = '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        query21 += '\n'
        query22 = '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        query22 += 'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'

        forward = query1 + query3 + query5 + query7 + query9 + query11
        forward += query13 + query15 + query17 + query19 + query21
        reverse = query2 + query4 + query6 + query8 + query10 + query12
        reverse += query14 + query16 + query18 + query20 + query22

        f_fhand = NamedTemporaryFile()
        f_fhand.write(forward)
        f_fhand.flush()
        r_fhand = NamedTemporaryFile()
        r_fhand.write(reverse)
        r_fhand.flush()
        paired_fpaths = [f_fhand.name, r_fhand.name]
        ref_fhand = NamedTemporaryFile()
        ref_fhand.write(reference_seq)
        ref_fhand.flush()

        #Kind is given per pair of mates
        bamfile = _sorted_mapped_reads(ref_fhand.name,
                                       paired_fpaths=paired_fpaths)
        result = classify_mapped_reads(bamfile, file_format='fasta')
        mapped = [['seq1.f', 'seq1.r'], ['seq4.f', 'seq4.r']]
        non_contiguous = [['seq2.f', 'seq2.r'], ['seq3.f', 'seq3.r'],
                          ['seq5.f', 'seq5.r'], ['seq6.f', 'seq6.r'],
                          ['seq10.f', 'seq10.r'], ['seq11.f', 'seq11.r'],
                          ['seq8.f', 'seq8.r']]
        unknown = [['seq7.f', 'seq7.r'],
                   ['seq9.f', 'seq9.r'], ['seq4.f', 'seq4.r']]
        expected = {'non_chimeric': mapped, 'chimera': non_contiguous,
                    'unknown': unknown}
        for pair in result:
            try:
                names = [get_name(read) for read in pair[0]]
                assert names in expected[pair[1]]
            except AssertionError:
                str_names = ' '.join(names)
                msg = str_names + ' not expected to be '
                msg += pair[1]
                raise AssertionError(msg)
Example #38
 def _do_check(self, seq):
     return True if get_name(seq) in self.seq_ids else False
Example #39
from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterDustComplexity,
                            seq_to_filterpackets, FilterByRpkm, FilterByBam,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.utils.file_utils import TemporaryDir
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.mapping import get_or_create_bowtie2_index
from crumbs.seqio import read_seq_packets


_seqs_to_names = lambda seqs: [get_name(s) for pair in seqs for s in pair]
_seqs_to_str_seqs = lambda seqs: [get_str_seq(s) for pai in seqs for s in pai]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'
    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seqpackets = [['ACT'], ['CTG', 'TTT']]
        filter_packets = list(seq_to_filterpackets(iter(seqpackets)))
        expected = [[['ACT']], [['CTG'], ['TTT']]]
        assert [p[SEQS_PASSED] for p in filter_packets] == expected
        assert [p[SEQS_FILTERED_OUT] for p in filter_packets] == [[], []]


def _create_seqrecord(string):
Example #40
 def _do_check(self, seq):
     return False if get_name(seq) in self.mapped_reads else True
Example #41
 def _do_check(self, seq):
     segments = self._matcher.get_matched_segments(get_name(seq))
     return True if segments is None else False
Example #42
from Bio.SeqFeature import SeqFeature, FeatureLocation

from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterBlastShort,
                            FilterDustComplexity, seq_to_filterpackets,
                            FilterByRpkm, FilterByBam, FilterAllNs,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.seqio import read_seq_packets


_seqs_to_names = lambda seqs: [get_name(s) for pair in seqs for s in pair]
_seqs_to_str_seqs = lambda seqs: [get_str_seq(s) for pai in seqs for s in pai]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'
    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seqpackets = [['ACT'], ['CTG', 'TTT']]
        filter_packets = list(seq_to_filterpackets(iter(seqpackets)))
        expected = [[('ACT',)], [('CTG',), ('TTT',)]]
        assert [p[SEQS_PASSED] for p in filter_packets] == expected
        assert [p[SEQS_FILTERED_OUT] for p in filter_packets] == [[], []]


def _create_seqrecord(string):
Example #43
 def _do_check(self, seq):
     segments = self._matcher.get_matched_segments(get_name(seq))
     return True if segments is None else False
Example #44
 def _do_check(self, seq):
     return True if get_name(seq) in self.seq_ids else False
Example #45
 def _do_check(self, seq):
     return False if get_name(seq) in self.mapped_reads else True
Example #46
def _get_seq_lengths(fhand):
    return {get_name(seq): get_length(seq) for seq in read_seqs([fhand])}
Example #47
class MatePairSplitter(object):
    'It splits the input sequences with the provided linkers.'

    def __init__(self, linkers=None):
        'The initiator'
        if linkers is None:
            linkers = get_setting('LINKERS')
            linkers = [SeqItem(str(i), '>%d\n%s\n' % (i, l)) for i, l in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                   'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            for seq in split_seqs:
                new_seqs.append(seq)
        return new_seqs

    def _split_by_mate_linker(self, seq, (segments, is_partial)):
        'It splits the seqs using segments'

        if not segments:
            return [copy_seq(seq)]

        elongated_match = is_partial
        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                new_seq = slice_seq(seq, segment_end + 1, None)
                return [new_seq]
            elif segment_end == seq_end:
                new_seq = slice_seq(seq, None, segment_start)
                return [new_seq]
            elif segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)
            else:
                new_seq1 = slice_seq(seq, None, segment_start)
                new_seq2 = slice_seq(seq, segment_end + 1, None)
                if elongated_match:
                    name = get_name(seq) + '_pl'
                else:
                    name = get_name(seq)
                new_seq1 = copy_seq(new_seq1, name=name + r'\1')
                new_seq2 = copy_seq(new_seq2, name=name + r'\2')
                return [new_seq1, new_seq2]
        else:
            seqs = []
            counter = 1
            seq_start = 0
            for segment_start, segment_end in segments:
                if segment_start == 0:
                    continue
                new_seq = slice_seq(seq, seq_start, segment_start)
                seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                new_seq = copy_seq(new_seq, name=seq_name)
                seqs.append(new_seq)
                counter += 1
                seq_start = segment_end + 1
            else:
                if segment_end != get_length(seq) + 1:
                    new_seq = slice_seq(seq, segment_end + 1, None)
                    name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                    new_seq = copy_seq(new_seq, name=name)
                    seqs.append(new_seq)
            return seqs
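A minimal usage sketch of MatePairSplitter with its default linkers, following the packet pattern of the test examples above; the input and output paths are hypothetical, and read_seq_packets and write_seqs come from crumbs.seqio as in the other examples.

splitter = MatePairSplitter()  # linkers default to get_setting('LINKERS')
split_seqs = []
for packet in read_seq_packets([open('mate_pairs.fasta')], 100):
    split_seqs.extend(splitter(packet))
# Split reads keep the \1 and \2 name suffixes shown in the tests above.
write_seqs(split_seqs, open('split_reads.fasta', 'w'), 'fasta')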
Example #48
 def _do_trim(self, seq):
     'It trims the masked segments of the SeqWrappers.'
     segments = self._matcher.get_matched_segments_for_read(get_name(seq))
     if segments is not None:
         _add_trim_segments(segments[0], seq, kind=VECTOR)
     return seq