Esempio n. 1
0
def match_pairs(reads,
                out_fhand,
                orphan_out_fhand,
                out_format,
                ordered=True,
                check_order_buffer_size=0,
                max_reads_memory=None,
                temp_dir=None):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The groups produced by _get_paired_and_orphan are routed by size: the
    groups with two reads are written to out_fhand and the groups with just
    one read (orphans) are written to orphan_out_fhand, both in out_format.
    Groups of any other size are ignored.

    When ordered is True the orphan names are used to check that the reads
    really come sorted by pairs. The names of the first
    check_order_buffer_size orphans are stored and an ItemsNotSortedError is
    raised when an orphan name is found again.

    max_reads_memory and temp_dir are handed to _get_paired_and_orphan.
    Both output file handles are flushed with flush_fhand at the end.
    '''
    not_sorted_msg = 'Reads are not ordered by pairs.Use unordered option'
    n_buffered = 0
    seen_orphans = KeyedSet()
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                    temp_dir)
    for group in groups:
        n_in_group = len(group)
        if n_in_group == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if n_in_group != 1:
            continue

        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            orphan_name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # no pair direction in the name, the whole name is used
            orphan_name = get_name(orphan)

        if not ordered:
            continue

        if n_buffered < check_order_buffer_size:
            # the buffer is still filling up
            # check_add is assumed to return a false value for a name that
            # was already stored -- TODO confirm against KeyedSet
            n_buffered += 1
            in_order = seen_orphans.check_add(orphan_name)
        else:
            # the buffer is full, from now on we only look for repeats
            in_order = orphan_name not in seen_orphans
        if not in_order:
            raise ItemsNotSortedError(not_sorted_msg)

    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Esempio n. 2
0
def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The groups produced by _get_paired_and_orphan are routed by size: the
    groups with two reads are written to out_fhand and the groups with just
    one read (orphans) are written to orphan_out_fhand, both in out_format.
    Groups of any other size are ignored.

    When ordered is True the orphan names are used to check that the reads
    really come sorted by pairs. The names of the first
    check_order_buffer_size orphans are stored and an ItemsNotSortedError is
    raised when an orphan name is found again.

    max_reads_memory and temp_dir are handed to _get_paired_and_orphan.
    Both output file handles are flushed with flush_fhand at the end.
    '''
    not_sorted_msg = 'Reads are not ordered by pairs.Use unordered option'
    n_buffered = 0
    seen_orphans = KeyedSet()
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                    temp_dir)
    for group in groups:
        n_in_group = len(group)
        if n_in_group == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if n_in_group != 1:
            continue

        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            orphan_name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # no pair direction in the name, the whole name is used
            orphan_name = get_name(orphan)

        if not ordered:
            continue

        if n_buffered < check_order_buffer_size:
            # the buffer is still filling up
            # check_add is assumed to return a false value for a name that
            # was already stored -- TODO confirm against KeyedSet
            n_buffered += 1
            in_order = seen_orphans.check_add(orphan_name)
        else:
            # the buffer is full, from now on we only look for repeats
            in_order = orphan_name not in seen_orphans
        if not in_order:
            raise ItemsNotSortedError(not_sorted_msg)

    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Esempio n. 3
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    The reads are grouped two by two with group_pairs. The first read of
    every group is written to out_fhand1 and the second one to out_fhand2,
    both in out_format. The two file handles are flushed at the end.

    It will fail if forward and reverse reads are not alternating.
    '''
    destinations = (out_fhand1, out_fhand2)
    for mates in group_pairs(seqs, n_seqs_in_pair=2):
        # one read per output file, keeping the order found in the group
        for position, fhand in enumerate(destinations):
            write_seqs((mates[position], ), fhand, out_format)
    for fhand in destinations:
        fhand.flush()
Esempio n. 4
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    The reads are grouped two by two with group_pairs. The first read of
    every group is written to out_fhand1 and the second one to out_fhand2,
    both in out_format. The two file handles are flushed at the end.

    It will fail if forward and reverse reads are not alternating.
    '''
    destinations = (out_fhand1, out_fhand2)
    for mates in group_pairs(seqs, n_seqs_in_pair=2):
        # one read per output file, keeping the order found in the group
        for position, fhand in enumerate(destinations):
            write_seqs((mates[position],), fhand, out_format)
    for fhand in destinations:
        fhand.flush()
Esempio n. 5
0
    def _pre_trim(self, trim_packet):
        '''It maps the reads of the packet with bwa mem before the trimming.

        All the reads found in trim_packet[SEQS_PASSED] are written to a
        temporary file created in self._tempdir and that file is given to
        bwa as an interleaved read file. The mapping output goes to a
        temporary bam, requested to be sorted by queryname, that is kept in
        self._bam_fhand.

        The temporary read file is closed even if the mapping fails.
        '''
        reads = [read for group in trim_packet[SEQS_PASSED] for read in group]
        reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
        try:
            write_seqs(reads, reads_fhand)
            # the mapper gets the file by its path, so the buffered content
            # has to be flushed before running it
            reads_fhand.flush()
            bwa = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
            bam_fhand = NamedTemporaryFile(dir=self._tempdir)
            map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                                     tempdir=self._tempdir)

            self._bam_fhand = bam_fhand
        finally:
            # closing the file removes it, this has to happen also on errors
            reads_fhand.close()
Esempio n. 6
0
def filter_duplicates(in_fhands, out_fhand, paired_reads, use_length=None,
                      n_seqs_packet=None, tempdir=None):
    '''It writes to out_fhand the pairs left after removing the repeated ones.

    The pairs are read from in_fhands with _read_pairs and they are compared
    using the key built by _PairKeyGetter(use_length=use_length).

    If n_seqs_packet is None the filtering is done by unique_unordered.
    Otherwise the pairs are first sorted by sorted_items, that receives
    tempdir and n_seqs_packet as max_items_in_memory, and then they are
    filtered by unique.

    A ValueError is raised if in_fhands is empty or None.
    '''
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    all_pairs = _read_pairs(in_fhands, paired_reads)
    key_getter = _PairKeyGetter(use_length=use_length)

    if n_seqs_packet is not None:
        ordered_pairs = sorted_items(all_pairs, key=key_getter,
                                     tempdir=tempdir,
                                     max_items_in_memory=n_seqs_packet)
        non_redundant = unique(ordered_pairs, key=key_getter)
    else:
        non_redundant = unique_unordered(all_pairs, key=key_getter)

    for kept_pair in non_redundant:
        write_seqs(kept_pair, out_fhand)
Esempio n. 7
0
    def __call__(self, seqs):
        '''It splits a list of sequences with the provided linkers.

        The seqs are written to a fasta file that BlasterForFewSubjects uses,
        together with self.linkers, to run a blastn with the blastn-short
        task. The filters ask for a min_length of 13 residues and for an
        identity score threshold of 87.0.

        Every seq for which the matcher returns segments is replaced by the
        seqs returned by _split_by_mate_linker. The seqs for which the
        matcher returns None are kept as they are. The new list is returned.

        seqs is traversed twice, so it should be a list and not an iterator.
        '''
        reads_fhand = write_seqs(seqs, file_format='fasta')
        reads_fhand.flush()

        identity_threshold = 87.0
        min_match_length = 13
        length_filter = {'kind': 'min_length',
                         'min_num_residues': min_match_length,
                         'length_in_query': False,
                         'filter_match_parts': True}
        identity_filter = {'kind': 'score_threshold',
                           'score_key': 'identity',
                           'min_score': identity_threshold}

        matcher = BlasterForFewSubjects(reads_fhand.name, self.linkers,
                                        program='blastn',
                                        filters=[length_filter,
                                                 identity_filter],
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        result = []
        for read in seqs:
            name = get_name(read)
            linker_segments = matcher.get_matched_segments_for_read(name)
            if linker_segments is None:
                # nothing matched in this read, it goes out untouched
                result.append(read)
            else:
                result.extend(self._split_by_mate_linker(read,
                                                         linker_segments))
        return result
Esempio n. 8
0
    def __call__(self, seqs):
        '''It splits a list of sequences with the provided linkers.

        The seqs are written to a fasta file that BlasterForFewSubjects uses,
        together with self.linkers, to run a blastn with the blastn-short
        task. The filters ask for a min_length of 13 residues and for an
        identity score threshold of 87.0.

        Every seq for which the matcher returns segments is replaced by the
        seqs returned by _split_by_mate_linker. The seqs for which the
        matcher returns None are kept as they are. The new list is returned.

        seqs is traversed twice, so it should be a list and not an iterator.
        '''
        reads_fhand = write_seqs(seqs, file_format='fasta')
        reads_fhand.flush()

        identity_threshold = 87.0
        min_match_length = 13
        length_filter = {'kind': 'min_length',
                         'min_num_residues': min_match_length,
                         'length_in_query': False,
                         'filter_match_parts': True}
        identity_filter = {'kind': 'score_threshold',
                           'score_key': 'identity',
                           'min_score': identity_threshold}

        matcher = BlasterForFewSubjects(reads_fhand.name, self.linkers,
                                        program='blastn',
                                        filters=[length_filter,
                                                 identity_filter],
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        result = []
        for read in seqs:
            name = get_name(read)
            linker_segments = matcher.get_matched_segments_for_read(name)
            if linker_segments is None:
                # nothing matched in this read, it goes out untouched
                result.append(read)
            else:
                result.extend(self._split_by_mate_linker(read,
                                                         linker_segments))
        return result
Esempio n. 9
0
    def _pre_trim(self, trim_packet):
        '''It maps the reads of the packet with bwa mem before the trimming.

        All the reads found in trim_packet[SEQS_PASSED] are written to a
        temporary file created in self._tempdir and that file is given to
        bwa as an interleaved read file. The mapping output goes to a
        temporary bam, requested to be sorted by queryname, that is kept in
        self._bam_fhand.

        The temporary read file is closed even if the mapping fails.
        '''
        reads = [read for group in trim_packet[SEQS_PASSED] for read in group]
        reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
        try:
            write_seqs(reads, reads_fhand)
            # the mapper gets the file by its path, so the buffered content
            # has to be flushed before running it
            reads_fhand.flush()
            bwa = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
            bam_fhand = NamedTemporaryFile(dir=self._tempdir)
            map_process_to_sortedbam(bwa,
                                     bam_fhand.name,
                                     key='queryname',
                                     tempdir=self._tempdir)

            self._bam_fhand = bam_fhand
        finally:
            # closing the file removes it, this has to happen also on errors
            reads_fhand.close()
Esempio n. 10
0
def _do_blast_2(db_fpath,
                queries,
                program,
                dbtype=None,
                blast_format=None,
                params=None,
                remote=False):
    '''It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.
    For the remote blasts blast_format is upper-cased, so it should be a
    string, and XML is used when it is not given.

    params is an optional dict with extra blast parameters. It is not
    modified, the outfmt is set in a copy.

    It returns a tuple with the blast parser and the file handle of the
    temporary file used for the blast output.
    '''

    query_fhand = write_seqs(queries, file_format='fasta')
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = 'XML' if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = [
                'query',
                'subject',
                'query_length',
                'subject_length',
                'query_start',
                'query_end',
                'subject_start',
                'subject_end',
                'expect',
                'identity',
            ]
        fmt = generate_tabblast_format(blast_format)

    # we work on a copy, the dict owned by the caller should not get an
    # outfmt key as a side effect of this call
    params = {} if params is None else dict(params)
    params['outfmt'] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix='.blast')
    do_blast(query_fhand.name,
             blastdb,
             program,
             blast_fhand.name,
             params,
             remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)

    # the fhand is returned too, so the caller can keep the output file
    # open while the parser is in use
    return blasts, blast_fhand
Esempio n. 11
0
def filter_duplicates(in_fhands,
                      out_fhand,
                      paired_reads,
                      use_length=None,
                      n_seqs_packet=None,
                      tempdir=None):
    '''It writes to out_fhand the pairs left after removing the repeated ones.

    The pairs are read from in_fhands with _read_pairs and they are compared
    using the key built by _PairKeyGetter(use_length=use_length).

    If n_seqs_packet is None the filtering is done by unique_unordered.
    Otherwise the pairs are first sorted by sorted_items, that receives
    tempdir and n_seqs_packet as max_items_in_memory, and then they are
    filtered by unique.

    A ValueError is raised if in_fhands is empty or None.
    '''
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    all_pairs = _read_pairs(in_fhands, paired_reads)
    key_getter = _PairKeyGetter(use_length=use_length)

    if n_seqs_packet is not None:
        ordered_pairs = sorted_items(all_pairs,
                                     key=key_getter,
                                     tempdir=tempdir,
                                     max_items_in_memory=n_seqs_packet)
        non_redundant = unique(ordered_pairs, key=key_getter)
    else:
        non_redundant = unique_unordered(all_pairs, key=key_getter)

    for kept_pair in non_redundant:
        write_seqs(kept_pair, out_fhand)
Esempio n. 12
0
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqs are written by write_seqs to a fasta file that is given to the
    estscan binary as its last argument. pep_out_fpath is passed with -t,
    dna_out_fpath with -o and matrix_fpath with -M.

    The fasta file is closed even if the estscan run fails.
    '''
    seq_fhand = write_seqs(seqs, file_format='fasta')
    try:
        # estscan gets the file by its path, so the content has to be
        # flushed before running it
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
               matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        # without the finally the fhand was left open when estscan failed
        seq_fhand.close()
Esempio n. 13
0
    def test_seqitems_io(self):
        '''It checks the different seq class streams IO.

        The same fasta text is read as SeqItems and as SeqRecords and it has
        to be written back unchanged. Asking for SeqItems with an output
        format different from the input one has to raise a ValueError.
        '''
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        in_fhand = StringIO(fasta)
        seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQITEM]))
        assert seqs[0].kind == SEQITEM
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
        assert seqs[0].object.name == 's1'

        # SeqRecord
        in_fhand = StringIO(fasta)
        seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQRECORD]))
        assert seqs[0].kind == SEQRECORD
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, 'fasta')
        assert out_fhand.getvalue() == fasta

        # seqitem not possible with different input and output formats
        in_fhand = StringIO(fasta)
        try:
            seqs = list(read_seqs([in_fhand], out_format='fastq',
                                  prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # seqitem with the same input and output format
        in_fhand = StringIO(fasta)
        seqs = list(read_seqs([in_fhand], out_format='fasta',
                              prefered_seq_classes=[SEQITEM]))
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
Esempio n. 14
0
    def _setup_checks(self, filterpacket):
        '''It maps the reads in the packet with bowtie2.

        The reads in filterpacket[SEQS_PASSED] are written to a temporary
        file that is mapped, as unpaired reads, against self._index_fpath
        using self.threads. The mapping goes to a temporary bam and the
        result of _get_mapped_reads for that bam and self.min_mapq is stored
        in self.mapped_reads.

        The file format and the extra bowtie2 parameters are chosen looking
        only at the first read:
            - SeqRecord with phred_quality: fastq.
            - SeqRecord without phred_quality: fasta and -f.
            - SeqItem: the format given by get_file_format, adding --phred64
              for the illumina formats and -f for the fasta ones.

        It raises a RuntimeError for the SeqItems whose format is not
        recognised as illumina, fasta or fastq and a NotImplementedError for
        any other seq class.
        '''
        index_fpath = self._index_fpath
        seqs = [read for group in filterpacket[SEQS_PASSED] for read in group]
        first_read = seqs[0]
        seq_class = first_read.kind
        extra_params = []
        # Which format do we need for the bowtie2 input read file fasta or
        # fastq?
        if seq_class == SEQRECORD:
            # a plain membership test is equivalent to looking in viewkeys()
            # and, unlike viewkeys, it also works in python 3
            if 'phred_quality' in first_read.object.letter_annotations:
                file_format = 'fastq'
            else:
                extra_params.append('-f')
                file_format = 'fasta'
        elif seq_class == SEQITEM:
            file_format = get_file_format(first_read)
            if 'illumina' in file_format:
                extra_params.append('--phred64')
            elif 'fasta' in file_format:
                extra_params.append('-f')
            elif 'fastq' in file_format:
                pass
            else:
                msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
                msg += 'files are required'
                raise RuntimeError(msg)
        else:
            raise NotImplementedError()

        reads_fhand = NamedTemporaryFile(suffix=file_format)
        write_seqs(seqs, reads_fhand, file_format=file_format)
        # the mapper gets the file by its path, so we flush the content
        reads_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_process = map_with_bowtie2(index_fpath,
                                       unpaired_fpath=reads_fhand.name,
                                       extra_params=extra_params,
                                       threads=self.threads)
        map_process_to_bam(map_process, bam_fhand.name)

        self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
Esempio n. 15
0
    def _setup_checks(self, filterpacket):
        '''It maps the reads in the packet with bowtie2.

        The reads in filterpacket[SEQS_PASSED] are written to a temporary
        file that is mapped, as unpaired reads, against self._index_fpath
        using self.threads. The mapping goes to a temporary bam and the
        result of _get_mapped_reads for that bam and self.min_mapq is stored
        in self.mapped_reads.

        The file format and the extra bowtie2 parameters are chosen looking
        only at the first read:
            - SeqRecord with phred_quality: fastq.
            - SeqRecord without phred_quality: fasta and -f.
            - SeqItem: the format given by get_file_format, adding --phred64
              for the illumina formats and -f for the fasta ones.

        It raises a RuntimeError for the SeqItems whose format is not
        recognised as illumina, fasta or fastq and a NotImplementedError for
        any other seq class.
        '''
        index_fpath = self._index_fpath
        seqs = [read for group in filterpacket[SEQS_PASSED] for read in group]
        first_read = seqs[0]
        seq_class = first_read.kind
        extra_params = []
        # Which format do we need for the bowtie2 input read file fasta or
        # fastq?
        if seq_class == SEQRECORD:
            # a plain membership test is equivalent to looking in viewkeys()
            # and, unlike viewkeys, it also works in python 3
            if 'phred_quality' in first_read.object.letter_annotations:
                file_format = 'fastq'
            else:
                extra_params.append('-f')
                file_format = 'fasta'
        elif seq_class == SEQITEM:
            file_format = get_file_format(first_read)
            if 'illumina' in file_format:
                extra_params.append('--phred64')
            elif 'fasta' in file_format:
                extra_params.append('-f')
            elif 'fastq' in file_format:
                pass
            else:
                msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
                msg += 'files are required'
                raise RuntimeError(msg)
        else:
            raise NotImplementedError()

        reads_fhand = NamedTemporaryFile(suffix=file_format)
        write_seqs(seqs, reads_fhand, file_format=file_format)
        # the mapper gets the file by its path, so we flush the content
        reads_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_process = map_with_bowtie2(index_fpath,
                                       unpaired_fpath=reads_fhand.name,
                                       extra_params=extra_params,
                                       threads=self.threads)
        map_process_to_bam(map_process, bam_fhand.name)

        self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
Esempio n. 16
0
def classify_chimeras(in_fhand,
                      index_fpath,
                      mate_distance,
                      out_fhand,
                      chimeras_fhand=None,
                      unknown_fhand=None,
                      tempdir=None,
                      threads=None,
                      settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes to output
    files according to its classification.

    The file behind in_fhand is mapped with bwa mem, as an interleaved read
    file and with the -a and -M options, against index_fpath. The mapping
    goes to a temporary bam, requested to be sorted by queryname, that is
    classified by classify_mapped_reads using settings and mate_distance.

    The NON_CHIMERIC pairs are written to out_fhand. The CHIMERA and UNKNOWN
    pairs are written to chimeras_fhand and unknown_fhand, but only when
    those file handles are given.

    NOTE(review): threads is accepted, but it is not used in this function.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath,
                              interleave_fpath=in_fhand.name,
                              extra_params=['-a', '-M'])
    map_process_to_sortedbam(mapping,
                             sorted_bam_fhand.name,
                             key='queryname',
                             tempdir=tempdir)

    keep_chimeras = chimeras_fhand is not None
    keep_unknown = unknown_fhand is not None
    classified_pairs = classify_mapped_reads(sorted_bam_fhand,
                                             settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if keep_chimeras and kind is CHIMERA:
            write_seqs(pair, chimeras_fhand)
            continue
        if keep_unknown and kind is UNKNOWN:
            write_seqs(pair, unknown_fhand)
Esempio n. 17
0
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqs are written by write_seqs to a fasta file that is given to the
    estscan binary as its last argument. pep_out_fpath is passed with -t,
    dna_out_fpath with -o and matrix_fpath with -M.

    The fasta file is closed even if the estscan run fails.
    '''
    seq_fhand = write_seqs(seqs, file_format='fasta')
    try:
        # estscan gets the file by its path, so the content has to be
        # flushed before running it
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [
            binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
            matrix_fpath, seq_fhand.name
        ]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        # without the finally the fhand was left open when estscan failed
        seq_fhand.close()
Esempio n. 18
0
 def _pre_trim(self, trim_packet):
     '''It prepares the blast matcher used to look for the oligos.

     All the reads found in trim_packet[SEQS_PASSED] are written to a fasta
     file and a BlasterForFewSubjects is built with that file and
     self.oligos. It runs a blastn with the blastn-short task and an expect
     of 0.0001, elongating the matches for global. The filters ask for an
     identity score threshold of 87 and for a min_length of 13 residues.

     The matcher is stored in self._matcher.
     '''
     reads = [read for group in trim_packet[SEQS_PASSED] for read in group]
     reads_fhand = write_seqs(reads, file_format='fasta')
     reads_fhand.flush()

     blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
     identity_filter = {'kind': 'score_threshold',
                        'score_key': 'identity',
                        'min_score': 87}
     length_filter = {'kind': 'min_length',
                      'min_num_residues': 13,
                      'length_in_query': False}
     matcher = BlasterForFewSubjects(reads_fhand.name, self.oligos,
                                     program='blastn',
                                     filters=[identity_filter, length_filter],
                                     params=blast_params,
                                     elongate_for_global=True)
     self._matcher = matcher
Esempio n. 19
0
    def _setup_checks(self, filterpacket):
        '''It prepares the blast matcher used to look for the oligos.

        All the reads found in filterpacket[SEQS_PASSED] are written to a
        fasta file and a BlasterForFewSubjects is built with that file and
        self.oligos. It runs a blastn with the blastn-short task and an
        expect of 0.0001, without elongating the matches for global. The
        filters ask for an identity score threshold of 87 and for a
        min_length of 13 residues.

        The matcher is stored in self._matcher.
        '''
        reads = [read for group in filterpacket[SEQS_PASSED]
                 for read in group]

        # we create a blastdb for these reads and then we use the oligos
        # as the blast query
        reads_fhand = write_seqs(reads, file_format='fasta')
        reads_fhand.flush()

        blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
        identity_filter = {'kind': 'score_threshold',
                           'score_key': 'identity',
                           'min_score': 87}
        length_filter = {'kind': 'min_length',
                         'min_num_residues': 13,
                         'length_in_query': False}
        matcher = BlasterForFewSubjects(reads_fhand.name, self.oligos,
                                        program='blastn',
                                        filters=[identity_filter,
                                                 length_filter],
                                        params=blast_params,
                                        elongate_for_global=False)
        self._matcher = matcher
Esempio n. 20
0
 def _pre_trim(self, trim_packet):
     '''It prepares the blast matcher used to look for the oligos.

     All the reads found in trim_packet[SEQS_PASSED] are written to a fasta
     file and a BlasterForFewSubjects is built with that file and
     self.oligos. It runs a blastn with the blastn-short task and an expect
     of 0.0001, elongating the matches for global. The filters ask for an
     identity score threshold of 87 and for a min_length of 13 residues.

     The matcher is stored in self._matcher.
     '''
     reads = [read for group in trim_packet[SEQS_PASSED] for read in group]
     reads_fhand = write_seqs(reads, file_format='fasta')
     reads_fhand.flush()

     blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
     identity_filter = {'kind': 'score_threshold',
                        'score_key': 'identity',
                        'min_score': 87}
     length_filter = {'kind': 'min_length',
                      'min_num_residues': 13,
                      'length_in_query': False}
     matcher = BlasterForFewSubjects(reads_fhand.name,
                                     self.oligos,
                                     program='blastn',
                                     filters=[identity_filter, length_filter],
                                     params=blast_params,
                                     elongate_for_global=True)
     self._matcher = matcher
Esempio n. 21
0
def _do_blast_2(db_fpath, queries, program, dbtype=None, blast_format=None,
                params=None, remote=False):
    '''It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.
    For the remote blasts blast_format is upper-cased, so it should be a
    string, and XML is used when it is not given.

    params is an optional dict with extra blast parameters. It is not
    modified, the outfmt is set in a copy.

    It returns a tuple with the blast parser and the file handle of the
    temporary file used for the blast output.
    '''

    query_fhand = write_seqs(queries, file_format='fasta')
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = 'XML' if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = ['query', 'subject', 'query_length', 'subject_length',
                            'query_start', 'query_end', 'subject_start',
                            'subject_end', 'expect', 'identity', ]
        fmt = generate_tabblast_format(blast_format)

    # we work on a copy, the dict owned by the caller should not get an
    # outfmt key as a side effect of this call
    params = {} if params is None else dict(params)
    params['outfmt'] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix='.blast')
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params,
             remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)

    # the fhand is returned too, so the caller can keep the output file
    # open while the parser is in use
    return blasts, blast_fhand
Esempio n. 22
0
    def _setup_checks(self, filterpacket):
        '''It prepares the blast matcher used to look for the oligos.

        All the reads found in filterpacket[SEQS_PASSED] are written to a
        fasta file and a BlasterForFewSubjects is built with that file and
        self.oligos. It runs a blastn with the blastn-short task and an
        expect of 0.0001, without elongating the matches for global. The
        filters ask for an identity score threshold of 87 and for a
        min_length of 13 residues.

        The matcher is stored in self._matcher.
        '''
        reads = [read for group in filterpacket[SEQS_PASSED]
                 for read in group]

        # we create a blastdb for these reads and then we use the oligos
        # as the blast query
        reads_fhand = write_seqs(reads, file_format='fasta')
        reads_fhand.flush()

        blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
        identity_filter = {'kind': 'score_threshold',
                           'score_key': 'identity',
                           'min_score': 87}
        length_filter = {'kind': 'min_length',
                         'min_num_residues': 13,
                         'length_in_query': False}
        matcher = BlasterForFewSubjects(reads_fhand.name,
                                        self.oligos,
                                        program='blastn',
                                        filters=[identity_filter,
                                                 length_filter],
                                        params=blast_params,
                                        elongate_for_global=False)
        self._matcher = matcher
Esempio n. 23
0
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes to output
    files according to its classification.

    The file behind in_fhand is mapped with bwa mem, as an interleaved read
    file and with the -a and -M options, against index_fpath. The mapping
    goes to a temporary bam, requested to be sorted by queryname, that is
    classified by classify_mapped_reads using settings and mate_distance.

    The NON_CHIMERIC pairs are written to out_fhand. The CHIMERA and UNKNOWN
    pairs are written to chimeras_fhand and unknown_fhand, but only when
    those file handles are given.

    NOTE(review): threads is accepted, but it is not used in this function.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                              extra_params=['-a', '-M'])
    map_process_to_sortedbam(mapping, sorted_bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    keep_chimeras = chimeras_fhand is not None
    keep_unknown = unknown_fhand is not None
    classified_pairs = classify_mapped_reads(sorted_bam_fhand,
                                             settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if keep_chimeras and kind is CHIMERA:
            write_seqs(pair, chimeras_fhand)
            continue
        if keep_unknown and kind is UNKNOWN:
            write_seqs(pair, unknown_fhand)
Esempio n. 24
0
    def test_seqitems_io(self):
        '''It checks the different seq class streams IO.

        The same fasta text is read as SeqItems and as SeqRecords and it has
        to be written back unchanged. Asking for SeqItems with an output
        format different from the input one has to raise a ValueError.
        '''
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        in_fhand = StringIO(fasta)
        seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQITEM]))
        assert seqs[0].kind == SEQITEM
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
        assert seqs[0].object.name == 's1'

        # SeqRecord
        in_fhand = StringIO(fasta)
        seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQRECORD]))
        assert seqs[0].kind == SEQRECORD
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, 'fasta')
        assert out_fhand.getvalue() == fasta

        # seqitem not possible with different input and output formats
        in_fhand = StringIO(fasta)
        try:
            seqs = list(
                read_seqs([in_fhand],
                          out_format='fastq',
                          prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # seqitem with the same input and output format
        in_fhand = StringIO(fasta)
        seqs = list(
            read_seqs([in_fhand],
                      out_format='fasta',
                      prefered_seq_classes=[SEQITEM]))
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta