Example #1
0
def benchmark_biopython_adapter(fh):
    """Benchmark fastqandfurious while building Biopython ``SeqRecord`` entries.

    Each FASTQ entry read from ``fh`` is converted to a ``SeqRecord`` by a
    custom entry function handed to ``fastqandfurious.readfastq_iter``.
    A throughput figure is printed every ``REFRESH_RATE`` entries, and a
    summary line (entry count and elapsed seconds) is printed at the end.

    Args:
        fh: file-like object passed straight to ``readfastq_iter``
            (callers in this file open it in binary mode).
    """
    total_seq = 0
    # The timer deliberately starts before the imports, as in the original
    # benchmark, so import time is part of the measurement.
    t0 = time.time()

    from fastqandfurious import fastqandfurious
    from fastqandfurious._fastqandfurious import arrayadd_b
    from Bio.SeqRecord import SeqRecord
    from array import array

    def biopython_entryfunc(buf, posarray, globaloffset):
        """Build a SeqRecord from the slices of ``buf`` delimited by ``posarray``."""
        name = buf[posarray[0]:posarray[1]].decode('ascii')
        quality = array('b')
        quality.frombytes(buf[posarray[4]:posarray[5]])
        # presumably removes the Phred+33 ASCII offset in place -- TODO confirm
        arrayadd_b(quality, -33)
        entry = SeqRecord(seq=buf[posarray[2]:posarray[3]].decode('ascii'),
                          id=name,
                          name=name,
                          letter_annotations={'phred_quality': quality})
        return entry

    bufsize = 20000
    it = fastqandfurious.readfastq_iter(fh, bufsize, biopython_entryfunc)
    # Pre-bind the index so the summary reports 0 entries (instead of raising
    # NameError) when the input contains no entry at all.
    i = -1
    for i, e in enumerate(it):
        total_seq += len(e.seq)
        if i % REFRESH_RATE == 0:
            elapsed = time.time() - t0
            # A coarse clock can report zero elapsed time on the very first
            # entry; skip the rate display rather than divide by zero.
            if elapsed > 0:
                print('\r%.2fMB/s' % (total_seq/(1E6)/elapsed),
                      end='', flush=True)
    print()
    print('%i entries in %.3f seconds.' % (i+1, time.time()-t0))
Example #2
0
def _fastqandfurious_c_iter(fn, mode, buffering, bufsize):
    """Yield ``(index, header, sequence)`` for each entry of the file ``fn``.

    The file is opened with ``open(fn, mode, buffering=buffering)``, wrapped
    by whatever ``_opener(fn)`` returns, and parsed by
    ``fastqandfurious.readfastq_iter`` using the entry-position function
    from the ``_fastqandfurious`` module. The quality string of each entry
    is read but not yielded.
    """
    from fastqandfurious import fastqandfurious, _fastqandfurious
    wrap = _opener(fn)
    with open(fn, mode, buffering=buffering) as raw:
        with wrap(raw) as stream:
            entries = fastqandfurious.readfastq_iter(
                stream, bufsize, _entrypos=_fastqandfurious.entrypos)
            for index, record in enumerate(entries):
                # Each record must unpack to exactly three fields.
                header, sequence, _quality = record
                yield (index, header, sequence)
Example #3
0
def _test_readfastq_iter(filename, bufsize, entrypos):
    """Check ``readfastq_iter`` against Biopython's FASTQ parser on ``filename``.

    The file is read twice in lockstep: in binary mode by
    ``fastqandfurious.readfastq_iter`` (with ``entrypos`` as the
    entry-position function) and in text mode by ``SeqIO.parse``. For every
    pair of entries the header and the sequence must be equal once the
    Biopython values are ASCII-encoded.
    """
    with open(filename, 'rb') as fh, open(filename, 'rt') as fh_bp:
        parsed = fastqandfurious.readfastq_iter(fh, bufsize,
                                                _entrypos=entrypos)
        reference = SeqIO.parse(fh_bp, "fastq")
        for (header, sequence, quality), entry_bp in zip(parsed, reference):
            expected_header = entry_bp.description.encode('ascii')
            assert header == expected_header
            expected_sequence = str(entry_bp.seq).encode('ascii')
            assert sequence == expected_sequence
Example #4
0
def _fastqandfurious_iter(fn, mode, buffering):
    """Yield ``(index, header, sequence)`` for each entry of the file ``fn``.

    The file is opened with ``open(fn, mode, buffering=buffering)``, wrapped
    by whatever ``_opener(fn)`` returns, and parsed by
    ``fastqandfurious.readfastq_iter`` with a fixed buffer size of 50000
    and the library's default entry handling.
    """
    from fastqandfurious import fastqandfurious
    bufsize = int(5E4)
    wrap = _opener(fn)
    with open(fn, mode, buffering=buffering) as raw:
        with wrap(raw) as stream:
            entries = fastqandfurious.readfastq_iter(stream, bufsize)
            for index, entry in enumerate(entries):
                yield (index, entry.header, entry.sequence)
Example #5
0
def benchmark_faf(fh, bufsize: int = int(2**16)):
    """Benchmark ``fastqandfurious.readfastq_iter`` with its default settings.

    Iterates over every entry read from ``fh``, accumulating the length of
    the second field of each entry. A throughput figure is printed every
    ``REFRESH_RATE`` entries, and a summary line (entry count and elapsed
    seconds) is printed at the end.

    Args:
        fh: file-like object passed straight to ``readfastq_iter``.
        bufsize: buffer size handed to ``readfastq_iter``.
    """
    from fastqandfurious import fastqandfurious
    total_seq = 0
    t0 = time.time()
    it = fastqandfurious.readfastq_iter(fh, bufsize)
    # Pre-bind the index so the summary reports 0 entries (instead of raising
    # NameError) when the input contains no entry at all.
    i = -1
    for i, e in enumerate(it):
        total_seq += len(e[1])
        if i % REFRESH_RATE == 0:
            elapsed = time.time() - t0
            # A coarse clock can report zero elapsed time on the very first
            # entry; skip the rate display rather than divide by zero.
            if elapsed > 0:
                print('\r%.2fMB/s' % (total_seq/(1E6)/elapsed),
                      end='', flush=True)
    print()
    print('%i entries in %.3f seconds.' % (i+1, time.time()-t0))
Example #6
0
def _test_readfastq_iter(filename, fixmultiline, func, bufsize):
    """Check ``readfastq_iter`` against Biopython's FASTQ parser on ``filename``.

    The file is read twice in lockstep: in binary mode by
    ``fastqandfurious.readfastq_iter`` and in text mode by ``SeqIO.parse``.
    Headers must match for every entry, and so must sequences.

    Args:
        filename: path of the FASTQ file to read.
        fixmultiline: when true, newlines are stripped from the parsed
            sequence before it is compared with Biopython's sequence.
        func: entry-position function passed as ``_entrypos``. The original
            code ignored this parameter and referenced an ``entrypos`` name
            that is not defined in this function.
        bufsize: buffer size handed to ``readfastq_iter``.
    """
    with open(filename, 'rb') as fh, open(filename, 'rt') as fh_bp:
        entry_iter = zip(
            fastqandfurious.readfastq_iter(fh, bufsize, _entrypos=func),
            SeqIO.parse(fh_bp, "fastq"))
        for entry, entry_bp in entry_iter:
            header, sequence, quality = entry
            assert header == entry_bp.description.encode('ascii')

            expected_sequence = str(entry_bp.seq).encode('ascii')
            if fixmultiline:
                # Multi-line records keep their embedded newlines in the
                # parsed sequence; Biopython's sequence has none.
                assert sequence.replace(b'\n', b'') == expected_sequence
            else:
                assert sequence == expected_sequence
Example #7
0
def _test_readfastq_abspos(filename, bufsize, entrypos):
    """Check the positions produced with ``entryfunc_abspos`` on ``filename``.

    ``readfastq_iter`` is run with ``fastqandfurious.entryfunc_abspos`` so
    that each item is an array of positions. Those positions are used to
    slice the complete file content, and the resulting header and sequence
    are compared with what Biopython's ``SeqIO.parse`` returns for the same
    entry.
    """
    with open(filename, 'rb') as fh, \
         open(filename, 'rt') as fh_bp, \
         open(filename, 'rb') as fh2:
        # Whole file content, indexed with the positions yielded below.
        data = fh2.read()
        positions = fastqandfurious.readfastq_iter(
            fh,
            bufsize,
            entryfunc=fastqandfurious.entryfunc_abspos,
            _entrypos=entrypos)
        reference = SeqIO.parse(fh_bp, "fastq")
        for posarray, entry_bp in zip(positions, reference):
            # The header slice starts one byte after posarray[0].
            header_start = posarray[0] + 1
            header = data[header_start:posarray[1]]
            sequence = data[posarray[2]:posarray[3]]
            assert header == entry_bp.description.encode('ascii')
            assert sequence == str(entry_bp.seq).encode('ascii')
Example #8
0
def change_readnames(data):
    r"""Count the reads in the FASTQ file ``data``.

    Intended to add "\1" or "\2" to paired end read names in a fastq file
    generated by Illumina Hiseq, the format needed as input for Trans-ABySS.
    The renaming itself is not implemented in this function yet: it only
    iterates over the entries and prints how many there are.

    Args:
        data: path of the FASTQ file; a path ending in ``gz`` is opened
            with ``gzip.open``, anything else with the built-in ``open``.
    """
    opener = gzip.open if data.endswith('gz') else open

    bufsize = 20000
    count = 0
    # Both branches open in binary mode: gzip.open already defaulted to it,
    # and every other readfastq_iter caller here passes a binary handle.
    # The context manager closes the file, which the original never did.
    with opener(data, 'rb') as input_file:
        it = fastqandfurious.readfastq_iter(input_file, bufsize,
                                            fastqandfurious.entryfunc)
        for entry in it:
            # NOTE(review): looks like leftover debug output (prints the
            # iterator's type once per entry) -- kept to preserve output.
            print(type(it))
            count += 1
    print(count, "reads in", data)
Example #9
0
def run_speed(args):
    """Run every enabled FASTQ parsing benchmark on ``args.filename``.

    The screed benchmark (unless disabled) is run first, directly on the
    file name. The remaining benchmarks are collected as
    ``(label, function, open mode)`` triples according to the flags on
    ``args`` and run in order; how the input is opened depends on the
    benchmark:

    * the two Biopython benchmarks get a handle opened by the object
      returned from ``_opener`` directly on the file name, and their
      errors are caught and printed;
    * the indexing benchmark first writes the entry positions to a
      temporary file, then receives a fresh handle plus that index file;
    * every other benchmark gets a handle wrapping a file opened with
      ``args.io_buffersize`` buffering.
    """
    indexed_label = 'fastqandfurious (w/ C-ext and indexing)'
    biopython_labels = ('biopython', 'biopython_fastqiterator')

    print('Running benchmark on file %s' % args.filename)

    if not args.no_screed:
        print('---')
        print('screed:')
        try:
            benchmark_screed(args.filename)
        except Exception as err:
            print('Error: %s' % str(err))

    benchmarks = []
    if not args.no_biopython:
        benchmarks.append(('biopython', benchmark_biopython, 'rt'))
        benchmarks.append(
            ('biopython_fastqiterator', benchmark_biopython_faster, 'rt'))
        if args.with_biopython_adapter:
            benchmarks.append(
                ('biopython_adapter', benchmark_biopython_adapter, 'rb'))
    if not args.no_ngs_plumbing:
        benchmarks.append(('ngs_plumbing', benchmark_ngsplumbing, 'rb'))
    if not args.no_fastqandfurious_python:
        benchmarks.append(('fastqandfurious', benchmark_faf, 'rb'))
    # The two C-extension benchmarks are always run.
    benchmarks.append(('fastqandfurious (w/ C-ext)', benchmark_faf_c, 'rb'))
    benchmarks.append((indexed_label, benchmark_faf_c_index, 'rb'))

    for label, bench, mode in benchmarks:
        print('---')
        print(label)
        openfunc = _opener(args.filename)

        if label == indexed_label:
            import tempfile
            from fastqandfurious import fastqandfurious, _fastqandfurious
            bufsize = args.faf_buffersize
            with tempfile.NamedTemporaryFile(mode='r+b') as fh_index:
                # First pass: dump every entry's position array to the index.
                with openfunc(args.filename, mode=mode) as fh:
                    print('  building index...', end='', flush=True)
                    positions = fastqandfurious.readfastq_iter(
                        fh, bufsize,
                        entryfunc=fastqandfurious.entryfunc_abspos,
                        _entrypos=_fastqandfurious.entrypos)
                    for pos in positions:
                        pos.tofile(fh_index)
                    fh_index.flush()
                    fh_index.seek(0)
                    print('done.')
                # Second pass: the benchmark itself; errors propagate.
                with openfunc(args.filename, mode=mode) as fh:
                    bench(fh, fh_index)
        elif label in biopython_labels:
            with openfunc(args.filename, mode=mode) as fh:
                try:
                    bench(fh)
                except Exception as err:
                    print('Error: %s' % str(err))
        else:
            with open(args.filename, mode,
                      buffering=args.io_buffersize) as raw:
                with openfunc(raw) as fh:
                    # Errors are not caught here; they propagate.
                    bench(fh)
Example #10
0
            import ngs_plumbing.fastq
        except ImportError as ie:
            print(ie)
            sys.exit(1)
        if args.format == 'FASTQ':
            parser = ngs_plumbing.fastq.read_fastq
        elif args.format == 'FASTA':
            parser = ngs_plumbing.fastq.read_fasta
    elif args.parser == 'fastqandfurious':
        try:
            from fastqandfurious import fastqandfurious, _fastqandfurious
        except ImportError as ie:
            print(ie)
            sys.exit(1)
        if args.format == 'FASTQ':
            parser = lambda fh: fastqandfurious.readfastq_iter(
                fh, 20000, _entrypos=_fastqandfurious.entrypos)
        elif args.format == 'FASTA':
            print('Error: no FASTA parser with fastqandfurious')
            sys.exit(1)
    cls = MinSketch
    seed = 42
    hashfun = mashingpumpkins.sourmash.mash_hashfun

    if len(args.filename) == 0:
        print('Nothing to do, so nothing was done. Try --help.')
        sys.exit(0)
    elif not args.aggregate:
        # empty sketch (will be updated as we process files)
        total_mhs = cls(args.ksize, args.maxsize, hashfun, seed)

    for fn in args.filename:
Example #11
0
def main():
    """Trim adapters from a FASTQ file, then optionally align the result.

    Command-line options are read from the module-level ``parser``. A
    summary of the requested trimming is printed, a matcher is built from
    ``MATCHER_BUILDER``, and the input file is processed either:

    * in parallel (``args.workers > 0``): reads are grouped into lists of
      ``args.chunk`` entries and handed round-robin to worker processes
      running ``worker_fun``; a collector process running ``collector_fun``
      receives the workers' output queues and the output path; or
    * sequentially: each read is trimmed in this process and written to
      the output file as a four-line FASTQ record.

    Finally ``bowtie`` and/or ``htseq`` are called depending on
    ``args.aligner``.
    """
    args = parser.parse_args()
    matcher_name = args.matcher
    adapter = args.adapter
    trim_first = args.trim_first
    trim_last = args.trim_last
    trim_to = args.trim_to
    in_file_path = args.in_file
    out_file_path = args.out_file
    max_thread = args.workers
    chunk = args.chunk
    debug_limit = args.debug_limit

    print()
    if trim_first == 0:
        print("Not trimming any initial bases")
    else:
        print(f"Trimming the first {trim_first} bases")
    print(f"Trimming adapter: {adapter}")
    print(f"The matcher '{matcher_name}' is used to find the adapter")
    print("Trimming all bases after the adapter (if present)")
    if trim_last == 0:
        print("Not trimming any other bases after adapter removal")
    else:
        print(f"Trimming the last {trim_last} bases after adapter removal")
    print(f"Saving to file: {out_file_path}")
    print("Used",
          f"{max_thread} workers" if max_thread > 0 else "sequential version")
    print()

    # get the matcher function
    matcher_builder = MATCHER_BUILDER[matcher_name]
    matcher = matcher_builder(adapter, args)

    if max_thread > 0:
        # build the parallel topology: one input and one output queue per
        # worker, plus a single collector reading all the output queues
        process = []
        queues1 = []
        queues2 = []
        for _ in range(max_thread):
            in_queue = Queue()
            out_queue = Queue()
            worker = Process(
                target=worker_fun,
                args=(in_queue, out_queue, trim_first, trim_last, trim_to,
                      matcher),
            )
            worker.start()
            queues1.append(in_queue)
            queues2.append(out_queue)
            process.append(worker)
        collector = Process(target=collector_fun,
                            args=(out_file_path, queues2))
        collector.start()

        # start file read
        t_start = time.perf_counter() * 1000
        sent = 0
        # The input is only read, so plain binary read mode is enough
        # ("r+b" needlessly required write permission on the file).
        with open(in_file_path, "rb") as infile:
            t = 0
            partition = []
            sequence = ff.readfastq_iter(infile,
                                         fbufsize=50000,
                                         _entrypos=entrypos_c)
            for i, seq in enumerate(sequence):
                if i == debug_limit:
                    break

                partition.append(seq)
                sent += 1

                if len(partition) == chunk:
                    queues1[t].put(partition)
                    t = (t + 1) % max_thread
                    partition = []
            # Flush whatever is left. Tracking the partition as a growing
            # list fixes three defects of the index-based version: the last
            # read was dropped when the input ended mid-chunk, a whole
            # partition was lost when the debug limit fell on the last slot
            # of a chunk, and empty input raised NameError.
            if partition:
                queues1[t].put(partition)

        # Report the number of reads, not the index of the last one.
        print(f"Sent {sent} elements to the workers")
        for q in queues1:
            q.put(EOF)

        print("Wait process")
        for p in process:
            p.join()
        collector.join()
        t_end = time.perf_counter() * 1000
        time_match = math.floor(t_end - t_start)

        print(f"Matching time: {time_match}")
    else:
        # Sequential version
        t_start = time.perf_counter() * 1000
        processed = 0
        with open(in_file_path, "rb") as infile:
            sequence = ff.readfastq_iter(infile,
                                         fbufsize=50000,
                                         _entrypos=entrypos_c)
            with open(out_file_path, "w") as out_file:
                for seq in sequence:
                    comment = seq[0].decode("utf-8")
                    line = seq[1].decode("utf-8")
                    quality = seq[2].decode("utf-8")
                    # presumably the adapter position, or a falsy value
                    # when absent -- TODO confirm; note that a position of
                    # 0 is treated as "no match" by the tests below
                    match = matcher(line)
                    t_first = 0
                    t_last = 0

                    if trim_to and match:
                        # Bases beyond trim_to are removed alternately from
                        # the end and the start, the end going first, so
                        # the end takes the extra base when the excess is
                        # odd.
                        excess = max(0, len(line[:match]) - trim_to)
                        t_last = (excess + 1) // 2
                        t_first = excess // 2

                    t_first = max(t_first, trim_first)
                    t_last = max(t_last, trim_last)

                    if match:
                        line = line[t_first:match - t_last]
                        quality = quality[t_first:match - t_last]
                    else:
                        # NOTE(review): if trim_to can be None this raises
                        # TypeError, and a value of 0 yields an empty read
                        # -- confirm the CLI default for --trim-to.
                        line = line[t_first:t_first + trim_to]
                        quality = quality[t_first:t_first + trim_to]

                    out_file.write(f"@{comment}\n{line}\n+\n{quality}\n")
                    processed += 1

        t_end = time.perf_counter() * 1000
        time_match = math.floor(t_end - t_start)
        # Report the number of reads; also safe when the input is empty.
        print(f"Processed {processed} elements")
        print(f"Matching time: {time_match}")

    # Align results
    if args.aligner:
        print("Start alignment")

    if args.aligner in ("bowtie", "bowtie_htseq"):
        bowtie(args)

    if args.aligner == "bowtie_htseq":
        htseq(args)