def benchmark_biopython_adapter(fh):
    total_seq = 0
    t0 = time.time()
    from fastqandfurious import fastqandfurious
    from fastqandfurious._fastqandfurious import arrayadd_b
    from Bio.SeqRecord import SeqRecord
    from array import array

    def biopython_entryfunc(buf, posarray, globaloffset):
        # Build a Biopython SeqRecord directly from the buffer slices.
        name = buf[posarray[0]:posarray[1]].decode('ascii')
        quality = array('b')
        quality.frombytes(buf[posarray[4]:posarray[5]])
        arrayadd_b(quality, -33)  # remove the Phred ASCII offset
        entry = SeqRecord(seq=buf[posarray[2]:posarray[3]].decode('ascii'),
                          id=name, name=name,
                          letter_annotations={'phred_quality': quality})
        return entry

    bufsize = 20000
    it = fastqandfurious.readfastq_iter(fh, bufsize, biopython_entryfunc)
    for i, e in enumerate(it):
        total_seq += len(e.seq)
        if i % REFRESH_RATE == 0:
            t1 = time.time()
            print('\r%.2fMB/s' % (total_seq / 1E6 / (t1 - t0)),
                  end='', flush=True)
    print()
    print('%i entries in %.3f seconds.' % (i + 1, time.time() - t0))
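
# A hedged invocation sketch for the benchmark above. REFRESH_RATE is a
# module-level constant that is not shown in this section; its value and
# the file path below are illustrative assumptions, not the project's
# actual settings.
import time

REFRESH_RATE = 10000  # assumed refresh interval for the progress display

if __name__ == '__main__':
    with open('reads.fastq', 'rb') as fh:
        benchmark_biopython_adapter(fh)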

def _fastqandfurious_c_iter(fn, mode, buffering, bufsize):
    from fastqandfurious import fastqandfurious, _fastqandfurious
    openfunc = _opener(fn)
    with open(fn, mode, buffering=buffering) as f:
        with openfunc(f) as fh:
            it = fastqandfurious.readfastq_iter(
                fh, bufsize, _entrypos=_fastqandfurious.entrypos)
            for i, (header, sequence, quality) in enumerate(it):
                yield (i, header, sequence)
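
# `_opener` is referenced throughout this section but defined elsewhere.
# A minimal sketch of plausible behavior, assuming compression is chosen
# by file extension; this is a guess, not the project's implementation.
# Note it must accept either a path or an already-open binary file,
# since callers use both `openfunc(filename, mode=mode)` and `openfunc(f)`.
import gzip
import lzma

def _opener(fn):
    def passthrough(fn_or_fh, mode='rb'):
        # open a path, or hand back an already-open file object unchanged
        return open(fn_or_fh, mode) if isinstance(fn_or_fh, str) else fn_or_fh
    if fn.endswith('.gz'):
        return gzip.open
    elif fn.endswith('.xz'):
        return lzma.open
    else:
        return passthrough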

def _test_readfastq_iter(filename, bufsize, entrypos):
    with open(filename, 'rb') as fh, open(filename, 'rt') as fh_bp:
        for entry, entry_bp in zip(
                fastqandfurious.readfastq_iter(fh, bufsize,
                                               _entrypos=entrypos),
                SeqIO.parse(fh_bp, "fastq")):
            header, sequence, quality = entry
            assert header == entry_bp.description.encode('ascii')
            assert sequence == str(entry_bp.seq).encode('ascii')
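
# A hedged sketch of driving the helper above from pytest, assuming both
# a pure-Python `fastqandfurious.entrypos` and the C
# `_fastqandfurious.entrypos` are importable; the buffer sizes and the
# data path are illustrative only.
import pytest
from fastqandfurious import fastqandfurious, _fastqandfurious

@pytest.mark.parametrize('bufsize', (600, 700, 20000))
@pytest.mark.parametrize('entrypos', (fastqandfurious.entrypos,
                                      _fastqandfurious.entrypos))
def test_readfastq_iter(entrypos, bufsize):
    _test_readfastq_iter('data/test.fq', bufsize, entrypos)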

def _fastqandfurious_iter(fn, mode, buffering):
    from fastqandfurious import fastqandfurious
    bufsize = int(5E4)
    openfunc = _opener(fn)
    with open(fn, mode, buffering=buffering) as f:
        with openfunc(f) as fh:
            it = fastqandfurious.readfastq_iter(fh, bufsize)
            for i, e in enumerate(it):
                yield (i, e.header, e.sequence)

def benchmark_faf(fh, bufsize: int = int(2**16)):
    from fastqandfurious import fastqandfurious
    total_seq = 0
    t0 = time.time()
    it = fastqandfurious.readfastq_iter(fh, bufsize)
    for i, e in enumerate(it):
        total_seq += len(e[1])
        if i % REFRESH_RATE == 0:
            t1 = time.time()
            print('\r%.2fMB/s' % (total_seq / 1E6 / (t1 - t0)),
                  end='', flush=True)
    print()
    print('%i entries in %.3f seconds.' % (i + 1, time.time() - t0))
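
# With the default entryfunc, readfastq_iter appears to yield
# (header, sequence, quality) tuples of bytes, which is why the loop above
# measures len(e[1]). A minimal invocation sketch; the path is a
# placeholder and the file is assumed uncompressed.
with open('reads.fastq', 'rb') as fh:
    benchmark_faf(fh)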

def _test_readfastq_iter(filename, fixmultiline, entrypos, bufsize):
    # (the third parameter was named `func` but the body uses it as the
    # entry-position callable, so it is named `entrypos` here)
    with open(filename, 'rb') as fh, open(filename, 'rt') as fh_bp:
        entry_iter = zip(
            fastqandfurious.readfastq_iter(fh, bufsize, _entrypos=entrypos),
            SeqIO.parse(fh_bp, "fastq"))
        for entry, entry_bp in entry_iter:
            header, sequence, quality = entry
            assert header == entry_bp.description.encode('ascii')
            if fixmultiline:
                assert (sequence.replace(b'\n', b'')
                        == str(entry_bp.seq).encode('ascii'))
            else:
                assert sequence == str(entry_bp.seq).encode('ascii')

def _test_readfastq_abspos(filename, bufsize, entrypos):
    with open(filename, 'rb') as fh, \
         open(filename, 'rt') as fh_bp, \
         open(filename, 'rb') as fh2:
        data = fh2.read()
        for i, (posarray, entry_bp) in enumerate(
                zip(
                    fastqandfurious.readfastq_iter(
                        fh, bufsize,
                        entryfunc=fastqandfurious.entryfunc_abspos,
                        _entrypos=entrypos),
                    SeqIO.parse(fh_bp, "fastq"))):
            header = data[(posarray[0] + 1):posarray[1]]
            sequence = data[posarray[2]:posarray[3]]
            assert header == entry_bp.description.encode('ascii')
            assert sequence == str(entry_bp.seq).encode('ascii')
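
# With entryfunc=entryfunc_abspos the iterator yields absolute byte
# offsets into the file rather than entry contents. The test above implies
# the layout (header begin, header end, sequence begin, sequence end,
# quality begin, quality end), with the header offset pointing at the '@'
# marker itself, hence `posarray[0] + 1`. A hedged helper built on that
# assumed layout:
def record_from_abspos(data: bytes, posarray):
    header = data[posarray[0] + 1:posarray[1]]
    sequence = data[posarray[2]:posarray[3]]
    quality = data[posarray[4]:posarray[5]]
    return header, sequence, quality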

def change_readnames(data):
    """Add "/1" or "/2" to paired-end read names in a FASTQ file generated
    by an Illumina HiSeq, the format Trans-ABySS expects as input."""
    if data.endswith('gz'):
        input_file = gzip.open(data)
    else:
        input_file = open(data, 'rb')  # readfastq_iter needs a binary stream
    bufsize = 20000
    count = 0
    it = fastqandfurious.readfastq_iter(input_file, bufsize,
                                        fastqandfurious.entryfunc)
    for entry in it:
        count += 1
    print(count, "reads in", data)
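
# The docstring above promises mate suffixes, but the body only counts
# entries. A hedged sketch of the renaming step, assuming the mate number
# (1 or 2) comes from the caller and that the suffix belongs on the first
# whitespace-delimited token of the header; `_append_mate_suffix` is a
# hypothetical helper, not part of the original script.
def _append_mate_suffix(header: bytes, mate: int) -> bytes:
    name = header.split(b' ', 1)[0]
    return name + b'/%d' % mate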

def run_speed(args):
    print('Running benchmark on file %s' % args.filename)
    if not args.no_screed:
        print('---')
        print('screed:')
        try:
            benchmark_screed(args.filename)
        except Exception as e:
            print('Error: %s' % str(e))
    lst = list()
    if not args.no_biopython:
        lst.append(('biopython', benchmark_biopython, 'rt'))
        lst.append(('biopython_fastqiterator', benchmark_biopython_faster,
                    'rt'))
    if args.with_biopython_adapter:
        lst.append(('biopython_adapter', benchmark_biopython_adapter, 'rb'))
    if not args.no_ngs_plumbing:
        lst.append(('ngs_plumbing', benchmark_ngsplumbing, 'rb'))
    if not args.no_fastqandfurious_python:
        lst.append(('fastqandfurious', benchmark_faf, 'rb'))
    lst.append(('fastqandfurious (w/ C-ext)', benchmark_faf_c, 'rb'))
    lst.append(('fastqandfurious (w/ C-ext and indexing)',
                benchmark_faf_c_index, 'rb'))
    for name, func, mode in lst:
        print('---')
        print(name)
        openfunc = _opener(args.filename)
        if name in ('biopython', 'biopython_fastqiterator'):
            with openfunc(args.filename, mode=mode) as fh:
                try:
                    func(fh)
                except Exception as e:
                    print('Error: %s' % str(e))
        elif name == 'fastqandfurious (w/ C-ext and indexing)':
            import tempfile
            from fastqandfurious import fastqandfurious, _fastqandfurious
            bufsize = args.faf_buffersize
            with tempfile.NamedTemporaryFile(mode='r+b') as fh_index:
                with openfunc(args.filename, mode=mode) as fh:
                    print(' building index...', end='', flush=True)
                    it = fastqandfurious.readfastq_iter(
                        fh, bufsize,
                        entryfunc=fastqandfurious.entryfunc_abspos,
                        _entrypos=_fastqandfurious.entrypos)
                    for i, pos in enumerate(it):
                        pos.tofile(fh_index)
                    fh_index.flush()
                    fh_index.seek(0)
                    print('done.')
                with openfunc(args.filename, mode=mode) as fh:
                    func(fh, fh_index)
        else:
            with open(args.filename, mode,
                      buffering=args.io_buffersize) as f:
                with openfunc(f) as fh:
                    func(fh)

    # (fragment: parser selection; the enclosing `if args.parser == ...`
    # branch is not part of this excerpt)
    try:
        import ngs_plumbing.fastq
    except ImportError as ie:
        print(ie)
        sys.exit(1)
    if args.format == 'FASTQ':
        parser = ngs_plumbing.fastq.read_fastq
    elif args.format == 'FASTA':
        parser = ngs_plumbing.fastq.read_fasta
elif args.parser == 'fastqandfurious':
    try:
        from fastqandfurious import fastqandfurious, _fastqandfurious
    except ImportError as ie:
        print(ie)
        sys.exit(1)
    if args.format == 'FASTQ':
        parser = lambda fh: fastqandfurious.readfastq_iter(
            fh, 20000, _entrypos=_fastqandfurious.entrypos)
    elif args.format == 'FASTA':
        print('Error: no FASTA parser with fastqandfurious')
        sys.exit(1)

cls = MinSketch
seed = 42
hashfun = mashingpumpkins.sourmash.mash_hashfun

if len(args.filename) == 0:
    print('Nothing to do, so nothing was done. Try --help.')
    sys.exit(0)
elif not args.aggregate:
    # empty sketch (will be updated as we process files)
    total_mhs = cls(args.ksize, args.maxsize, hashfun, seed)
    for fn in args.filename:

def main():
    args = parser.parse_args()
    matcher_name = args.matcher
    adapter = args.adapter
    trimFirst = args.trim_first
    trimLast = args.trim_last
    trimTo = args.trim_to
    inFilePath = args.in_file
    outFilePath = args.out_file
    maxThread = args.workers
    chunk = args.chunk
    debugLimit = args.debug_limit

    print()
    if trimFirst == 0:
        print("Not trimming any initial bases")
    else:
        print(f"Trimming the first {trimFirst} bases")
    print(f"Trimming adapter: {adapter}")
    print(f"The matcher '{matcher_name}' is used to find the adapter")
    print("Trimming all bases after the adapter (if present)")
    if trimLast == 0:
        print("Not trimming any other bases after adapter removal")
    else:
        print(f"Trimming the last {trimLast} bases after adapter removal")
    print(f"Saving to file: {outFilePath}")
    print("Used",
          f"{maxThread} workers" if maxThread > 0 else "sequential version")
    print()

    # get the matcher function
    matcher_builder = MATCHER_BUILDER[matcher_name]
    matcher = matcher_builder(adapter, args)

    if maxThread > 0:
        # build the parallel topology: one worker per thread, plus a
        # collector process that writes the trimmed reads out
        process = [None] * maxThread
        queues1 = [None] * maxThread
        queues2 = [None] * maxThread
        for i in range(maxThread):
            queues1[i] = Queue()
            queues2[i] = Queue()
            process[i] = Process(
                target=worker_fun,
                args=(queues1[i], queues2[i],
                      trimFirst, trimLast, trimTo, matcher),
            )
            process[i].start()
        collector = Process(target=collector_fun, args=(outFilePath, queues2))
        collector.start()

        # start reading the file
        t_start = time.perf_counter() * 1000
        with open(inFilePath, "r+b") as infile:
            t = 0
            filled = 0
            sequence = ff.readfastq_iter(infile, fbufsize=50000,
                                         _entrypos=entrypos_c)
            for i, seq in enumerate(sequence):
                p = i % chunk
                if p == 0:
                    partition = [None] * chunk
                    filled = 0
                if i == debugLimit:
                    break
                partition[p] = seq
                filled = p + 1
                if p == chunk - 1:
                    queues1[t].put(partition)
                    t = (t + 1) % maxThread
            if 0 < filled < chunk:
                # flush the last, partially filled chunk (`filled` counts
                # the slots actually set, avoiding an off-by-one that would
                # drop the final read)
                queues1[t].put(partition[:filled])
            print(f"Sent {i} elements to the workers")
        for q in queues1:
            q.put(EOF)
        print("Wait process")
        for p in process:
            p.join()
        collector.join()
        t_end = time.perf_counter() * 1000
        time_match = math.floor(t_end - t_start)
        print(f"Matching time: {time_match} ms")
    else:
        # Sequential version
        t_start = time.perf_counter() * 1000
        with open(inFilePath, "r+b") as infile:
            sequence = ff.readfastq_iter(infile, fbufsize=50000,
                                         _entrypos=entrypos_c)
            with open(outFilePath, "w") as outFile:
                for i, seq in enumerate(sequence):
                    comment = seq[0].decode("utf-8")
                    line = seq[1].decode("utf-8")
                    quality = seq[2].decode("utf-8")
                    match = matcher(line)
                    tFirst = 0
                    tLast = 0
                    count = 0
                    if trimTo and match:
                        # trim alternately from both ends until the
                        # adapter-free read is down to trimTo bases
                        lineLen = len(line[:match])
                        while lineLen > trimTo:
                            if count % 2:
                                tFirst += 1
                            else:
                                tLast += 1
                            count += 1
                            lineLen -= 1
                    tFirst = max(tFirst, trimFirst)
                    tLast = max(tLast, trimLast)
                    if match:
                        line = line[tFirst:match - tLast]
                        quality = quality[tFirst:match - tLast]
                    elif trimTo:
                        line = line[tFirst:tFirst + trimTo]
                        quality = quality[tFirst:tFirst + trimTo]
                    else:
                        # no adapter and no target length: trim ends only
                        # (guards against slicing with trimTo=None)
                        end = len(line) - tLast
                        line = line[tFirst:end]
                        quality = quality[tFirst:end]
                    outFile.write(f"@{comment}\n{line}\n+\n{quality}\n")
        t_end = time.perf_counter() * 1000
        time_match = math.floor(t_end - t_start)
        print(f"Processed {i} elements")
        print(f"Matching time: {time_match} ms")

    # Align results
    if args.aligner:
        print("Start alignment")
        if args.aligner in ("bowtie", "bowtie_htseq"):
            bowtie(args)
        if args.aligner == "bowtie_htseq":
            htseq(args)
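
# A hedged sketch of one entry a MATCHER_BUILDER table like the one used
# above could contain, assuming a matcher maps a read string to the index
# where the adapter starts, with a falsy value when absent (which is how
# `match` is tested in main()); the real builders in this project may
# differ.
def build_exact_matcher(adapter, args):
    def matcher(line):
        pos = line.find(adapter)
        # 0 is returned as None because main() treats 0 as "no match"
        return pos if pos > 0 else None
    return matcher

# e.g. MATCHER_BUILDER = {'exact': build_exact_matcher}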