def basecall(model, reads, chunksize=4000, overlap=500, batchsize=32, reverse=False):
    """
    Basecalls a set of reads.
    """
    # split long reads into sub-reads, reversed if decoding in the reverse direction
    reads = (
        read_chunk for read in reads
        for read_chunk in split_read(read, chunksize * batchsize)[::-1 if reverse else 1]
    )
    chunks = (
        ((read, start, end), chunk(torch.from_numpy(read.signal[start:end]), chunksize, overlap))
        for (read, start, end) in reads
    )
    batches = (
        (k, compute_scores(model, batch, reverse=reverse))
        for k, batch in batchify(chunks, batchsize=batchsize)
    )
    stitched = (
        (read, stitch(x, chunksize, overlap, end - start, model.stride, reverse=reverse))
        for ((read, start, end), x) in unbatchify(batches)
    )
    transferred = thread_map(transfer, stitched, n_thread=1)
    # reassemble the sub-read results into one entry per read
    return (
        (read, concat([part for k, part in parts]))
        for read, parts in groupby(transferred, lambda x: x[0])
    )

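# The batchify/unbatchify pair above is used under an assumed contract:
# batchify flattens (key, chunks) pairs into fixed-size stacked batches while
# remembering which row belongs to which key, and unbatchify regroups model
# output rows by key. A minimal sketch of that contract (an illustration, not
# the package's actual implementation, which also pads partial batches):
from itertools import groupby

import torch

def batchify_sketch(items, batchsize):
    """Flatten (key, chunks) pairs into (keys, stacked-batch) pairs."""
    rows = [(k, row) for k, chunks in items for row in chunks]
    for i in range(0, len(rows), batchsize):
        batch = rows[i:i + batchsize]
        yield [k for k, _ in batch], torch.stack([row for _, row in batch])

def unbatchify_sketch(batches):
    """Regroup processed rows back into one (key, tensor) pair per key."""
    flat = ((k, row) for keys, out in batches for k, row in zip(keys, out))
    for k, group in groupby(flat, key=lambda kv: kv[0]):
        yield k, torch.stack([row for _, row in group])
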
def basecall(model, reads, chunksize=4000, overlap=100, batchsize=32, reverse=False):
    """
    Basecalls a set of reads.
    """
    chunks = thread_iter(
        ((read, 0, len(read.signal)), chunk(torch.from_numpy(read.signal), chunksize, overlap))
        for read in reads
    )
    batches = thread_iter(batchify(chunks, batchsize=batchsize))
    scores = thread_iter(
        (read, compute_scores(model, batch, reverse=reverse))
        for read, batch in batches
    )
    results = thread_iter(
        (read, stitch_results(scores, end - start, chunksize, overlap, model.stride, reverse))
        for ((read, start, end), scores) in unbatchify(scores)
    )
    return thread_iter(
        (read, apply_stride_to_moves(model, attrs))
        for read, attrs in results
    )

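# thread_iter above is assumed to evaluate a generator on a background thread
# behind a bounded queue, so each stage of the pipeline overlaps with the
# next. A minimal sketch of that idea (an illustrative assumption, not the
# package's actual implementation):
from queue import Queue
from threading import Thread

def thread_iter_sketch(iterable, maxsize=2):
    """Yield items from `iterable`, produced on a background thread."""
    queue = Queue(maxsize)
    sentinel = object()

    def worker():
        for item in iterable:
            queue.put(item)
        queue.put(sentinel)

    Thread(target=worker, daemon=True).start()
    while (item := queue.get()) is not sentinel:
        yield item
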
def main(args): sys.stderr.write("> loading model\n") model = load_model( args.model_directory, args.device, weights=int(args.weights), half=args.half, chunksize=args.chunksize, use_rt=args.cudart, ) samples = 0 num_reads = 0 max_read_size = 4e6 dtype = np.float16 if args.half else np.float32 reader = PreprocessReader(args.reads_directory) writer = DecoderWriterPool(model, beamsize=args.beamsize, fastq=args.fastq, reference=args.reference) t0 = time.perf_counter() sys.stderr.write("> calling\n") with writer, reader, torch.no_grad(): while True: read = reader.queue.get() if read is None: break if len(read.signal) > max_read_size: sys.stderr.write("> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))) continue num_reads += 1 samples += len(read.signal) raw_data = torch.tensor(read.signal.astype(dtype)) chunks = chunk(raw_data, args.chunksize, args.overlap) posteriors = model(chunks.to(args.device)).cpu().numpy() posteriors = stitch(posteriors, args.overlap // model.stride // 2) writer.queue.put((read, posteriors[:raw_data.shape[0]])) duration = time.perf_counter() - t0 sys.stderr.write("> completed reads: %s\n" % num_reads) sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration))) sys.stderr.write("> samples per second %.1E\n" % (samples / duration)) sys.stderr.write("> done\n")
def basecall(model, reads, beamsize=5, chunksize=0, overlap=0, batchsize=1, qscores=False, reverse=None):
    """
    Basecalls a set of reads.
    """
    chunks = (
        (read, chunk(torch.tensor(read.signal), chunksize, overlap)) for read in reads
    )
    scores = unbatchify(
        (k, compute_scores(model, v)) for k, v in batchify(chunks, batchsize)
    )
    scores = (
        (read, {'scores': stitch(v, chunksize, overlap, len(read.signal), model.stride)})
        for read, v in scores
    )
    decoder = partial(decode, decode=model.decode, beamsize=beamsize, qscores=qscores, stride=model.stride)
    basecalls = process_map(decoder, scores, n_proc=4)
    return basecalls

def basecall(model, reads, aligner=None, beamsize=40, chunksize=4000, overlap=500, batchsize=32, qscores=False, reverse=False):
    """
    Basecalls a set of reads.
    """
    _decode = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)
    reads = (
        read_chunk for read in reads
        for read_chunk in split_read(read)[::-1 if reverse else 1]
    )
    chunks = (
        ((read, start, end), chunk(torch.from_numpy(read.signal[start:end]), chunksize, overlap))
        for (read, start, end) in reads
    )
    batches = (
        (k, quantise_int8(compute_scores(model, batch, reverse=reverse)))
        for k, batch in thread_iter(batchify(chunks, batchsize=batchsize))
    )
    stitched = (
        (read, stitch(x, chunksize, overlap, end - start, model.stride, reverse=reverse))
        for ((read, start, end), x) in unbatchify(batches)
    )
    transferred = thread_map(transfer, stitched, n_thread=1)
    basecalls = thread_map(_decode, transferred, n_thread=8)
    basecalls = (
        (read, ''.join(seq for k, seq in parts))
        for read, parts in groupby(basecalls, lambda x: (x[0].parent if hasattr(x[0], 'parent') else x[0]))
    )
    basecalls = (
        (read, {'sequence': seq, 'qstring': '?' * len(seq) if qscores else '*', 'mean_qscore': 0.0})
        for read, seq in basecalls
    )
    if aligner:
        return align_map(aligner, basecalls)
    return basecalls

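# quantise_int8 above is assumed to compress the float score tensors to int8
# (plus a scale factor) so they are cheap to move off the GPU and hand to the
# beam-search decoder. A hedged sketch of that idea, not the actual helper:
import torch

def quantise_int8_sketch(scores):
    """Scale float scores into the int8 range; return (int8 tensor, scale)."""
    scale = max(scores.abs().max().item() / 127.0, 1e-8)  # guard against all-zero input
    return (scores / scale).round().to(torch.int8), scale
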
def basecall(model, reads, aligner=None, beamsize=40, chunksize=4000, overlap=500, batchsize=32, qscores=False):
    """
    Basecalls a set of reads.
    """
    split_read_length = 400000
    _stitch = partial(
        stitch,
        start=overlap // 2 // model.stride,
        end=(chunksize - overlap // 2) // model.stride,
    )
    _decode = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)
    reads = (
        ((read, i), x) for read in reads
        for (i, x) in enumerate(torch.split(torch.from_numpy(read.signal), split_read_length))
    )
    chunks = (
        (read, chunk(signal, chunksize, overlap, pad_start=True))
        for (read, signal) in reads
    )
    batches = (
        (read, quantise_int8(compute_scores(model, batch)))
        for read, batch in thread_iter(batchify(chunks, batchsize=batchsize))
    )
    stitched = ((read, _stitch(x)) for (read, x) in unbatchify(batches))
    transferred = thread_map(transfer, stitched, n_thread=1)
    basecalls = thread_map(_decode, transferred, n_thread=8)
    basecalls = (
        (read, ''.join(seq for k, seq in parts))
        for read, parts in groupby(basecalls, lambda x: x[0][0])
    )
    basecalls = (
        (read, {'sequence': seq, 'qstring': '?' * len(seq) if qscores else '*', 'mean_qscore': 0.0})
        for read, seq in basecalls
    )
    if aligner:
        return align_map(aligner, basecalls)
    return basecalls

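# Example of consuming the generator returned by basecall(). The model path is
# illustrative, and `load_model`/`get_reads` are assumed to be the loader
# helpers used elsewhere in this module; the 'sequence' field matches the
# dicts built above.
if __name__ == '__main__':
    model = load_model('models/dna_r9.4.1', 'cuda')
    for read, res in basecall(model, get_reads('reads/')):
        sys.stdout.write('>%s\n%s\n' % (read.read_id, res['sequence']))
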
def main(args): if args.save_ctc and not args.reference: sys.stderr.write("> a reference is needed to output ctc training data\n") exit(1) if args.save_ctc: args.overlap = 900 args.chunksize = 3600 sys.stderr.write("> loading model\n") model = load_model( args.model_directory, args.device, weights=int(args.weights), half=args.half, chunksize=args.chunksize, use_rt=args.cudart, ) if args.reference: sys.stderr.write("> loading reference\n") aligner = Aligner(args.reference, preset='ont-map') if not aligner: sys.stderr.write("> failed to load/build index\n") sys.exit(1) else: aligner = None samples = 0 num_reads = 0 max_read_size = 4e6 dtype = np.float16 if args.half else np.float32 ctc_writer = CTCWriter(model, aligner) reader = PreprocessReader(args.reads_directory) writer = DecoderWriterPool(model, beamsize=args.beamsize, fastq=args.fastq, aligner=aligner) t0 = time.perf_counter() sys.stderr.write("> calling\n") with writer, ctc_writer, reader, torch.no_grad(): while True: read = reader.queue.get() if read is None: break if len(read.signal) > max_read_size: sys.stderr.write("> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))) continue num_reads += 1 samples += len(read.signal) raw_data = torch.tensor(read.signal.astype(dtype)) chunks = chunk(raw_data, args.chunksize, args.overlap) posteriors_ = model(chunks.to(args.device)).cpu().numpy() posteriors = stitch(posteriors_, args.overlap // model.stride // 2) writer.queue.put((read, posteriors[:raw_data.shape[0]])) if args.save_ctc and len(raw_data) > args.chunksize: ctc_writer.queue.put((chunks.numpy(), posteriors_)) duration = time.perf_counter() - t0 sys.stderr.write("> completed reads: %s\n" % num_reads) sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration))) sys.stderr.write("> samples per second %.1E\n" % (samples / duration)) sys.stderr.write("> done\n")
def main(args): if args.save_ctc and not args.reference: sys.stderr.write( "> a reference is needed to output ctc training data\n") exit(1) if args.save_ctc: args.overlap = 900 args.chunksize = 3600 sys.stderr.write("> loading model\n") model = load_model( args.model_directory, args.device, weights=int(args.weights), half=args.half, chunksize=args.chunksize, use_rt=args.cudart, ) if args.reference: sys.stderr.write("> loading reference\n") aligner = Aligner(args.reference, preset='ont-map') if not aligner: sys.stderr.write("> failed to load/build index\n") sys.exit(1) write_sam_header(aligner) else: aligner = None # with open(summary_file(), 'w') as summary: # write_summary_header(summary, alignment=aligner) samples = 0 num_reads = 0 max_read_size = 4e6 read_ids = column_to_set(args.read_ids) dtype = np.float16 if args.half else np.float32 reader = ProcessIterator(get_reads(args.reads_directory, read_ids=read_ids, skip=args.skip), progress=True) writer = ProcessPool(DecoderWriter, model=model, aligner=aligner, beamsize=args.beamsize, fastq=args.fastq) ctc_writer = CTCWriter(model, aligner, min_coverage=args.ctc_min_coverage, min_accuracy=args.ctc_min_accuracy) t0 = time.perf_counter() sys.stderr.write("> calling\n") with writer, ctc_writer, reader, torch.no_grad(): while True: read = reader.queue.get() if read is None: break if len(read.signal) > max_read_size: sys.stderr.write("> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))) continue num_reads += 1 samples += len(read.signal) raw_data = torch.tensor(read.signal.astype(dtype)) print('bonito: raw_data.shape: ', raw_data.shape) chunks = chunk(raw_data, args.chunksize, args.overlap) posteriors_ = model(chunks.to(args.device)).cpu().numpy() posteriors = stitch(posteriors_, args.overlap // model.stride // 2) if args.write_basecall: writer.queue.put((read, posteriors[:raw_data.shape[0]])) if args.save_ctc and len(raw_data) > args.chunksize: ctc_writer.queue.put((chunks.numpy(), posteriors_)) print('bonito: posteriors.shape', posteriors.shape) posteriors.tofile(args.post_file) duration = time.perf_counter() - t0 sys.stderr.write("> completed reads: %s\n" % num_reads) sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration))) sys.stderr.write("> samples per second %.1E\n" % (samples / duration)) sys.stderr.write("> done\n")