def main(args):
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        exit(1)

    sys.stderr.write("> loading model\n")
    model = load_model(args.model_directory, args.device, weights=int(args.weights))

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            exit(1)
    else:
        aligner = None

    reads = get_reads(
        args.reads_directory, n_proc=8, recursive=args.recursive,
        read_ids=column_to_set(args.read_ids), skip=args.skip,
    )

    basecall = load_symbol(args.model_directory, "basecall")

    if args.save_ctc:
        # lazy generator: drop reads shorter than one chunk (3600 samples)
        # and split the rest into fixed-size chunks for CTC training data
        reads = (
            chunk for read in reads
            if len(read.signal) >= 3600
            for chunk in read_chunks(read)
        )
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq, batchsize=64)
        writer = CTCWriter(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
            aligner, args.ctc_min_coverage, args.ctc_min_accuracy,
        )
    else:
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq)
        writer = Writer(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
            aligner, fastq=args.fastq,
        )

    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0
    num_samples = sum(num_samples for read_id, num_samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
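# A minimal, self-contained sketch of the chunking step above: reads shorter
# than one chunk (3600 samples) are dropped, and longer reads are split into
# overlapping fixed-size windows. `Read` and this `read_chunks` are
# illustrative stand-ins, not bonito's implementation.
import numpy as np
from dataclasses import dataclass

@dataclass
class Read:
    read_id: str
    signal: np.ndarray

def read_chunks(read, chunksize=3600, overlap=900):
    # Yield fixed-size windows over the raw signal; the hop between window
    # starts is chunksize - overlap, so neighbouring windows share `overlap`
    # samples.
    hop = chunksize - overlap
    for start in range(0, len(read.signal) - chunksize + 1, hop):
        yield Read(read.read_id, read.signal[start:start + chunksize])

if __name__ == "__main__":
    reads = [Read("r1", np.zeros(10000)), Read("r2", np.zeros(1000))]
    # Same generator expression shape as in main() above.
    chunks = (
        chunk for read in reads
        if len(read.signal) >= 3600
        for chunk in read_chunks(read)
    )
    print(sum(1 for _ in chunks))  # r2 is filtered out; r1 yields 3 chunks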
def main(args):
    init(args.seed, args.device)

    # fetch the model weights if a known model name was given but the
    # local copy is missing
    if args.model_directory in models and args.model_directory not in os.listdir(__models__):
        sys.stderr.write("> downloading model\n")
        File(__models__, models[args.model_directory]).download()

    sys.stderr.write(f"> loading model {args.model_directory}\n")
    try:
        model = load_model(
            args.model_directory,
            args.device,
            weights=int(args.weights),
            chunksize=args.chunksize,
            overlap=args.overlap,
            batchsize=args.batchsize,
            quantize=args.quantize,
            use_koi=True,
        )
    except FileNotFoundError:
        sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
        sys.stderr.write("> available models:\n")
        for model in sorted(models):
            sys.stderr.write(f" - {model}\n")
        exit(1)

    if args.verbose:
        sys.stderr.write(f"> model basecaller params: {model.config['basecaller']}\n")

    basecall = load_symbol(args.model_directory, "basecall")

    mods_model = None
    if args.modified_base_model is not None or args.modified_bases is not None:
        sys.stderr.write("> loading modified base model\n")
        mods_model = load_mods_model(
            args.modified_bases, args.model_directory, args.modified_base_model
        )
        sys.stderr.write(f"> {mods_model[1]['alphabet_str']}\n")

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map', best_n=1)
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            exit(1)
    else:
        aligner = None

    fmt = biofmt(aligned=args.reference is not None)

    if args.reference and args.reference.endswith(".mmi") and fmt.name == "cram":
        sys.stderr.write("> error: reference cannot be a .mmi when outputting cram\n")
        exit(1)
    elif args.reference and fmt.name == "fastq":
        sys.stderr.write(f"> warning: did you really want {fmt.aligned} {fmt.name}?\n")
    else:
        sys.stderr.write(f"> outputting {fmt.aligned} {fmt.name}\n")

    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        exit(1)

    # read groups are only needed for sam/bam/cram headers
    if fmt.name != 'fastq':
        groups = get_read_groups(
            args.reads_directory, args.model_directory,
            n_proc=8, recursive=args.recursive,
            read_ids=column_to_set(args.read_ids), skip=args.skip,
            cancel=process_cancel(),
        )
    else:
        groups = []

    reads = get_reads(
        args.reads_directory, n_proc=8, recursive=args.recursive,
        read_ids=column_to_set(args.read_ids), skip=args.skip,
        cancel=process_cancel(),
    )

    if args.max_reads:
        reads = take(reads, args.max_reads)

    if args.save_ctc:
        reads = (
            chunk for read in reads
            for chunk in read_chunks(
                read,
                chunksize=model.config["basecaller"]["chunksize"],
                overlap=model.config["basecaller"]["overlap"],
            )
        )
        ResultsWriter = CTCWriter
    else:
        ResultsWriter = Writer

    results = basecall(
        model, reads, reverse=args.revcomp,
        batchsize=model.config["basecaller"]["batchsize"],
        chunksize=model.config["basecaller"]["chunksize"],
        overlap=model.config["basecaller"]["overlap"],
    )

    # optional stages wrap the results iterator so everything streams
    if mods_model is not None:
        results = process_itemmap(partial(call_mods, mods_model), results)
    if aligner:
        results = align_map(aligner, results, n_thread=os.cpu_count())

    writer = ResultsWriter(
        fmt.mode,
        tqdm(results, desc="> calling", unit=" reads", leave=False),
        aligner=aligner,
        group_key=args.model_directory,
        ref_fn=args.reference,
        groups=groups,
    )

    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0
    num_samples = sum(num_samples for read_id, num_samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
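# A minimal sketch of the streaming composition used above: each optional
# stage (modified-base calling, alignment) wraps the previous iterator, so
# results flow through the pipeline one read at a time without being
# materialized. The stage functions here are hypothetical stand-ins for
# basecall / process_itemmap / align_map, not the real bonito APIs.
def fake_basecall(model, reads):
    for read in reads:
        yield read, {"sequence": "ACGT", "qstring": "!!!!"}

def fake_call_mods(results):
    for read, res in results:
        yield read, {**res, "mods": []}

def fake_align(results):
    for read, res in results:
        yield read, {**res, "mapping": None}

if __name__ == "__main__":
    results = fake_basecall(None, iter(["r1", "r2"]))
    results = fake_call_mods(results)   # only when a mods model is loaded
    results = fake_align(results)       # only when a reference is given
    for read, res in results:           # the writer consumes this iterator
        print(read, sorted(res))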
def main(args):
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        exit(1)

    # ctc training data requires fixed chunk geometry
    if args.save_ctc:
        args.overlap = 900
        args.chunksize = 3600

    sys.stderr.write("> loading model\n")
    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)
        write_sam_header(aligner)
    else:
        aligner = None

    with open(summary_file(), 'w') as summary:
        write_summary_header(summary, alignment=aligner)

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    read_ids = column_to_set(args.read_ids)
    dtype = np.float16 if args.half else np.float32

    reader = ProcessIterator(
        get_reads(args.reads_directory, read_ids=read_ids, skip=args.skip),
        progress=True,
    )
    writer = ProcessPool(
        DecoderWriter, model=model, aligner=aligner,
        beamsize=args.beamsize, fastq=args.fastq,
    )
    ctc_writer = CTCWriter(
        model, aligner,
        min_coverage=args.ctc_min_coverage,
        min_accuracy=args.ctc_min_accuracy,
    )

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, ctc_writer, reader, torch.no_grad():
        while True:
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write(
                    "> skipping long read %s (%s samples)\n"
                    % (read.read_id, len(read.signal))
                )
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            # run the model on all chunks, then stitch the overlapping
            # outputs back into one posterior matrix per read
            posteriors_ = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors_, args.overlap // model.stride // 2)

            writer.queue.put((read, posteriors[:raw_data.shape[0]]))

            if args.save_ctc and len(raw_data) > args.chunksize:
                ctc_writer.queue.put((chunks.numpy(), posteriors_))

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")
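# A minimal sketch of the chunk/stitch round trip in the loop above. The
# model downsamples by `model.stride`, so neighbouring chunks overlap by
# overlap // stride output frames; stitch() trims half of that overlap from
# each side of every interior boundary before concatenating. These helpers
# are illustrative stand-ins, not bonito's exact implementation.
import numpy as np

def chunk(signal, chunksize, overlap):
    # Split a 1-D signal into overlapping windows (tail padding omitted
    # for brevity).
    hop = chunksize - overlap
    starts = range(0, len(signal) - chunksize + 1, hop)
    return np.stack([signal[s:s + chunksize] for s in starts])

def stitch(outputs, trim):
    # Drop `trim` frames from each side of every interior chunk boundary,
    # then concatenate along the time axis.
    if len(outputs) == 1:
        return outputs[0]
    return np.concatenate(
        [outputs[0][:-trim]]
        + [o[trim:-trim] for o in outputs[1:-1]]
        + [outputs[-1][trim:]]
    )

if __name__ == "__main__":
    sig = np.arange(9000, dtype=np.float32)
    windows = chunk(sig, chunksize=3600, overlap=900)  # 3 windows
    stitched = stitch(windows, trim=900 // 1 // 2)     # toy model, stride=1
    print(windows.shape, stitched.shape)               # (3, 3600) (9000,)
    assert np.array_equal(stitched, sig)               # exact reconstruction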