def main(args):
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        exit(1)

    sys.stderr.write("> loading model\n")
    model = load_model(args.model_directory, args.device, weights=int(args.weights))

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            exit(1)
    else:
        aligner = None

    reads = get_reads(
        args.reads_directory, n_proc=8, recursive=args.recursive,
        read_ids=column_to_set(args.read_ids), skip=args.skip,
    )

    basecall = load_symbol(args.model_directory, "basecall")

    if args.save_ctc:
        # Keep only reads long enough to chunk, then basecall the chunks and
        # write them out as CTC training data filtered by coverage/accuracy.
        reads = (
            chunk for read in reads if len(read.signal) >= 3600
            for chunk in read_chunks(read)
        )
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq, batchsize=64)
        writer = CTCWriter(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
            aligner, args.ctc_min_coverage, args.ctc_min_accuracy
        )
    else:
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq)
        writer = Writer(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
            aligner, fastq=args.fastq
        )

    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    num_samples = sum(num_samples for read_id, num_samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
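
# A minimal sketch of how this entry point might be wired up, assuming the
# flag names below match the attributes `main` reads off `args`. The flags
# and defaults here are illustrative guesses, not bonito's actual interface.
import argparse

def argparser():
    parser = argparse.ArgumentParser("basecaller (sketch)")
    parser.add_argument("model_directory")
    parser.add_argument("reads_directory")
    parser.add_argument("--reference", default=None)
    parser.add_argument("--read-ids", default=None)
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--weights", default="0")
    parser.add_argument("--skip", action="store_true")
    parser.add_argument("--fastq", action="store_true")
    parser.add_argument("--recursive", action="store_true")
    parser.add_argument("--save-ctc", action="store_true")
    parser.add_argument("--ctc-min-coverage", type=float, default=0.9)
    parser.add_argument("--ctc-min-accuracy", type=float, default=0.9)
    return parser

if __name__ == "__main__":
    main(argparser().parse_args())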
def main(args):
    init(args.seed, args.device)

    # Fetch the pretrained model on first use if it isn't already on disk.
    if args.model_directory in models and args.model_directory not in os.listdir(__models__):
        sys.stderr.write("> downloading model\n")
        File(__models__, models[args.model_directory]).download()

    sys.stderr.write(f"> loading model {args.model_directory}\n")
    try:
        model = load_model(
            args.model_directory,
            args.device,
            weights=int(args.weights),
            chunksize=args.chunksize,
            overlap=args.overlap,
            batchsize=args.batchsize,
            quantize=args.quantize,
            use_koi=True,
        )
    except FileNotFoundError:
        sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
        sys.stderr.write("> available models:\n")
        for name in sorted(models):
            sys.stderr.write(f" - {name}\n")
        exit(1)

    if args.verbose:
        sys.stderr.write(f"> model basecaller params: {model.config['basecaller']}\n")

    basecall = load_symbol(args.model_directory, "basecall")

    mods_model = None
    if args.modified_base_model is not None or args.modified_bases is not None:
        sys.stderr.write("> loading modified base model\n")
        mods_model = load_mods_model(
            args.modified_bases, args.model_directory, args.modified_base_model
        )
        sys.stderr.write(f"> {mods_model[1]['alphabet_str']}\n")

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map', best_n=1)
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            exit(1)
    else:
        aligner = None

    # Choose the output format based on whether a reference was supplied.
    fmt = biofmt(aligned=args.reference is not None)

    if args.reference and args.reference.endswith(".mmi") and fmt.name == "cram":
        sys.stderr.write("> error: reference cannot be a .mmi when outputting cram\n")
        exit(1)
    elif args.reference and fmt.name == "fastq":
        sys.stderr.write(f"> warning: did you really want {fmt.aligned} {fmt.name}?\n")
    else:
        sys.stderr.write(f"> outputting {fmt.aligned} {fmt.name}\n")

    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        exit(1)

    if fmt.name != 'fastq':
        groups = get_read_groups(
            args.reads_directory, args.model_directory,
            n_proc=8, recursive=args.recursive,
            read_ids=column_to_set(args.read_ids), skip=args.skip,
            cancel=process_cancel()
        )
    else:
        groups = []

    reads = get_reads(
        args.reads_directory, n_proc=8, recursive=args.recursive,
        read_ids=column_to_set(args.read_ids), skip=args.skip,
        cancel=process_cancel()
    )

    if args.max_reads:
        reads = take(reads, args.max_reads)

    if args.save_ctc:
        # Split each read into overlapping chunks sized to the model so the
        # basecalls can be emitted as CTC training data.
        reads = (
            chunk for read in reads
            for chunk in read_chunks(
                read,
                chunksize=model.config["basecaller"]["chunksize"],
                overlap=model.config["basecaller"]["overlap"]
            )
        )
        ResultsWriter = CTCWriter
    else:
        ResultsWriter = Writer

    results = basecall(
        model, reads, reverse=args.revcomp,
        batchsize=model.config["basecaller"]["batchsize"],
        chunksize=model.config["basecaller"]["chunksize"],
        overlap=model.config["basecaller"]["overlap"]
    )

    if mods_model is not None:
        results = process_itemmap(partial(call_mods, mods_model), results)
    if aligner:
        results = align_map(aligner, results, n_thread=os.cpu_count())

    writer = ResultsWriter(
        fmt.mode, tqdm(results, desc="> calling", unit=" reads", leave=False),
        aligner=aligner, group_key=args.model_directory,
        ref_fn=args.reference, groups=groups,
    )

    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    num_samples = sum(num_samples for read_id, num_samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
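
# Illustrative only: a self-contained sketch of the overlapped chunking that
# `read_chunks(read, chunksize=..., overlap=...)` performs above. The real
# helper carries read metadata along with each chunk; this version just
# yields raw signal windows, which is enough to see how the overlap gives
# the basecaller context to stitch chunk calls back together.
import numpy as np

def chunk_signal_sketch(signal, chunksize=4000, overlap=500):
    """Yield fixed-size windows over `signal`, each overlapping its
    predecessor by `overlap` samples."""
    if len(signal) < chunksize:
        return
    stride = chunksize - overlap
    for start in range(0, len(signal) - chunksize + 1, stride):
        yield signal[start:start + chunksize]

# A 10,000-sample read with chunksize=4000 and overlap=500 yields two full
# windows, starting at samples 0 and 3,500.
windows = list(chunk_signal_sketch(np.zeros(10_000)))
assert len(windows) == 2 and all(len(w) == 4000 for w in windows)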
def main(args):
    sys.stderr.write("> loading model\n")
    model = load_model(args.model, args.device)

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            exit(1)
    else:
        aligner = None

    if args.summary:
        # Pair template/complement strands using a sequencing summary file.
        sys.stderr.write("> finding follow on strands\n")
        pairs = pd.read_csv(args.summary, sep='\t', low_memory=False)
        pairs = pairs[pairs.sequence_length_template.gt(0)]

        if 'filename' in pairs.columns:
            pairs = pairs.rename(columns={'filename': 'filename_fast5'})
        if 'alignment_strand_coverage' in pairs.columns:
            pairs = pairs.rename(columns={'alignment_strand_coverage': 'alignment_coverage'})

        # Drop summary rows whose fast5 files are missing from the reads directory.
        valid_fast5s = [
            f for f in pairs.filename_fast5.unique()
            if (args.reads_directory / Path(f)).exists()
        ]
        pairs = pairs[pairs.filename_fast5.isin(valid_fast5s)]
        pairs = find_follow_on(pairs)
        sys.stderr.write("> found %s follow strands in summary\n" % (len(pairs) // 2))

        if args.max_reads > 0:
            pairs = pairs.head(args.max_reads)

        # Follow-on pairs are interleaved: even rows are templates, odd rows complements.
        temp_reads = pairs.iloc[0::2]
        comp_reads = pairs.iloc[1::2]
    else:
        # Pair reads from an explicit pairs file, resolving read ids to
        # fast5 files via a (possibly cached) read index.
        if args.index is not None:
            sys.stderr.write("> loading read index\n")
            index = json.load(open(args.index, 'r'))
        else:
            sys.stderr.write("> building read index\n")
            files = list(glob(os.path.join(args.reads_directory, '*.fast5')))
            index = build_index(files, n_proc=8)
            if args.save_index:
                with open('bonito-read-id.idx', 'w') as f:
                    json.dump(index, f)

        pairs = pd.read_csv(args.pairs, sep=args.sep, names=['read_1', 'read_2'])
        if args.max_reads > 0:
            pairs = pairs.head(args.max_reads)

        pairs['file_1'] = pairs['read_1'].apply(index.get)
        pairs['file_2'] = pairs['read_2'].apply(index.get)
        pairs = pairs.dropna().reset_index()

        temp_reads = pairs[['read_1', 'file_1']].rename(
            columns={'read_1': 'read_id', 'file_1': 'filename_fast5'}
        )
        comp_reads = pairs[['read_2', 'file_2']].rename(
            columns={'read_2': 'read_id', 'file_2': 'filename_fast5'}
        )

    if len(pairs) == 0:
        print("> no matched pairs found in given directory", file=sys.stderr)
        exit(1)

    # Construct a throwaway CudaPoaBatch with its output silenced to work around
    # https://github.com/clara-parabricks/GenomeWorks/issues/648
    with devnull():
        CudaPoaBatch(1000, 1000, 3724032)

    basecalls = call(model, args.reads_directory, temp_reads, comp_reads, aligner=aligner)
    writer = Writer(
        tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
        aligner, duplex=True
    )

    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    num_samples = sum(num_samples for read_id, num_samples in writer.log)

    print("> duration: %s" % timedelta(seconds=np.round(duration)), file=sys.stderr)
    print("> samples per second %.1E" % (num_samples / duration), file=sys.stderr)
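
# A self-contained sketch of the read index the `--pairs` path above relies
# on, mapping each read id to the fast5 file that contains it. This is an
# illustrative stand-in for `build_index`, assuming multi-read fast5 files
# whose top-level groups are named "read_<uuid>"; it is not bonito's
# implementation.
import json
import os
from glob import glob

import h5py

def build_read_index_sketch(reads_directory):
    """Return a {read_id: filename} mapping like the one `index.get`
    is applied to above."""
    index = {}
    for filename in glob(os.path.join(reads_directory, '*.fast5')):
        with h5py.File(filename, 'r') as f:
            for group in f:  # one top-level group per read in multi-read fast5
                index[group.removeprefix('read_')] = filename
    return index

# Persisted in the same json form that `--index` loads back in:
# with open('bonito-read-id.idx', 'w') as f:
#     json.dump(build_read_index_sketch('reads/'), f)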