def raw_chunkify_with_identity_main(args):
    """ Main function for `chunkify.py raw_identity` producing batch file for model training """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder, paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['chunk_len', 'kmer_len', 'min_length', 'trim',
                   'normalisation', 'downsample_factor', 'interpolation']
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(raw_chunk_worker, fast5_files, threads=args.jobs,
                       unordered=True,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker,
                       initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            (chunks, labels, bad_ev) = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
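# Illustration only: the blank-threshold computation at the end of
# raw_chunkify_with_identity_main, run on synthetic labels.
# `_demo_blank_threshold` is a hypothetical helper, not part of the module;
# it assumes numpy is imported as `np` (as elsewhere in this file) and that
# label value 0 marks a blank, as in the code above.
def _demo_blank_threshold(blanks_percentile=95):
    # Two made-up chunk x time label matrices; 0 == blank.
    label_list = [np.array([[0, 1, 0, 2],
                            [3, 0, 0, 0]]),
                  np.array([[1, 2, 3, 4]])]
    # Fraction of blank positions per chunk (axis 1 is the time axis):
    # -> [0.5, 0.75, 0.0]
    blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
    # The requested percentile of those fractions becomes the `blanks`
    # threshold stored as an attribute of the HDF5 file.
    return np.percentile(blanks_per_chunk, blanks_percentile)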
def main(argv):
    """ Main function: extract the reference sequence from each fast5 file and
    write the results to a single fasta file
    """
    args = parser.parse_args(argv[1:])

    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder, paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    i = 0
    kwarg_names = ['section']
    with open(args.output, 'w') as file_handle:
        for res in imap_mp(reference_extraction_worker, fast5_files,
                           threads=args.jobs, unordered=True,
                           fix_kwargs=util.get_kwargs(args, kwarg_names)):
            if res is not None:
                i = util.progress_report(i)
                file_name, reference = res
                header = '>{}\n'.format(
                    os.path.basename(os.path.splitext(file_name)[0]))
                file_handle.write(header)
                file_handle.write(reference + '\n')
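# For reference, each fasta record written above has the form
# ">basename\nsequence\n", with the basename taken from the fast5 path.
# A self-contained sketch (hypothetical path and sequence; stdlib only):
def _demo_fasta_record(file_name='/data/reads/read_42.fast5',
                       reference='ACGTACGT'):
    # splitext drops '.fast5', basename drops the directory -> 'read_42'
    name = os.path.basename(os.path.splitext(file_name)[0])
    return '>{}\n{}\n'.format(name, reference)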
def chunkify_with_identity_main(args):
    """ Main function for `chunkify.py identity` producing batch file for model training """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder, paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['section', 'chunk_len', 'kmer_len', 'min_length', 'trim',
                   'use_scaled', 'normalisation']
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(batch.chunk_worker, fast5_files, threads=args.jobs,
                       unordered=True,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker,
                       initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            (chunks, labels, bad_ev) = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'input_type': 'events',
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'scaled': args.use_scaled,
            'section': args.section,
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                         hdf5_attributes, chunk_list,
                                         label_list, bad_list)
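# The kwarg_names/util.get_kwargs pattern above (and in the other main
# functions) lifts a subset of the parsed command-line options into a plain
# dict that imap_mp then forwards to every worker call. A stdlib equivalent
# -- an assumption about get_kwargs' behaviour, not its actual source -- is:
def _get_kwargs_sketch(args, names):
    return {name: getattr(args, name) for name in names}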
def chunkify_with_remap_main(args):
    """ Main function for `chunkify.py remap` producing batch file for model training """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(
                args.output_strand_list))
            sys.exit(2)

    fast5_files = fast5.iterate_fast5(args.input_folder, paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)
    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length', 'prior',
                   'slip', 'chunk_len', 'use_scaled', 'normalisation',
                   'section', 'segmentation']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    if not os.path.isfile(args.output_strand_list):
        header_line = '\t'.join(['filename', 'nev', 'score', 'nstay',
                                 'seqlen', 'start', 'end']) + u'\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

    for res in imap_mp(batch.chunk_remap_worker, fast5_files,
                       threads=args.jobs, fix_kwargs=kwargs, unordered=True,
                       init=batch.init_chunk_remap_worker,
                       initargs=[compiled_file, args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            read, score, nev, path, seq, chunks, labels, bad_ev = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            strand_data = [read, nev, -score / nev,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]
            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Creating HDF5 file')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'input_type': 'events',
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'scaled': args.use_scaled,
            'section': args.section,
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                         hdf5_attributes, chunk_list,
                                         label_list, bad_list)
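# The `nstay` column written to the strand list above comes from
# np.sum(np.ediff1d(path, to_begin=1) == 0): a "stay" is an event whose
# remapped path position did not advance. A tiny worked example with a
# made-up path (hypothetical helper; assumes numpy is imported as `np`):
def _demo_stay_count():
    path = np.array([0, 0, 1, 1, 1, 2])
    # ediff1d(..., to_begin=1) -> [1, 0, 1, 0, 0, 1]; zeros mark stays.
    return np.sum(np.ediff1d(path, to_begin=1) == 0)  # -> 3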
def raw_chunkify_with_remap_main(args):
    """ Main function for `chunkify.py raw_remap` producing batch file for model training """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(
                args.output_strand_list))
            sys.exit(2)

    fast5_files = iterate_fast5(args.input_folder, paths=True,
                                limit=args.limit,
                                strand_list=args.input_strand_list)
    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length', 'prior',
                   'slip', 'chunk_len', 'normalisation', 'downsample_factor',
                   'interpolation', 'open_pore_fraction']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    if not os.path.isfile(args.output_strand_list):
        header_line = '\t'.join(['filename', 'nblocks', 'score', 'nstay',
                                 'seqlen', 'start', 'end']) + '\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

    for res in imap_mp(raw_chunk_remap_worker, fast5_files, threads=args.jobs,
                       fix_kwargs=kwargs, unordered=True,
                       init=batch.init_chunk_remap_worker,
                       initargs=[compiled_file, args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            read, score, nblocks, path, seq, chunks, labels, bad_ev = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            strand_data = [read, nblocks, -score / nblocks,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]
            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
# NOTE: the following block is a function body whose `def` line was missing
# from this excerpt; it is wrapped here as `basecall_main(args)`, a
# placeholder name for the restored enclosing function.
def basecall_main(args):
    basecall_worker = getattr(basecall, args.command + "_worker")

    # Event-based and raw basecalling take different per-read options.
    if args.command == "events":
        kwarg_names = ['section', 'segmentation', 'trim', 'kmer_len',
                       'transducer', 'bad', 'min_prob', 'skip', 'trans',
                       'alphabet']
    else:
        kwarg_names = ['trim', 'open_pore_fraction', 'kmer_len', 'transducer',
                       'bad', 'min_prob', 'skip', 'trans', 'alphabet']

    compiled_file = helpers.compile_model(args.model, args.compile)
    seq_printer = basecall.SeqPrinter(args.kmer_len,
                                      datatype=args.datatype,
                                      transducer=args.transducer,
                                      alphabet=args.alphabet.decode('ascii'))

    files = fast5.iterate_fast5(args.input_folder, paths=True,
                                limit=args.limit,
                                strand_list=args.input_strand_list)

    nbases = nevents = 0
    t0 = time.time()
    for res in imap_mp(basecall_worker, files, threads=args.jobs,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       unordered=True, init=basecall.init_worker,
                       initargs=[compiled_file]):
        if res is None:
            continue
        read, score, call, nev = res
        seq_len = seq_printer.write(read, score, call, nev)
        nbases += seq_len
        nevents += nev

    dt = time.time() - t0
    t = 'Called {} bases in {:.1f} s ({:.1f} bases/s or {:.1f} {}/s)\n'
    sys.stderr.write(t.format(nbases, dt, nbases / dt, nevents / dt,
                              args.datatype))

    if compiled_file != args.compile:
        os.remove(compiled_file)
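# imap_mp is used throughout as an unordered parallel map with optional
# per-process initialisation and fixed keyword arguments. A rough stdlib
# analogue -- a sketch of the calling convention, not the actual
# implementation -- built on multiprocessing.Pool:
import functools
import multiprocessing

def _imap_mp_sketch(function, iterable, threads=1, fix_kwargs=None,
                    unordered=True, init=None, initargs=()):
    worker = functools.partial(function, **(fix_kwargs or {}))
    if threads <= 1:
        # Serial fallback: run the initialiser in-process, then map directly.
        if init is not None:
            init(*initargs)
        for item in iterable:
            yield worker(item)
    else:
        with multiprocessing.Pool(threads, init, initargs) as pool:
            mapper = pool.imap_unordered if unordered else pool.imap
            for res in mapper(worker, iterable):
                yield res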