def _main(args):
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            "guppy logs.")
    logging.init_logger(args.guppy_logs_output_directory)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    args.outputs = [mh.PR_VAR_NAME]

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        LOGGER.info("Loading reference.")
        aligner = mappy.Aligner(
            str(args.reference), preset=str("map-ont"), best_n=1)
        process_all_reads(
            args.fast5s_dir,
            not args.not_recursive,
            args.num_reads,
            args.read_ids_filename,
            model_info,
            aligner,
            args.processes,
            args.output,
            args.suppress_progress,
            args.compute_false_reference_scores,
        )
def open_pyguppy_backend(args):
    args.do_not_use_guppy_server = False
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting guppy "
            "logs.")
    backend_params = backends.parse_backend_params(args)
    model_info = None
    try:
        model_info = backends.ModelInfo(backend_params, args.processes)
        # if spawning multiple workers run this inside newly spawned processes
        model_info.prep_model_worker()
        LOGGER.info(model_info.get_alphabet_str())
        LOGGER.info(
            "Model structure:\n\tStride: {}\n\tState size: {}".format(
                model_info.stride, model_info.output_size))
        # use model_info.iter_basecalled_reads to basecall reads and return
        # relevant signal anchored information.
        model_info.client.disconnect()
    finally:
        # ensure guppy server is closed in finally block
        if model_info is not None:
            model_info.close()
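# A minimal, hedged usage sketch for open_pyguppy_backend above. The
# Namespace is illustrative only: backends.parse_backend_params consumes
# additional guppy server options defined elsewhere, so a real invocation
# must supply those attributes as well.
def _example_open_backend():
    from argparse import Namespace

    example_args = Namespace(
        output_directory="guppy_logs",  # guppy log directory; created if absent
        processes=1,                    # number of basecalling worker processes
    )
    open_pyguppy_backend(example_args)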
def main():
    args = get_parser().parse_args()
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        sys.stderr.write(
            '***** WARNING ***** Guppy logs output directory exists. '
            'Potentially overwriting guppy logs.\n')

    sys.stderr.write('Loading model.\n')
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        sys.stderr.write('Loading reference.\n')
        aligner = mapping.alignerPlus(
            str(args.reference), preset=str('map-ont'), best_n=1)
        process_all_reads(
            args.fast5s_dir, args.num_reads, args.read_ids_filename,
            model_info, aligner, args.processes, args.output,
            args.suppress_progress, args.compute_false_reference_scores)
def _main(args):
    logging.init_logger(log_fn=args.log_filename, quiet=args.quiet)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            "guppy logs.")
    args = add_trim_guppy_none(args)
    args.outputs = [mh.PR_MOD_NAME]
    # make edge_buffer >= context_bases to simplify processing
    if args.edge_buffer < args.mod_context_bases:
        LOGGER.warning(
            "[--edge-buffer] less than [--mod-context-bases]. Setting "
            "[--edge-buffer] to value from [--mod-context-bases]")
        args.edge_buffer = args.mod_context_bases

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        check_map_sig_alphabet(model_info, args.mapped_signal_file)
        motifs = parse_motifs(args.motif, model_info, args.modified_bases_set)
        can_labs, mod_labs = extract_label_conversions(model_info)
        can_post_indices = model_info.can_indices.astype(np.uintp)
        all_mod_llrs, all_can_llrs = compute_diff_scores(
            args.mapped_signal_file,
            model_info,
            args.mod_context_bases,
            args.edge_buffer,
            args.num_reads,
            motifs,
            can_labs,
            mod_labs,
            can_post_indices,
        )

    mod_summary = [
        (
            mod,
            len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
            len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
        )
        for mod in set(all_mod_llrs).union(all_can_llrs)
    ]
    LOGGER.info(
        "Data summary:\n\tmod\tmod_N\tcan_N\n"
        + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
    output_mods_data(all_mod_llrs, all_can_llrs, args.out_filename)
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for mega_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(mega_dir))
        # full read only mode with no indices read into memory
        vars_db = variants.VarsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME),
            read_only=True, chrm_index_in_memory=False,
            alt_index_in_memory=False, uuid_index_in_memory=False)
        bar = tqdm(desc=mega_dir, total=vars_db.get_num_uniq_stats(),
                   smoothing=0, dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
                test_end, test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(
                chrm_id, test_start, test_end, pos, ref_seq, var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
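# Hedged sketch: the merged database written above can be read back with the
# same read-only VarsDb constructor and iter_data() iteration used for the
# input databases; the tuple layout matches the unpacking in _main above.
def _example_read_merged(out_dir):
    merged_db = variants.VarsDb(
        mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME), read_only=True)
    for (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
            test_end, test_start, chrm, chrm_len) in merged_db.iter_data():
        pass  # consume per-read variant statistics here
    merged_db.close()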
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(
        args.output_megalodon_results_dir, mh.PR_MOD_NAME)
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)
    in_mod_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs]

    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chromosomes\n\t2) '
        'modified base definitions\n\t3) read identifiers\n\t4) reference '
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn, args.data_batch_size,
            args.max_processes, args.force_uint32_pos_index,
            db_safety=args.database_safety)
    out_mods_db.db.commit()

    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Extracting mods and chrms from input databases')
    in_mod_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs
    ]
    alphabet, mod_long_names = extract_mods(in_mod_db_fns)
    ref_names_and_lens = extract_chrms(in_mod_db_fns)

    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(
        model_info, out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)
    # load uuids in memory in main out db only in single process mode.
    # else worker threads only have to load uuid lookup tables
    out_mods_db = mods.ModsDb(
        mods_info.mods_db_fn, read_only=False,
        in_mem_uuid_to_dbid=args.single_process)

    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db)
    # commit so read uuids are available to worker processes
    out_mods_db.commit()

    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, mods_info.mods_db_fn,
            args.data_batch_size, args.max_processes)
    out_mods_db.commit()

    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.commit()
    out_mods_db.close()
def _main(args):
    logging.init_logger(
        args.megalodon_directory, out_suffix=args.output_suffix)
    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    LOGGER.info('Extracting mods and chrms from input database')
    in_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME))
    alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
    ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
    LOGGER.info('Extracting read uuid table')
    in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]

    LOGGER.info('Opening new per-read modified base statistics databases')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    out_mods_dbs = []
    for motif_info in motifs:
        out_dir = '{}.{}_{}'.format(
            args.output_prefix, motif_info.raw_motif, motif_info.bases_before)
        mh.mkdir(out_dir, overwrite=False)
        mods_info = mods.ModInfo(model_info, out_dir=out_dir)
        mods.init_mods_db(mods_info, ref_names_and_lens)
        out_mods_dbs.append(
            (mods.ModsDb(mods_info.mods_db_fn, read_only=False), motif_info))
        out_mods_dbs[-1][0].insert_uuids(in_uuids)
        # commit so read uuids are available to worker processes
        out_mods_dbs[-1][0].commit()

    LOGGER.info('Inserting per-read calls from input database')
    # TODO do this in separate processes
    split_data(in_mods_db, out_mods_dbs, ref)

    LOGGER.info(
        'Creating data covering indices for efficient iteration by position')
    for out_mods_db, _ in out_mods_dbs:
        out_mods_db.create_data_covering_index()
        out_mods_db.commit()
        out_mods_db.close()
        LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))
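# Hedged illustration (not megalodon's actual split_data implementation): a
# per-position motif test against the indexed reference could use pysam's
# fetch as below. The exact-match comparison is an assumption; real motif
# handling may expand IUPAC ambiguity codes.
def _example_motif_match(ref, chrm, pos, motif_info):
    motif_start = pos - motif_info.bases_before
    seq = ref.fetch(
        chrm, motif_start, motif_start + len(motif_info.raw_motif)).upper()
    return seq == motif_info.raw_motif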
def _main(args):
    logging.init_logger()
    # set args that are not relevant to alphabet
    args.devices = None
    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory
    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting '
            'guppy logs.')

    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        if model_info.is_cat_mod:
            can_bs = [
                can_b
                for mod_b, _ in model_info.mod_long_names
                for can_b, can_mod_bs in model_info.can_base_mods.items()
                if mod_b in can_mod_bs
            ]
            LOGGER.info(
                'Model contains canonical alphabet {} and modified '
                'bases {}.'.format(
                    model_info.can_alphabet,
                    '; '.join(
                        '{}={} (alt to {})'.format(mod_b, mln, can_b)
                        for (mod_b, mln), can_b in zip(
                            model_info.mod_long_names, can_bs))))
        else:
            LOGGER.info('Model contains canonical alphabet {}.'.format(
                model_info.can_alphabet))
def _main(args):
    logging.init_logger(args.log_directory)
    # set args that are not relevant to alphabet
    args.devices = None
    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory
    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    # create the guppy log dir only after output_directory is set above
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting '
            'guppy logs.')

    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        LOGGER.info(model_info.get_alphabet_str())
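# Hedged note: each _main above is typically wired to a module-level entry
# point like the following (get_parser is assumed to be defined alongside,
# as in the main() variant earlier in this section).
if __name__ == '__main__':
    _main(get_parser().parse_args())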