Example #1
def _main(args):
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting " +
            "guppy logs.")
    logging.init_logger(args.guppy_logs_output_directory)
    # add attributes required for loading guppy that are not valid options
    # for this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    args.outputs = [mh.PR_VAR_NAME]

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        LOGGER.info("Loading reference.")
        aligner = mappy.Aligner(str(args.reference),
                                preset=str("map-ont"),
                                best_n=1)

        process_all_reads(
            args.fast5s_dir,
            not args.not_recursive,
            args.num_reads,
            args.read_ids_filename,
            model_info,
            aligner,
            args.processes,
            args.output,
            args.suppress_progress,
            args.compute_false_reference_scores,
        )
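
The aligner handed to process_all_reads above is a plain mappy (minimap2) index. A minimal standalone sketch of that piece, using mappy directly with a placeholder reference path and a made-up read sequence:

import mappy

# build an in-memory minimap2 index ("reference.fa" is a placeholder path)
aligner = mappy.Aligner("reference.fa", preset="map-ont", best_n=1)
if not aligner:
    # mappy returns a falsy object when the index cannot be built
    raise RuntimeError("failed to load/build index")

# map one read; best_n=1 keeps only the single best hit per read
for hit in aligner.map("ACGTACGTACGT" * 10):
    print(hit.ctg, hit.r_st, hit.r_en, "+" if hit.strand > 0 else "-", hit.mapq)
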
Example #2
def open_pyguppy_backend(args):
    args.do_not_use_guppy_server = False
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting guppy "
            "logs."
        )
    backend_params = backends.parse_backend_params(args)
    model_info = None
    try:
        model_info = backends.ModelInfo(backend_params, args.processes)
        # if spawning multiple workers, run this inside the newly spawned processes
        model_info.prep_model_worker()
        LOGGER.info(model_info.get_alphabet_str())
        LOGGER.info(
            "Model structure:\n\tStride: {}\n\tState size: {}".format(
                model_info.stride, model_info.output_size
            )
        )
        # use model_info.iter_basecalled_reads to basecall reads and return
        # relevant signal-anchored information.
        model_info.client.disconnect()
    finally:
        # ensure guppy server is closed in finally block
        if model_info is not None:
            model_info.close()
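
Other examples on this page load the backend with a with-statement instead of the explicit try/finally above. A minimal sketch of that equivalent form, assuming the same args object and the backends/LOGGER names used in this snippet:

backend_params = backends.parse_backend_params(args)
# the context manager closes the guppy server on exit, replacing the
# explicit model_info.close() call in the finally block above
with backends.ModelInfo(backend_params, args.processes) as model_info:
    model_info.prep_model_worker()
    LOGGER.info(model_info.get_alphabet_str())
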
Example #3
def main():
    args = get_parser().parse_args()
    # add attributes required for loading guppy that are not valid options
    # for this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        sys.stderr.write(
            '***** WARNING ***** Guppy logs output directory exists. ' +
            'Potentially overwriting guppy logs.\n')

    sys.stderr.write('Loading model.\n')
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        sys.stderr.write('Loading reference.\n')
        aligner = mapping.alignerPlus(str(args.reference),
                                      preset=str('map-ont'),
                                      best_n=1)

        process_all_reads(args.fast5s_dir, args.num_reads,
                          args.read_ids_filename, model_info, aligner,
                          args.processes, args.output, args.suppress_progress,
                          args.compute_false_reference_scores)
Example #4
def _main(args):
    logging.init_logger(log_fn=args.log_filename, quiet=args.quiet)
    # add attributes required for loading guppy that are not valid options
    # for this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            + "guppy logs."
        )
    args = add_trim_guppy_none(args)
    args.outputs = [mh.PR_MOD_NAME]
    # make edge_buffer >= context_bases to simplify processing
    if args.edge_buffer < args.mod_context_bases:
        LOGGER.warning(
            "[--edge-buffer] less than [--mod-context-bases]. Setting "
            + "[--edge-buffer] to value from [--mod-context-bases]"
        )
        args.edge_buffer = args.mod_context_bases

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        check_map_sig_alphabet(model_info, args.mapped_signal_file)
        motifs = parse_motifs(args.motif, model_info, args.modified_bases_set)
        can_labs, mod_labs = extract_label_conversions(model_info)
        can_post_indices = model_info.can_indices.astype(np.uintp)
        all_mod_llrs, all_can_llrs = compute_diff_scores(
            args.mapped_signal_file,
            model_info,
            args.mod_context_bases,
            args.edge_buffer,
            args.num_reads,
            motifs,
            can_labs,
            mod_labs,
            can_post_indices,
        )

    mod_summary = [
        (
            mod,
            len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
            len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
        )
        for mod in set(all_mod_llrs).union(all_can_llrs)
    ]
    LOGGER.info(
        "Data summary:\n\tmod\tmod_N\tcan_N\n"
        + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary)
    )
    output_mods_data(all_mod_llrs, all_can_llrs, args.out_filename)
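
The summary block above builds one row per modified base, counting how many modified and canonical per-read scores were collected. A self-contained sketch with made-up score dictionaries shows the resulting log layout:

# toy stand-ins for all_mod_llrs / all_can_llrs (values are made up)
all_mod_llrs = {"m": [1.2, -0.3, 2.1]}
all_can_llrs = {"m": [-1.0, -2.2], "h": [-0.5]}
mod_summary = [
    (mod,
     len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
     len(all_can_llrs[mod]) if mod in all_can_llrs else 0)
    for mod in set(all_mod_llrs).union(all_can_llrs)
]
print("Data summary:\n\tmod\tmod_N\tcan_N\n"
      + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
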
Example #5
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for mega_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(mega_dir))
        # fully read-only mode with no indices loaded into memory
        vars_db = variants.VarsDb(mh.get_megalodon_fn(mega_dir,
                                                      mh.PR_VAR_NAME),
                                  read_only=True,
                                  chrm_index_in_memory=False,
                                  alt_index_in_memory=False,
                                  uuid_index_in_memory=False)
        bar = tqdm(desc=mega_dir,
                   total=vars_db.get_num_uniq_stats(),
                   smoothing=0,
                   dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name, test_end,
             test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(chrm_id, test_start,
                                                      test_end, pos, ref_seq,
                                                      var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
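
The per-directory progress bar above follows the usual tqdm pattern of a known total plus manual update() calls. A self-contained sketch of that pattern, with a dummy range standing in for vars_db.iter_data():

from tqdm import tqdm

records = range(1000)  # stands in for vars_db.iter_data()
bar = tqdm(desc="input_dir", total=len(records), smoothing=0,
           dynamic_ncols=True)
for _ in records:
    bar.update()  # one tick per statistic copied into the output database
bar.close()
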
Example #6
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(args.output_megalodon_results_dir,
                                         mh.PR_MOD_NAME)
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)

    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]

    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chromosomes\n\t2) ' +
        'modified base definitions\n\t3) read identifiers\n\t4) reference ' +
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn, args.data_batch_size,
            args.max_processes, args.force_uint32_pos_index,
            db_safety=args.database_safety)
    out_mods_db.db.commit()

    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
Example #7
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Extracting mods and chrms from input databases')
    in_mod_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs
    ]
    alphabet, mod_long_names = extract_mods(in_mod_db_fns)
    ref_names_and_lens = extract_chrms(in_mod_db_fns)

    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(alphabet=alphabet,
                                            mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(model_info,
                             out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)

    # load uuids into memory in the main output db only in single-process
    # mode; otherwise workers only have to load the uuid lookup tables
    out_mods_db = mods.ModsDb(mods_info.mods_db_fn,
                              read_only=False,
                              in_mem_uuid_to_dbid=args.single_process)

    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db)
    # commit so read uuids are available to worker processes
    out_mods_db.commit()
    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(in_mod_db_fns, out_mods_db, mods_info.mods_db_fn,
                       args.data_batch_size, args.max_processes)
    out_mods_db.commit()

    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.commit()
    out_mods_db.close()
Example #8
def _main(args):
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)

    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    LOGGER.info('Extracting mods and chrms from input database')
    in_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME))
    alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
    ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
    LOGGER.info('Extracting read uuid table')
    in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]

    LOGGER.info('Opening new per-read modified base statistics databases')
    model_info = backends.DetachedModelInfo(alphabet=alphabet,
                                            mod_long_names=mod_long_names)
    out_mods_dbs = []
    for motif_info in motifs:
        out_dir = '{}.{}_{}'.format(args.output_prefix, motif_info.raw_motif,
                                    motif_info.bases_before)
        mh.mkdir(out_dir, overwrite=False)
        mods_info = mods.ModInfo(model_info, out_dir=out_dir)
        mods.init_mods_db(mods_info, ref_names_and_lens)
        out_mods_dbs.append((mods.ModsDb(mods_info.mods_db_fn,
                                         read_only=False), motif_info))
        out_mods_dbs[-1][0].insert_uuids(in_uuids)
        out_mods_dbs[-1][0].commit()

    # commit so read uuids are available to worker processes
    LOGGER.info('Inserting per-read calls from input databases')
    split_data(in_mods_db, out_mods_dbs, ref)

    # TODO do this in separate processes
    LOGGER.info(
        'Creating data covering indices for efficient iteration by position')
    for out_mods_db, _ in out_mods_dbs:
        out_mods_db.create_data_covering_index()
        out_mods_db.commit()
        out_mods_db.close()
        LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))
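
pysam.FastaFile above gives random access to the indexed reference that split_data uses for sequence context. A minimal standalone sketch of that access pattern, with a placeholder path to an indexed FASTA (pysam expects a .fai index alongside it):

import pysam

ref = pysam.FastaFile("reference.fa")  # placeholder path
print(ref.references[:3], ref.lengths[:3])
# fetch the first 50 bases of the first contig, e.g. to inspect
# the sequence context around a called position
print(ref.fetch(ref.references[0], 0, 50))
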
Example #9
def _main(args):
    logging.init_logger()
    # set args that are not relevant to alphabet
    args.devices = None

    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory

    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting ' +
            'guppy logs.')
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        if model_info.is_cat_mod:
            can_bs = [
                can_b for mod_b, _ in model_info.mod_long_names
                for can_b, can_mod_bs in model_info.can_base_mods.items()
                if mod_b in can_mod_bs
            ]
            LOGGER.info(
                ('Model contains canonical alphabet {} and modified ' +
                 'bases {}.').format(
                     model_info.can_alphabet,
                     '; '.join('{}={} (alt to {})'.format(mod_b, mln, can_b)
                               for (mod_b, mln), can_b in zip(
                                   model_info.mod_long_names, can_bs))))
        else:
            LOGGER.info('Model contains canonical alphabet {}.'.format(
                model_info.can_alphabet))
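
The nested comprehension above pairs each modified base with the canonical base whose can_base_mods entry contains it. A toy, self-contained sketch with made-up alphabet data in place of the model_info attributes:

# made-up stand-ins for model_info.mod_long_names / model_info.can_base_mods
mod_long_names = [("m", "5mC"), ("h", "5hmC"), ("a", "6mA")]
can_base_mods = {"C": "mh", "A": "a"}
can_bs = [
    can_b for mod_b, _ in mod_long_names
    for can_b, can_mod_bs in can_base_mods.items()
    if mod_b in can_mod_bs
]
print("; ".join("{}={} (alt to {})".format(mod_b, mln, can_b)
                for (mod_b, mln), can_b in zip(mod_long_names, can_bs)))
# -> m=5mC (alt to C); h=5hmC (alt to C); a=6mA (alt to A)
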
def _main(args):
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting ' +
            'guppy logs.')
    logging.init_logger(args.log_directory)
    # set args that are not relevant to alphabet
    args.devices = None

    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory

    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        LOGGER.info(model_info.get_alphabet_str())