def _main(args):
    """Validate thresholds then compute per-site modified base thresholds.

    Checks the megalodon per-read mod database against the ground truth BED
    for consistent contigs, then dispatches batch processing.
    """
    logging.init_logger(log_fn=args.log_filename)
    # Coverage minimums below 1 are meaningless; clamp each to 1 and warn.
    for cov_attr in ("ground_truth_cov_min", "nanopore_cov_min"):
        if getattr(args, cov_attr) < 1:
            flag = "--" + cov_attr.replace("_", "-")
            LOGGER.warning(flag + " must be 1 or greater. Setting to 1.")
            setattr(args, cov_attr, 1)

    LOGGER.info("Checking for consistent contig names")
    mod_db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    check_matching_attrs(
        args.ground_truth_bed, args.strand_offset, mod_db_fn, args.mod_bases
    )

    LOGGER.info("Processing batches")
    process_all_batches(
        args.processes,
        args.batch_size,
        args.ground_truth_bed,
        args.out_low_coverage_sites,
        args.out_per_site_mod_thresholds,
        mod_db_fn,
        args.strand_offset,
        args.ground_truth_cov_min,
        args.nanopore_cov_min,
        args.mod_bases,
        args.valid_sites,
    )
def _main(args):
    """Compare methylation between two bedMethyl samples and plot results.

    Parses both samples' bedMethyl files, computes filtered per-site
    modified percentages, and writes a heatmap plus coverage plots to
    ``args.out_pdf``. Text summaries go to stdout or ``args.out_filename``.
    """
    logging.init_logger()
    pdf_fp = PdfPages(args.out_pdf)
    out_fp = (sys.stdout if args.out_filename is None else open(
        args.out_filename, 'w'))

    # Fix: close the PDF and output file even when processing raises, so a
    # partial run does not leak file handles / leave an unflushed PDF.
    try:
        (samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov, samp2_mod_cov,
         samp2_all_cov, valid_pos) = parse_inputs(args.sample1_bed_methyl_files,
                                                  args.sample2_bed_methyl_files,
                                                  args.strand_offset,
                                                  args.sample_names,
                                                  args.valid_positions, out_fp)
        (samp1_mod_pct, samp2_mod_pct, samp1_valid_cov,
         samp2_valid_cov) = compute_filt_mod_pct(samp1_cov, samp1_mod_cov,
                                                 samp2_cov, samp2_mod_cov,
                                                 valid_pos,
                                                 args.coverage_threshold,
                                                 args.sample_names, out_fp)
        plot_hm(samp1_mod_pct, samp2_mod_pct, args.heatmap_num_bins,
                args.sample_names, out_fp, pdf_fp)
        plot_cov(samp1_all_cov, samp2_all_cov, samp1_valid_cov,
                 samp2_valid_cov, args.sample_names, pdf_fp)
    finally:
        pdf_fp.close()
        # Never close stdout; only close a file we opened ourselves.
        if out_fp is not sys.stdout:
            out_fp.close()
# Ejemplo n.º 3
# 0
def _main(args):
    """Compute per-read modified vs canonical LLR scores from mapped signal.

    Loads a guppy-backed model, computes difference scores over the mapped
    signal file, logs a per-mod count summary, and writes the results out.
    """
    logging.init_logger(log_fn=args.log_filename, quiet=args.quiet)
    # These attributes are required to load guppy, but are not valid
    # options for this script, so set them programmatically.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            "guppy logs."
        )
    args = add_trim_guppy_none(args)
    args.outputs = [mh.PR_MOD_NAME]
    # Keep edge_buffer >= context_bases so downstream processing is simpler.
    if args.edge_buffer < args.mod_context_bases:
        LOGGER.warning(
            "[--edge-buffer] less than [--mod-context-bases]. Setting "
            "[--edge-buffer] to value from [--mod-context-bases]"
        )
        args.edge_buffer = args.mod_context_bases

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        check_map_sig_alphabet(model_info, args.mapped_signal_file)
        motifs = parse_motifs(args.motif, model_info, args.modified_bases_set)
        can_labs, mod_labs = extract_label_conversions(model_info)
        can_post_indices = model_info.can_indices.astype(np.uintp)
        all_mod_llrs, all_can_llrs = compute_diff_scores(
            args.mapped_signal_file, model_info, args.mod_context_bases,
            args.edge_buffer, args.num_reads, motifs, can_labs, mod_labs,
            can_post_indices)

    # Summarize the number of modified and canonical scores per mod base.
    mod_summary = []
    for mod in set(all_mod_llrs).union(all_can_llrs):
        num_mod = len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0
        num_can = len(all_can_llrs[mod]) if mod in all_can_llrs else 0
        mod_summary.append((mod, num_mod, num_can))
    summary_body = "\n".join(
        "\t" + "\t".join(map(str, row)) for row in mod_summary)
    LOGGER.info("Data summary:\n\tmod\tmod_N\tcan_N\n" + summary_body)
    output_mods_data(all_mod_llrs, all_can_llrs, args.out_filename)
def _main(args):
    """Merge diploid-called variants with haplotype-specific variant calls.

    Iterates matched variant records across all three VCFs per contig and
    writes a combined VCF, then indexes the output.
    """
    logging.init_logger()
    LOGGER.info("Opening VCF files.")
    source_vars = pysam.VariantFile(args.diploid_called_variants)
    h1_vars = pysam.VariantFile(args.haplotype1_variants)
    h2_vars = pysam.VariantFile(args.haplotype2_variants)
    try:
        contig_names = list(source_vars.header.contigs.keys())
        # fetch() raises ValueError when no index is present; probe each
        # input once up front so the failure is reported early.
        source_vars.fetch(contig_names[0])
        h1_vars.fetch(next(iter(h1_vars.header.contigs.keys())))
        h2_vars.fetch(next(iter(h2_vars.header.contigs.keys())))
    except ValueError:
        raise mh.MegaError(
            "Variant files must be indexed. Use bgzip and tabix.")

    LOGGER.info("Processing variants.")
    out_vars = open(args.out_vcf, "w")
    contig_lines = "\n".join(
        CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
        for ctg in source_vars.header.contigs.values())
    out_vars.write(HEADER.format(contig_lines))
    bar = tqdm(
        total=len(contig_names), smoothing=0, unit=" contigs",
        dynamic_ncols=True, desc="Variant Processing", mininterval=0)
    for contig in contig_names:
        matched_var_iter = iter_contig_vars(
            get_contig_iter(source_vars, contig),
            get_contig_iter(h1_vars, contig),
            get_contig_iter(h2_vars, contig),
            contig, bar, args.force_invalid_variant_processing)
        for s_rec, h1_rec, h2_rec in tqdm(
                matched_var_iter, smoothing=0, unit=" variants",
                dynamic_ncols=True, leave=False,
                desc="{} Variants".format(contig)):
            write_var(s_rec, h1_rec, h2_rec, out_vars, contig)
        bar.update(1)

    out_vars.close()

    variants.index_variants(args.out_vcf)
def _main(args):
    """Extract modified and canonical per-read LLR statistics.

    Ground truth comes from either a --ground-truth-data file (per-site mod
    and canonical positions) or a control megalodon results directory that
    contains only canonical bases; one of the two must be supplied.
    """
    logging.init_logger(quiet=args.quiet)

    if (args.ground_truth_data is None
            and args.control_megalodon_results_dir is None):
        LOGGER.error(
            "Must provide either --control-megalodon-results-dir or " +
            "--ground-truth-data")
        # Fix: exit non-zero so shell pipelines detect the failure
        # (a bare sys.exit() returns status 0).
        sys.exit(1)

    db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    if args.ground_truth_data is not None:
        LOGGER.info("Parsing ground truth data")
        gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file(
            args.ground_truth_data, include_strand=args.strand_specific_sites)
        LOGGER.info(
            ("Loaded ground truth data with {} modified sites and {} " +
             "canonical sites.").format(len(gt_mod_pos), len(gt_can_pos)))
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "database.")
        all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites(
            db_fn,
            [gt_mod_pos, gt_can_pos],
            include_strand=args.strand_specific_sites,
        )
    else:
        # No per-site ground truth: treat every site in the main results as
        # modified and every site in the control results as canonical.
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "database")
        all_mod_llrs = mods.extract_all_stats(db_fn)
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "canonical sample database")
        all_can_llrs = mods.extract_all_stats(
            mh.get_megalodon_fn(args.control_megalodon_results_dir,
                                mh.PR_MOD_NAME))

    # Per-mod counts of extracted modified and canonical statistics.
    mod_summary = [(
        mod,
        len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
        len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
    ) for mod in set(all_mod_llrs).union(all_can_llrs)]
    LOGGER.info("Data summary:\n\tmod\tmod_N\tcan_N\n" +
                "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
    output_mods_data(
        all_mod_llrs,
        all_can_llrs,
        args.modified_bases_set,
        args.exclude_modified_bases,
        args.out_filename,
    )
def _main(args):
    """Merge multiple modified-base calibration files into a single file.

    Files are processed from last to first so that calibrations from files
    listed earlier on the command line take precedence (overwrite) those
    from later files. All inputs must share the same stratification type
    and number of smoothing values.
    """
    logging.init_logger()
    mh.prep_out_fn(args.out_filename, args.overwrite)

    LOGGER.info("Processing {}".format(
        args.modified_base_calibration_files[-1]))
    calib_data = np.load(args.modified_base_calibration_files[-1])
    stratify_type = str(calib_data[calibration.MOD_STRAT_TYPE_TXT])
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement.
    num_calib_vals = int(calib_data[calibration.SMOOTH_NVALS_TXT])
    mod_calibs = {}
    for mod_base in calib_data[calibration.MOD_BASES_TXT]:
        LOGGER.info("\tUsing {} calibration".format(mod_base))
        mod_calibs[mod_base] = (
            calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
            calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
        )
    # Walk the remaining files in reverse command-line order.
    for mod_calib_fn in args.modified_base_calibration_files[-2::-1]:
        LOGGER.info("Processing {}".format(mod_calib_fn))
        calib_data = np.load(mod_calib_fn)
        # Fix: raise explicit errors instead of assert, which is silently
        # stripped when Python runs with -O.
        if stratify_type != str(calib_data[calibration.MOD_STRAT_TYPE_TXT]):
            raise mh.MegaError(
                "Stratification type does not match between calibration "
                "files.")
        if num_calib_vals != int(calib_data[calibration.SMOOTH_NVALS_TXT]):
            raise mh.MegaError(
                "Number of calibration values does not match between "
                "calibration files.")
        for mod_base in calib_data[calibration.MOD_BASES_TXT]:
            # overwrite calibration data with files passed earlier
            if mod_base in mod_calibs:
                LOGGER.info("\tOverwriting {} calibration".format(mod_base))
            else:
                LOGGER.info("\tUsing {} calibration".format(mod_base))
            mod_calibs[mod_base] = (
                calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
                calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
            )

    # Flatten the per-mod (llr_range, table) pairs into savez keyword args.
    save_kwargs = {}
    for mod_base, (mod_llr_range, mod_calib) in mod_calibs.items():
        save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range
        save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib

    # save calibration table for reading into mod calibration table
    LOGGER.info("Saving calibrations to file.")
    mod_bases = list(mod_calibs.keys())
    np.savez(
        args.out_filename,
        stratify_type=stratify_type,
        smooth_nvals=num_calib_vals,
        mod_bases=mod_bases,
        **save_kwargs,
    )