Example #1
0
def test_annotate_with_probe_mapping_clustered_snps_and_indels():
    """End-to-end run of annotate_vcf_with_probe_mapping() on the
    "clustered_snp_indel" test data.

    The same input VCF is annotated twice: once against the truth FASTA and
    once against the ".revcomp" truth FASTA. Both output VCF files must be
    byte-identical to the single expected VCF file.
    """
    ref_fa = os.path.join(data_dir, "clustered_snp_indel.ref.fa")
    input_vcf = os.path.join(data_dir, "clustered_snp_indel.in.vcf")
    truth_fa = os.path.join(data_dir, "clustered_snp_indel.truth.fa")
    truth_revcomp_fa = os.path.join(
        data_dir, "clustered_snp_indel.truth.revcomp.fa"
    )
    expected_vcf = os.path.join(data_dir, "clustered_snp_indel.expect.vcf")

    out_vcf = "tmp.probe_mapping.clustered_snp_indel.vcf"
    out_vcf_revcomp = f"{out_vcf}.revcomp"
    out_map = "tmp.probe_mapping.clustered_snp_indel.map"
    scratch_files = (out_vcf, out_vcf_revcomp, out_map)

    # Start from a clean slate in case an earlier run left files behind.
    clean_files(scratch_files)

    # Both runs write their debug map to the same file.
    runs = ((truth_fa, out_vcf), (truth_revcomp_fa, out_vcf_revcomp))
    for truth, annotated_vcf in runs:
        probe_mapping.annotate_vcf_with_probe_mapping(
            input_vcf,
            ref_fa,
            truth,
            100,
            annotated_vcf,
            map_outfile=out_map,
        )

    for annotated_vcf in (out_vcf, out_vcf_revcomp):
        assert filecmp.cmp(annotated_vcf, expected_vcf, shallow=False)

    clean_files(scratch_files)
Example #2
0
def get_recall(
    ref_fasta,
    vcf_to_test,
    outdir,
    flank_length,
    truth_fasta=None,
    truth_vcf=None,
    debug=False,
    truth_mask=None,
    max_ref_len=None,
):
    """Probe-map a truth VCF against the reference with the test calls applied.

    Exactly one of truth_fasta and truth_vcf must be given (enforced with
    assertions). When only truth_fasta is given, a truth VCF is first made
    in <outdir>/truth_vcf with truth_variant_finding.make_truth_vcf().

    Two runs are made, each in its own subdirectory of outdir: "ALL" applies
    the variants from vcf_to_test with pass_only=False, "FILT" with
    pass_only=True. In each run the truth VCF is annotated by probe mapping
    to the resulting mutated genome.

    outdir is created with os.mkdir(), so it must not already exist.

    Returns a tuple (annotated VCF path for ALL, annotated VCF path for FILT).
    """
    os.mkdir(outdir)

    if truth_vcf is not None:
        assert truth_fasta is None
    else:
        assert truth_fasta is not None
        # The truth VCF is built from ref_fasta and truth_fasta alone, not
        # from vcf_to_test, so it does not matter whether we're using all
        # records of vcf_to_test or PASS records only: one truth VCF serves
        # both runs below.
        truth_vcf = truth_variant_finding.make_truth_vcf(
            ref_fasta,
            truth_fasta,
            os.path.join(outdir, "truth_vcf"),
            flank_length,
            debug=debug,
            truth_mask=truth_mask,
            max_ref_len=max_ref_len,
        )

    annotated_vcfs = {}
    for label, pass_only in (("ALL", False), ("FILT", True)):
        subdir = os.path.join(outdir, label)
        os.mkdir(subdir)

        genome_with_calls = os.path.join(subdir, "00.ref_with_mutations_added.fa")
        apply_variants_to_genome(
            ref_fasta, vcf_to_test, genome_with_calls, pass_only=pass_only
        )

        # The debug map file is only requested when debugging.
        debug_map = None
        if debug:
            debug_map = os.path.join(subdir, "02.probe_map_debug.txt")

        # For each truth VCF record, a probe is made and mapped to the
        # mutated genome.
        annotated = os.path.join(
            subdir, "02.truth.probe_mapped_to_mutated_genome.vcf"
        )
        annotated_vcfs[label] = annotated
        probe_mapping.annotate_vcf_with_probe_mapping(
            truth_vcf,
            ref_fasta,
            genome_with_calls,
            flank_length,
            annotated,
            map_outfile=debug_map,
        )

    return annotated_vcfs["ALL"], annotated_vcfs["FILT"]
Example #3
0
def test_annotate_vcf_with_probe_mapping():
    """End-to-end test of annotate_vcf_with_probe_mapping().

    The input files come from tests/data/probe_mapping/make_test_data.py,
    which makes a VCF file with its matching ref FASTA, plus a truth
    reference FASTA.

    The test data aims to be as comprehensive as reasonably possible: TPs
    must be called correctly and so must FPs, particularly at the positions
    flanking the true variants, where off-by-one errors or unexpected
    minimap2/mappy mapping behaviour would show up.

    The run is repeated against the reverse complement of the truth genome.
    This matters because all probes then map to the reverse strand, which is
    a potential source of bugs. One variant comes out slightly different in
    that run because of how minimap aligns the ref to the probe (nothing can
    be done about that, it is just how alignments work), so the
    reverse-complement output is compared with its own expected file.
    """
    ref_fa = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.ref.fa")
    input_vcf = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.in.vcf")
    truth_fa = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.truth.fa")
    truth_revcomp_fa = os.path.join(
        data_dir, "annotate_vcf_with_probe_mapping.truth.revcomp.fa"
    )
    expected_fwd = os.path.join(
        data_dir, "annotate_vcf_with_probe_mapping.expect.vcf"
    )
    expected_rev = os.path.join(
        data_dir, "annotate_vcf_with_probe_mapping.expect.rev.vcf"
    )

    out_vcf = "tmp.probe_mapping.annotate_vcf_with_probe_mapping.vcf"
    out_vcf_revcomp = f"{out_vcf}.revcomp"
    out_map = "tmp.probe_mapping.annotate_vcf_with_probe_mapping.map"
    scratch_files = (out_vcf, out_vcf_revcomp, out_map)
    clean_files(scratch_files)

    mask = {"truth": {80, 81, 82}}

    # Forward-strand run; only this one writes the debug map file.
    probe_mapping.annotate_vcf_with_probe_mapping(
        input_vcf,
        ref_fa,
        truth_fa,
        100,
        out_vcf,
        map_outfile=out_map,
        use_fail_conflict=True,
        truth_mask=mask,
    )
    # Reverse-complement run.
    probe_mapping.annotate_vcf_with_probe_mapping(
        input_vcf,
        ref_fa,
        truth_revcomp_fa,
        100,
        out_vcf_revcomp,
        use_fail_conflict=True,
        truth_mask=mask,
    )

    comparisons = ((out_vcf, expected_fwd), (out_vcf_revcomp, expected_rev))
    for got, expected in comparisons:
        assert filecmp.cmp(got, expected, shallow=False)

    clean_files(scratch_files)
Example #4
0
def make_truth_vcf(
    ref_fasta,
    truth_fasta,
    outdir,
    flank_length,
    debug=False,
    truth_mask=None,
    max_ref_len=None,
    snps_only=False,
    output_probes=False,
    detailed_VCF=False,
):
    """Build a truth VCF from a reference FASTA and a truth FASTA.

    Steps, all writing into outdir (created here with os.mkdir(), so it must
    not already exist):

    1. Check the required external dependencies are available.
    2. Call variants with dnadiff and with minimap2/paftools.
    3. Combine the two call sets into one VCF (de-duplicating instead of
       merging when snps_only is true).
    4. Annotate the combined VCF by probe mapping against truth_fasta, to
       remove incorrect calls.
    5. Filter the probe-mapped VCF (FPs and long variants, judging by the
       helper's name; max_ref_len and detailed_VCF are passed through).
    6. Use bcftools to normalise and remove duplicates.

    When debug is true, a probe mapping debug file "02.debug.map" is also
    requested.

    Returns the path of the final truth VCF, <outdir>/04.truth.vcf.
    """
    _check_dependencies_in_path()
    os.mkdir(outdir)

    # Files are numbered by the pipeline stage that produces them.
    dnadiff_vcf = os.path.join(outdir, "00.dnadiff.vcf")
    minimap2_vcf = os.path.join(outdir, "00.minimap2.vcf")
    merged_vcf = os.path.join(outdir, "01.merged.vcf")
    map_debug_file = os.path.join(outdir, "02.debug.map") if debug else None
    probe_mapped_vcf = os.path.join(outdir, "02.merged_and_probe_mapped.vcf")
    probe_filtered_vcf = os.path.join(outdir, "03.probe_filtered.vcf")
    truth_vcf = os.path.join(outdir, "04.truth.vcf")

    dnadiff.make_truth_vcf(
        ref_fasta, truth_fasta, dnadiff_vcf, snps_only=snps_only, debug=debug
    )
    _truth_using_minimap2_paftools(
        ref_fasta, truth_fasta, minimap2_vcf, snps_only=snps_only
    )

    combine = (
        _deduplicate_vcf_files_for_probe_mapping
        if snps_only
        else _merge_vcf_files_for_probe_mapping
    )
    combine([dnadiff_vcf, minimap2_vcf], ref_fasta, merged_vcf)
    logging.info(f"Made merged VCF file {merged_vcf}")

    logging.info("Probe mapping to remove incorrect calls")
    probe_mapping.annotate_vcf_with_probe_mapping(
        merged_vcf,
        ref_fasta,
        truth_fasta,
        flank_length,
        probe_mapped_vcf,
        map_outfile=map_debug_file,
        truth_mask=truth_mask,
        output_probes=output_probes,
    )

    _filter_fps_and_long_vars_from_probe_mapped_vcf(
        probe_mapped_vcf,
        probe_filtered_vcf,
        max_ref_len,
        detailed_VCF=detailed_VCF,
    )
    logging.info(f"Made filtered VCF file {probe_filtered_vcf}")

    logging.info("Using bcftools to normalise and remove duplicates")
    _bcftools_norm(ref_fasta, probe_filtered_vcf, truth_vcf)
    logging.info(f"Finished making truth VCF file {truth_vcf}")
    return truth_vcf
Example #5
0
def test_annotate_vcf_with_probe_mapping():
    """End-to-end test of annotate_vcf_with_probe_mapping().

    The input files come from tests/data/probe_mapping/make_test_data.py,
    which makes a VCF file with its matching ref FASTA, plus a truth
    reference FASTA.

    The run is repeated against the reverse complement of the truth genome
    and must give exactly the same output. This matters because all probes
    then map to the reverse strand, which is a potential source of bugs.
    """
    ref_fa = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.ref.fa")
    input_vcf = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.in.vcf")
    truth_fa = os.path.join(data_dir, "annotate_vcf_with_probe_mapping.truth.fa")
    truth_revcomp_fa = os.path.join(
        data_dir, "annotate_vcf_with_probe_mapping.truth.revcomp.fa"
    )
    expected_vcf = os.path.join(
        data_dir, "annotate_vcf_with_probe_mapping.expect.vcf"
    )

    out_vcf = "tmp.probe_mapping.annotate_vcf_with_probe_mapping.vcf"
    out_vcf_revcomp = f"{out_vcf}.revcomp"
    out_map = "tmp.probe_mapping.annotate_vcf_with_probe_mapping.map"
    scratch_files = (out_vcf, out_vcf_revcomp, out_map)
    clean_files(scratch_files)

    mask = {"truth": {80, 81, 82}}

    # Forward-strand run; only this one writes the debug map file.
    probe_mapping.annotate_vcf_with_probe_mapping(
        input_vcf,
        ref_fa,
        truth_fa,
        100,
        out_vcf,
        map_outfile=out_map,
        use_fail_conflict=True,
        truth_mask=mask,
    )
    # Reverse-complement run.
    probe_mapping.annotate_vcf_with_probe_mapping(
        input_vcf,
        ref_fa,
        truth_revcomp_fa,
        100,
        out_vcf_revcomp,
        use_fail_conflict=True,
        truth_mask=mask,
    )

    for got in (out_vcf, out_vcf_revcomp):
        assert filecmp.cmp(got, expected_vcf, shallow=False)

    clean_files(scratch_files)
Example #6
0
def get_recall(
    ref_fasta,
    vcf_to_test,
    outdir,
    flank_length,
    truth_fasta=None,
    truth_vcf=None,
    debug=False,
    truth_mask=None,
    max_ref_len=None,
):
    """Probe-map a truth VCF against the reference with the test calls applied.

    Exactly one of truth_fasta and truth_vcf must be given (enforced with
    assertions). When only truth_fasta is given, a truth VCF is first made
    in <outdir>/truth_vcf with truth_variant_finding.make_truth_vcf().

    The variants in vcf_to_test are applied to ref_fasta, and the truth VCF
    is then annotated by probe mapping to that mutated genome.

    outdir is created with os.mkdir(), so it must not already exist.

    Returns the path of the annotated VCF, <outdir>/recall.vcf.
    """
    os.mkdir(outdir)

    if truth_vcf is not None:
        assert truth_fasta is None
    else:
        assert truth_fasta is not None
        truth_vcf = truth_variant_finding.make_truth_vcf(
            ref_fasta,
            truth_fasta,
            os.path.join(outdir, "truth_vcf"),
            flank_length,
            debug=debug,
            truth_mask=truth_mask,
            max_ref_len=max_ref_len,
        )

    genome_with_calls = os.path.join(outdir, "ref_with_mutations_added.fa")
    apply_variants_to_genome(ref_fasta, vcf_to_test, genome_with_calls)

    # The debug map file is only requested when debugging.
    debug_map = None
    if debug:
        debug_map = os.path.join(outdir, "probe_map_debug.txt")

    recall_vcf = os.path.join(outdir, "recall.vcf")
    probe_mapping.annotate_vcf_with_probe_mapping(
        truth_vcf,
        ref_fasta,
        genome_with_calls,
        flank_length,
        recall_vcf,
        map_outfile=debug_map,
    )
    return recall_vcf
Example #7
0
def evaluate_vcf(
    vcf_to_eval,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    outdir,
    truth_vcf=None,
    debug=False,
    force=False,
    ref_mask_bed_file=None,
    truth_mask_bed_file=None,
    discard_ref_calls=True,
    max_recall_ref_len=None,
):
    """Evaluate a VCF against a truth genome and write summary statistics.

    All output goes into outdir, which is created here. The final result is
    <outdir>/summary_stats.json, holding "Recall" (with "ALL" and "FILT"
    entries) and "Precision" statistics, after
    _add_overall_precision_and_recall_to_summary_stats() has been applied.

    Args:
        vcf_to_eval: VCF file to be evaluated.
        vcf_ref_fasta: reference FASTA that vcf_to_eval is relative to.
        truth_ref_fasta: FASTA of the truth genome.
        flank_length: passed through to the probe mapping and recall steps.
        outdir: output directory. Must not exist unless force is true.
        truth_vcf: optional existing truth VCF. When given, it is passed to
            the recall step and no truth FASTA is passed there.
        debug: when true, probe mapping debug files are requested.
        force: when true, anything already at outdir is deleted first.
        ref_mask_bed_file: optional BED file. When given, vcf_to_eval and
            both recall VCFs are passed through utils.mask_vcf_file().
        truth_mask_bed_file: optional BED file, loaded with
            utils.load_mask_bed_file() and passed on as truth_mask.
        discard_ref_calls: probe mapping gets use_ref_calls set to the
            negation of this.
        max_recall_ref_len: passed to the recall step as max_ref_len.

    Raises:
        FileExistsError: from os.mkdir() if outdir exists and force is false.
        subprocess.CalledProcessError: if removing outdir fails.
    """
    if force:
        # Pass an argument list rather than a shell string, so that outdir
        # is taken literally (spaces, globs, "~", "$VAR", ";" etc. are not
        # interpreted) and the path removed is exactly the one that
        # os.mkdir() creates next. "--" stops a leading "-" being read as
        # an option.
        subprocess.check_output(["rm", "-rf", "--", outdir])
    os.mkdir(outdir)

    # Mask if needed
    if ref_mask_bed_file is not None:
        masked_vcf = os.path.join(outdir, "variants_to_eval.masked.vcf")
        utils.mask_vcf_file(vcf_to_eval, ref_mask_bed_file, masked_vcf)
        vcf_to_eval = masked_vcf

    # Make VCF annotated with TP/FP for precision
    vcf_for_precision = os.path.join(outdir, "precision.vcf")
    map_outfile = f"{vcf_for_precision}.debug.map" if debug else None

    if truth_mask_bed_file is None:
        truth_mask = None
    else:
        truth_mask = utils.load_mask_bed_file(truth_mask_bed_file)

    probe_mapping.annotate_vcf_with_probe_mapping(
        vcf_to_eval,
        vcf_ref_fasta,
        truth_ref_fasta,
        flank_length,
        vcf_for_precision,
        map_outfile=map_outfile,
        use_ref_calls=not discard_ref_calls,
        truth_mask=truth_mask,
    )

    # Recall is calculated from the precision-annotated VCF. Exactly one of
    # truth_fasta / truth_vcf is handed over.
    recall_dir = os.path.join(outdir, "recall")
    vcf_for_recall_all, vcf_for_recall_filtered = recall.get_recall(
        vcf_ref_fasta,
        vcf_for_precision,
        recall_dir,
        flank_length,
        debug=debug,
        truth_fasta=truth_ref_fasta if truth_vcf is None else None,
        truth_vcf=truth_vcf,
        truth_mask=truth_mask,
        max_ref_len=max_recall_ref_len,
    )
    if ref_mask_bed_file is not None:
        # Apply the same reference mask to both recall VCFs, then remove the
        # intermediate masked copy of the input VCF.
        masked_recall_all = f"{vcf_for_recall_all}.masked.vcf"
        utils.mask_vcf_file(vcf_for_recall_all, ref_mask_bed_file, masked_recall_all)
        vcf_for_recall_all = masked_recall_all

        masked_recall_filtered = f"{vcf_for_recall_filtered}.masked.vcf"
        utils.mask_vcf_file(
            vcf_for_recall_filtered, ref_mask_bed_file, masked_recall_filtered
        )
        vcf_for_recall_filtered = masked_recall_filtered
        os.unlink(masked_vcf)

    # Gather stats and make plots
    per_record_recall_all = vcf_stats.per_record_stats_from_vcf_file(
        vcf_for_recall_all
    )
    per_record_recall_filtered = vcf_stats.per_record_stats_from_vcf_file(
        vcf_for_recall_filtered
    )
    recall_stats_all = vcf_stats.summary_stats_from_per_record_stats(
        per_record_recall_all, for_recall=True
    )
    recall_stats_filtered = vcf_stats.summary_stats_from_per_record_stats(
        per_record_recall_filtered, for_recall=True
    )
    # The "ALL" entry of each summary is used for both runs.
    # NOTE(review): presumably the ALL/FILT distinction was already made
    # when the recall VCFs were built - confirm against recall.get_recall().
    recall_stats = {
        "ALL": recall_stats_all["ALL"],
        "FILT": recall_stats_filtered["ALL"],
    }

    per_record_precision = vcf_stats.per_record_stats_from_vcf_file(
        vcf_for_precision
    )
    precision_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_precision
    )

    summary_stats = {"Recall": recall_stats, "Precision": precision_stats}
    _add_overall_precision_and_recall_to_summary_stats(summary_stats)

    summary_stats_json = os.path.join(outdir, "summary_stats.json")
    with open(summary_stats_json, "w") as f:
        json.dump(summary_stats, f, indent=2, sort_keys=True)
Example #8
0
def evaluate_vcf(
    vcf_to_eval,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    outdir,
    truth_vcf=None,
    debug=False,
    force=False,
    ref_mask_bed_file=None,
    truth_mask_bed_file=None,
    discard_ref_calls=True,
    max_recall_ref_len=None,
    filter_pass=None,
):
    """Evaluate a VCF against a truth genome and write summary statistics.

    All output goes into outdir, which is created here. The final result is
    <outdir>/summary_stats.json, holding "Recall" and "Precision" statistics
    (after _add_overall_precision_and_recall_to_summary_stats() has been
    applied) plus "Excluded_record_counts" as returned by _filter_vcf().

    Args:
        vcf_to_eval: VCF file to be evaluated.
        vcf_ref_fasta: reference FASTA that vcf_to_eval is relative to.
        truth_ref_fasta: FASTA of the truth genome.
        flank_length: passed through to the probe mapping and recall steps.
        outdir: output directory. Must not exist unless force is true.
        truth_vcf: optional existing truth VCF. When given, it is passed to
            the recall step and no truth FASTA is passed there.
        debug: when true, probe mapping debug files are requested.
        force: when true, anything already at outdir is deleted first.
        ref_mask_bed_file: optional BED file. When given, vcf_to_eval and the
            recall VCF are passed through utils.mask_vcf_file().
        truth_mask_bed_file: optional BED file, loaded with
            utils.load_mask_bed_file() and passed on as truth_mask.
        discard_ref_calls: its negation is passed to _filter_vcf() as
            keep_ref_calls and to probe mapping as use_ref_calls.
        max_recall_ref_len: passed to the recall step as max_ref_len.
        filter_pass: passed through to _filter_vcf().

    Raises:
        FileExistsError: from os.mkdir() if outdir exists and force is false.
        subprocess.CalledProcessError: if removing outdir fails.
    """
    if force:
        # Pass an argument list rather than a shell string, so that outdir
        # is taken literally (spaces, globs, "~", "$VAR", ";" etc. are not
        # interpreted) and the path removed is exactly the one that
        # os.mkdir() creates next. "--" stops a leading "-" being read as
        # an option.
        subprocess.check_output(["rm", "-rf", "--", outdir])
    os.mkdir(outdir)

    # Mask if needed
    if ref_mask_bed_file is None:
        vcf_to_filter = vcf_to_eval
    else:
        logging.info("Masking VCF...")
        masked_vcf = os.path.join(outdir, "variants_to_eval.masked.vcf")
        utils.mask_vcf_file(vcf_to_eval, ref_mask_bed_file, masked_vcf)
        vcf_to_filter = masked_vcf
        logging.info("Masked VCF")

    # Split the (possibly masked) records into a kept file and an excluded
    # file. The kept file is what precision and recall are both run on.
    vcf_ref_seqs = utils.file_to_dict_of_seqs(vcf_ref_fasta)
    filtered_vcf = os.path.join(outdir, "variants_to_eval.filtered.vcf")
    excluded_vcf = os.path.join(outdir, "variants_to_eval.excluded.vcf")
    logging.info("Filtering VCF...")
    filtered_counts = _filter_vcf(
        vcf_to_filter,
        filtered_vcf,
        excluded_vcf,
        vcf_ref_seqs,
        filter_pass=filter_pass,
        keep_ref_calls=not discard_ref_calls,
    )
    logging.info("Filtering VCF done")

    vcf_for_precision = os.path.join(outdir, "precision.vcf")
    map_outfile = f"{vcf_for_precision}.debug.map" if debug else None
    if truth_mask_bed_file is None:
        truth_mask = None
    else:
        truth_mask = utils.load_mask_bed_file(truth_mask_bed_file)

    logging.info("Annotating VCF with TP/FP for precision...")
    probe_mapping.annotate_vcf_with_probe_mapping(
        filtered_vcf,
        vcf_ref_fasta,
        truth_ref_fasta,
        flank_length,
        vcf_for_precision,
        map_outfile=map_outfile,
        use_ref_calls=not discard_ref_calls,
        truth_mask=truth_mask,
    )
    logging.info("Annotatiing VCF with with TP/FP for precision done")

    # Exactly one of truth_fasta / truth_vcf is handed to the recall step.
    logging.info("Calculating recall...")
    recall_dir = os.path.join(outdir, "recall")
    vcf_for_recall = recall.get_recall(
        vcf_ref_fasta,
        filtered_vcf,
        recall_dir,
        flank_length,
        debug=debug,
        truth_fasta=truth_ref_fasta if truth_vcf is None else None,
        truth_vcf=truth_vcf,
        truth_mask=truth_mask,
        max_ref_len=max_recall_ref_len,
    )
    if ref_mask_bed_file is not None:
        # Apply the same reference mask to the recall VCF.
        logging.info("Masking recall VCF...")
        masked_recall_vcf = f"{vcf_for_recall}.masked.vcf"
        utils.mask_vcf_file(vcf_for_recall, ref_mask_bed_file, masked_recall_vcf)
        vcf_for_recall = masked_recall_vcf
        logging.info("Masking recall VCF done")
    logging.info("Recall calculation done")

    # Gather stats and make plots
    logging.info("Gathering stats...")
    per_record_recall = vcf_stats.per_record_stats_from_vcf_file(vcf_for_recall)
    recall_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_recall, for_recall=True
    )

    per_record_precision = vcf_stats.per_record_stats_from_vcf_file(vcf_for_precision)
    precision_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_precision
    )

    summary_stats = {"Recall": recall_stats, "Precision": precision_stats}
    _add_overall_precision_and_recall_to_summary_stats(summary_stats)
    summary_stats["Excluded_record_counts"] = filtered_counts

    summary_stats_json = os.path.join(outdir, "summary_stats.json")
    with open(summary_stats_json, "w") as f:
        json.dump(summary_stats, f, indent=2, sort_keys=True)

    logging.info(f"Done. Results written to {summary_stats_json}")