Example #1
0
def main(args):
    """
    Run the variant QC annotation steps requested on the command line.

    Hail is initialised with GRCh38 as the default reference, then each step
    below runs only if its flag on ``args`` is truthy. Steps execute in the
    order written here, so a later step can consume what an earlier one wrote.

    :param args: Parsed arguments. Attributes read: ``compute_info``,
        ``split_info``, ``export_info_vcf``, ``generate_fam_stats``,
        ``export_transmitted_singletons_vcf``, ``vep`` and ``overwrite``.
    :return: None
    """
    hl.init(default_reference='GRCh38', log='/qc_annotations.log')

    # Unsplit (multi-allelic) info table.
    if args.compute_info:
        unsplit_info_ht = compute_info()
        unsplit_info_ht.write(
            get_info(split=False).path, overwrite=args.overwrite
        )

    # Split version of the info table.
    if args.split_info:
        split_info_ht = split_info()
        split_info_ht.write(
            get_info(split=True).path, overwrite=args.overwrite
        )

    # VCF export of the unsplit info table.
    if args.export_info_vcf:
        stored_info_ht = get_info(split=False).ht()
        info_vcf_mt = ht_to_vcf_mt(stored_info_ht)
        hl.export_vcf(info_vcf_mt, info_vcf_path)

    # TODO: compute AC and qc_AC as part of compute_info.
    # The disabled step below is kept for reference until that is done.
    # if args.generate_ac:
    # mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
    # mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    #
    # ht = generate_ac(mt, ).checkpoint('gs://gnomad-tmp/v3_ac_tmp.ht', overwrite=args.overwrite, _read_if_exists=not args.overwrite)
    # ht.repartition(10000, shuffle=False).write(ac_ht_path, overwrite=args.overwrite)

    # Family statistics computed on the split sparse MatrixTable.
    if args.generate_fam_stats:
        sparse_mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True, samples_meta=True
        )
        split_mt = hl.experimental.sparse_split_multi(
            sparse_mt, filter_changed_loci=True
        )
        stats_ht = generate_fam_stats(split_mt, trios.path)
        # Checkpoint to a temporary location first; when not overwriting, an
        # existing checkpoint is read back instead of being recomputed.
        stats_ht = stats_ht.checkpoint(
            'gs://gnomad-tmp/v3_fam_stats_tmp.ht',
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        stats_ht.repartition(10000, shuffle=False).write(
            fam_stats.path, overwrite=args.overwrite
        )

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    # VEP annotations.
    if args.vep:
        vep_ht = run_vep()
        vep_ht.write(vep.path, overwrite=args.overwrite)
Example #2
0
def create_bin_ht(model_id: str, n_bins: int) -> hl.Table:
    """
    Build the table of bin annotations for one RF or VQSR run.

    The model's per-variant table is given a common set of fields (``info``,
    ``score``, ``positive_train_site``, ``negative_train_site``), filtered,
    flagged with ``non_lcr`` and handed to ``create_binned_ht``. The table is
    returned, not written; persisting it is left to the caller.

    :param model_id: Which variant QC model (RF or VQSR model ID) to annotate
        with bins. IDs starting with ``"vqsr"`` take the VQSR path; any other
        ID is treated as an RF model.
    :param n_bins: Number of bins to bin the data into
    :return: Table with bin annotations
    """
    logger.info(f"Annotating {model_id} HT with bins using {n_bins} bins")
    info_ht = get_info(split=True).ht()

    if not model_id.startswith("vqsr"):
        # RF model: score is the random forest's "TP" probability.
        ht = get_rf_result(model_id=model_id).ht()
        ht = ht.annotate(
            info=info_ht[ht.key].info,
            positive_train_site=ht.tp,
            negative_train_site=ht.fp,
            score=ht.rf_probability["TP"],
        )
    else:
        # VQSR model: bring in the RF annotations and read the score and
        # training-site flags from the VQSR table's own info struct.
        rf_annotations_ht = get_rf_annotations().ht()
        ht = get_vqsr_filters(model_id, split=True).ht()
        vqsr_info = ht.info

        ht = ht.annotate(
            **rf_annotations_ht[ht.key],
            info=info_ht[ht.key].info,
            score=vqsr_info.AS_VQSLOD,
            positive_train_site=vqsr_info.POSITIVE_TRAIN_SITE,
            negative_train_site=vqsr_info.NEGATIVE_TRAIN_SITE,
            AS_culprit=vqsr_info.AS_culprit,
        )

        # Drop rows with an undefined ac_raw. Per the original author's note,
        # ac_raw was calculated on the high quality samples only, whereas
        # VQSR was run before sample filtering.
        ht = ht.filter(hl.is_defined(ht.ac_raw))

    # Keep rows that are not AS_lowqual and that have no match in the
    # telomeres/centromeres table.
    not_lowqual = ~info_ht[ht.key].AS_lowqual
    outside_telomeres_and_centromeres = ~hl.is_defined(
        telomeres_and_centromeres.ht()[ht.locus]
    )
    ht = ht.filter(not_lowqual & outside_telomeres_and_centromeres)

    # A row is "non_lcr" if it survives the low-confidence-region filter.
    non_lcr_ht = filter_low_conf_regions(
        ht,
        filter_lcr=True,
        # TODO: Uncomment when we have decoy path
        filter_decoy=False,  # True,
        filter_segdup=True,
    )
    ht = ht.annotate(non_lcr=hl.is_defined(non_lcr_ht[ht.key]))

    return create_binned_ht(ht, n_bins, add_substrat={"non_lcr": ht.non_lcr})
Example #3
0
def split_info() -> hl.Table:
    """
    Generate the split info table from the multi-allelic info table.

    The unsplit info table is split with ``hl.split_multi`` and its
    array-valued annotations are reduced to the entry for the row's allele:

    - every ``info`` field whose name starts with ``AC``, or with ``AS_``
      (``AS_SB_TABLE`` excepted), is indexed at ``a_index - 1``;
    - ``AS_SB_TABLE`` becomes element 0 extended with element ``a_index``
      (presumably the reference counts followed by this allele's counts;
      confirm against how ``compute_info`` builds it);
    - the top-level ``lowqual`` is replaced by its element at ``a_index - 1``.

    :return: Info table with split multi-allelics
    :rtype: Table
    """
    split_ht = hl.split_multi(get_info(split=False).ht())

    info = split_ht.info
    allele_index = split_ht.a_index

    def _is_indexed_by_alt_allele(field_name):
        # AS_SB_TABLE is excluded because it is handled separately below.
        return field_name.startswith("AC") or (
            field_name.startswith("AS_") and field_name != 'AS_SB_TABLE'
        )

    per_allele_fields = {
        field_name: info[field_name][allele_index - 1]
        for field_name in info
        if _is_indexed_by_alt_allele(field_name)
    }
    sb_table = info.AS_SB_TABLE[0].extend(info.AS_SB_TABLE[allele_index])

    return split_ht.annotate(
        info=info.annotate(**per_allele_fields, AS_SB_TABLE=sb_table),
        lowqual=split_ht.lowqual[allele_index - 1],
    )
Example #4
0
def main(args):
    """
    Run the variant QC evaluation steps requested on the command line.

    After initialising Hail, each step below runs only if its flag on
    ``args`` is truthy, in the order written here.

    :param args: Parsed arguments. Attributes read: ``create_bin_ht``,
        ``run_sanity_checks``, ``create_aggregated_bin_ht``,
        ``extract_truth_samples``, ``merge_with_truth_data``,
        ``bin_truth_sample_concordance``, ``model_id``, ``n_bins``,
        ``n_partitions`` and ``overwrite``.
    :return: None
    """
    hl.init(log="/variant_qc_evaluation.log")

    # Per-variant score bins for the model.
    if args.create_bin_ht:
        bin_ht = create_bin_ht(args.model_id, args.n_bins)
        bin_ht.write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    # Print counters over the bin table written by the step above.
    if args.run_sanity_checks:
        bins_ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        sanity_counters = {
            "was_biallelic": hl.agg.counter(~bins_ht.was_split),
            "has_biallelic_rank": hl.agg.counter(
                hl.is_defined(bins_ht.biallelic_bin)
            ),
            "was_singleton": hl.agg.counter(bins_ht.singleton),
            "has_singleton_rank": hl.agg.counter(
                hl.is_defined(bins_ht.singleton_bin)
            ),
            "was_biallelic_singleton": hl.agg.counter(
                bins_ht.singleton & ~bins_ht.was_split
            ),
            "has_biallelic_singleton_rank": hl.agg.counter(
                hl.is_defined(bins_ht.biallelic_singleton_bin)
            ),
        }
        print(bins_ht.aggregate(hl.struct(**sanity_counters)))

    # Aggregated score bins.
    if args.create_aggregated_bin_ht:
        logger.warning(
            "Use only workers, it typically crashes with preemptibles")
        aggregated_bin_ht = create_aggregated_bin_ht(args.model_id)
        aggregated_bin_ht.write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    # One callset MatrixTable per truth sample.
    if args.extract_truth_samples:
        logger.info(f"Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            remove_hard_filtered_samples=False,
        )

        truth_sample_ids = [
            sample_info["s"] for sample_info in TRUTH_SAMPLES.values()
        ]
        mt = mt.filter_cols(hl.literal(truth_sample_ids).contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a
        # second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample, sample_info in TRUTH_SAMPLES.items():
            sample_mt = mt.filter_cols(mt.s == sample_info["s"])
            # Keep only rows where this sample has a non-reference genotype
            sample_mt = sample_mt.filter_rows(
                hl.agg.any(sample_mt.GT.is_non_ref())
            )
            sample_mt = sample_mt.naive_coalesce(args.n_partitions)
            sample_mt.write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    # Combine each extracted callset sample with its truth data.
    if args.merge_with_truth_data:
        for truth_sample, sample_info in TRUTH_SAMPLES.items():
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            callset_mt = get_callset_truth_data(truth_sample).mt()
            hc_intervals_ht = sample_info["hc_intervals"].ht()
            truth_mt = sample_info["truth_mt"].mt()
            truth_mt = truth_mt.key_cols_by(s=hl.str(sample_info["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            not_lowqual = ~info_ht[callset_mt.row_key].AS_lowqual
            callset_mt = callset_mt.filter_rows(not_lowqual)

            merged_ht = create_truth_sample_ht(
                callset_mt, truth_mt, hc_intervals_ht
            )
            merged_ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    # Binned concordance of each truth sample for the model.
    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            concordance_ht = get_callset_truth_data(
                truth_sample, mt=False
            ).ht()

            # Keep rows that are not AS_lowqual and that have no match in
            # the telomeres/centromeres table.
            info_ht = get_info(split=True).ht()
            not_lowqual = ~info_ht[concordance_ht.key].AS_lowqual
            outside_telomeres_and_centromeres = ~hl.is_defined(
                telomeres_and_centromeres.ht()[concordance_ht.locus]
            )
            concordance_ht = concordance_ht.filter(
                not_lowqual & outside_telomeres_and_centromeres
            )

            logger.info("Filtering out low confidence regions and segdups...")
            concordance_ht = filter_low_conf_regions(
                concordance_ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            # Restrict to rows present in the score table, then copy the
            # score over.
            concordance_ht = concordance_ht.filter(
                hl.is_defined(metric_ht[concordance_ht.key])
            )
            concordance_ht = concordance_ht.annotate(
                score=metric_ht[concordance_ht.key].score
            )

            concordance_ht = compute_binned_truth_sample_concordance(
                concordance_ht, metric_ht, args.n_bins
            )
            concordance_ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )