Exemple #1
0
def create_bin_ht(model_id: str, n_bins: int) -> hl.Table:
    """
    Creates a table with bin annotations added for a RF or VQSR run and writes it to its correct location in annotations.

    :param model_id: Which variant QC model (RF or VQSR model ID) to annotate with bin
    :param n_bins: Number of bins to bin the data into
    :return: Table with bin annotations
    """
    logger.info(f"Annotating {model_id} HT with bins using {n_bins} bins")
    info_ht = get_info(split=True).ht()
    if model_id.startswith("vqsr"):
        rf_ht = get_rf_annotations().ht()
        ht = get_vqsr_filters(model_id, split=True).ht()

        ht = ht.annotate(
            **rf_ht[ht.key],
            info=info_ht[ht.key].info,
            score=ht.info.AS_VQSLOD,
            positive_train_site=ht.info.POSITIVE_TRAIN_SITE,
            negative_train_site=ht.info.NEGATIVE_TRAIN_SITE,
            AS_culprit=ht.info.AS_culprit,
        )

        # Remove all samples with an undefined ac_raw Because ac_raw was calculated on the high quality samples only
        # and VQSR was run before sample filtering
        ht = ht.filter(hl.is_defined(ht.ac_raw))

    else:
        ht = get_rf_result(model_id=model_id).ht()
        ht = ht.annotate(
            info=info_ht[ht.key].info,
            positive_train_site=ht.tp,
            negative_train_site=ht.fp,
            score=ht.rf_probability["TP"],
        )

    ht = ht.filter(~info_ht[ht.key].AS_lowqual
                   & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))
    ht_non_lcr = filter_low_conf_regions(
        ht,
        filter_lcr=True,
        # TODO: Uncomment when we have decoy path
        filter_decoy=False,  # True,
        filter_segdup=True,
    )
    ht = ht.annotate(non_lcr=hl.is_defined(ht_non_lcr[ht.key]))
    bin_ht = create_binned_ht(ht, n_bins, add_substrat={"non_lcr": ht.non_lcr})

    return bin_ht
def main(args):
    hl.init(default_reference='GRCh38')
    coverage_version = args.coverage_version if args.coverage_version else CURRENT_GENOME_COVERAGE_RELEASE
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines."
    )

    if args.compute_coverage_ht:

        print("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            contigs=[f'chr{x}' for x in range(1, 23)] + ['chrX', 'chrY'],
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect(
            ))
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht",
                                   overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)

        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True)
        coverage_ht = coverage_ht.naive_coalesce(5000)

        coverage_ht.write(coverage('genomes').versions[coverage_version].path,
                          overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions[coverage_version].ht()
        if 'count_array' in ht.row_value:  # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', coverage_version))
Exemple #3
0
def train_rf(
    ht: hl.Table,
    fp_to_tp: float = 1.0,
    num_trees: int = 500,
    max_depth: int = 5,
    no_transmitted_singletons: bool = False,
    no_inbreeding_coeff: bool = False,
    vqsr_training: bool = False,
    vqsr_model_id: str = False,
    filter_centromere_telomere: bool = False,
    test_intervals: Union[str, List[str]] = "chr20",
):
    """
    Train random forest model using `train_rf_model`

    :param ht: Table containing annotations needed for RF training, built with `create_rf_ht`
    :param fp_to_tp: Ratio of FPs to TPs for creating the RF model. If set to 0, all training examples are used.
    :param num_trees: Number of trees in the RF model.
    :param max_depth: Maxmimum tree depth in the RF model.
    :param no_transmitted_singletons: Do not use transmitted singletons for training.
    :param no_inbreeding_coeff: Do not use inbreeding coefficient as a feature for training.
    :param vqsr_training: Use VQSR training sites to train the RF.
    :param vqsr_model_id: VQSR model to use for vqsr_training. `vqsr_training` must be True for this parameter to be used.
    :param filter_centromere_telomere: Filter centromeres and telomeres before training.
    :param test_intervals: Specified interval(s) will be held out for testing and evaluation only. (default to "chr20")
    :return: `ht` annotated with training information and the RF model
    """
    features = FEATURES
    test_intervals = test_intervals
    if no_inbreeding_coeff:
        logger.info("Removing InbreedingCoeff from list of features...")
        features.remove("InbreedingCoeff")

    if vqsr_training:
        logger.info("Using VQSR training sites for RF training...")
        vqsr_ht = get_vqsr_filters(vqsr_model_id, split=True).ht()
        ht = ht.annotate(
            vqsr_POSITIVE_TRAIN_SITE=vqsr_ht[ht.key].info.POSITIVE_TRAIN_SITE,
            vqsr_NEGATIVE_TRAIN_SITE=vqsr_ht[ht.key].info.NEGATIVE_TRAIN_SITE,
        )
        tp_expr = ht.vqsr_POSITIVE_TRAIN_SITE
        fp_expr = ht.vqsr_NEGATIVE_TRAIN_SITE
    else:
        fp_expr = ht.fail_hard_filters
        tp_expr = ht.omni | ht.mills
        if not no_transmitted_singletons:
            tp_expr = tp_expr | ht.transmitted_singleton

    if test_intervals:
        if isinstance(test_intervals, str):
            test_intervals = [test_intervals]
        test_intervals = [
            hl.parse_locus_interval(x, reference_genome="GRCh38")
            for x in test_intervals
        ]

    ht = ht.annotate(tp=tp_expr, fp=fp_expr)

    if filter_centromere_telomere:
        logger.info("Filtering centromeres and telomeres from HT...")
        rf_ht = ht.filter(~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))
    else:
        rf_ht = ht

    rf_ht, rf_model = train_rf_model(
        rf_ht,
        rf_features=features,
        tp_expr=rf_ht.tp,
        fp_expr=rf_ht.fp,
        fp_to_tp=fp_to_tp,
        num_trees=num_trees,
        max_depth=max_depth,
        test_expr=hl.literal(test_intervals).any(
            lambda interval: interval.contains(rf_ht.locus)
        ),
    )

    logger.info("Joining original RF Table with training information")
    ht = ht.join(rf_ht, how="left")

    return ht, rf_model
Exemple #4
0
def main(args):
    hl.init(log="/variant_qc_evaluation.log")

    if args.create_bin_ht:
        create_bin_ht(
            args.model_id,
            args.n_bins,
        ).write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    if args.run_sanity_checks:
        ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        print(
            ht.aggregate(
                hl.struct(
                    was_biallelic=hl.agg.counter(~ht.was_split),
                    has_biallelic_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_bin)),
                    was_singleton=hl.agg.counter(ht.singleton),
                    has_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.singleton_bin)),
                    was_biallelic_singleton=hl.agg.counter(ht.singleton
                                                           & ~ht.was_split),
                    has_biallelic_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_singleton_bin)),
                )))

    if args.create_aggregated_bin_ht:
        logger.warning(
            "Use only workers, it typically crashes with preemptibles")
        create_aggregated_bin_ht(args.model_id).write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    if args.extract_truth_samples:
        logger.info(f"Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                              remove_hard_filtered_samples=False)

        mt = mt.filter_cols(
            hl.literal([v["s"]
                        for k, v in TRUTH_SAMPLES.items()]).contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample in TRUTH_SAMPLES:
            truth_sample_mt = mt.filter_cols(
                mt.s == TRUTH_SAMPLES[truth_sample]["s"])
            # Filter to variants in truth data
            truth_sample_mt = truth_sample_mt.filter_rows(
                hl.agg.any(truth_sample_mt.GT.is_non_ref()))
            truth_sample_mt.naive_coalesce(args.n_partitions).write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    if args.merge_with_truth_data:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            mt = get_callset_truth_data(truth_sample).mt()
            truth_hc_intervals = TRUTH_SAMPLES[truth_sample][
                "hc_intervals"].ht()
            truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt()
            truth_mt = truth_mt.key_cols_by(
                s=hl.str(TRUTH_SAMPLES[truth_sample]["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual)

            ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals)
            ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            ht = get_callset_truth_data(truth_sample, mt=False).ht()

            info_ht = get_info(split=True).ht()
            ht = ht.filter(
                ~info_ht[ht.key].AS_lowqual
                & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

            logger.info("Filtering out low confidence regions and segdups...")
            ht = filter_low_conf_regions(
                ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))

            ht = ht.annotate(score=metric_ht[ht.key].score)

            ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins)
            ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )
def main(args):
    hl.init(log="/variant_qc_finalize.log")

    ht = get_score_bins(args.model_id, aggregated=False).ht()
    if args.filter_centromere_telomere:
        ht = ht.filter(
            ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

    info_ht = get_info(split=True).ht()
    ht = ht.filter(~info_ht[ht.key].AS_lowqual)

    if args.model_id.startswith("vqsr_"):
        ht = ht.drop("info")

    freq_ht = get_freq().ht()
    ht = ht.annotate(InbreedingCoeff=freq_ht[ht.key].InbreedingCoeff)
    freq_idx = freq_ht[ht.key]
    aggregated_bin_path = get_score_bins(args.model_id, aggregated=True).path
    if not file_exists(aggregated_bin_path):
        sys.exit(
            f"Could not find binned HT for model: {args.model_id} ({aggregated_bin_path}). Please run create_ranked_scores.py for that hash."
        )
    aggregated_bin_ht = get_score_bins(args.model_id, aggregated=True).ht()

    ht = generate_final_filter_ht(
        ht,
        args.model_name,
        args.score_name,
        ac0_filter_expr=freq_idx.freq[0].AC == 0,
        ts_ac_filter_expr=freq_idx.freq[1].AC == 1,
        mono_allelic_flag_expr=(freq_idx.freq[1].AF == 1) |
        (freq_idx.freq[1].AF == 0),
        snp_bin_cutoff=args.snp_bin_cutoff,
        indel_bin_cutoff=args.indel_bin_cutoff,
        snp_score_cutoff=args.snp_score_cutoff,
        indel_score_cutoff=args.indel_score_cutoff,
        inbreeding_coeff_cutoff=args.inbreeding_coeff_threshold,
        aggregated_bin_ht=aggregated_bin_ht,
        bin_id="bin",
        vqsr_ht=get_vqsr_filters(args.vqsr_model_id, split=True).ht()
        if args.vqsr_model_id else None,
    )
    ht = ht.annotate_globals(
        filtering_model=ht.filtering_model.annotate(model_id=args.model_id, ))
    if args.model_id.startswith("vqsr_"):
        ht = ht.annotate_globals(filtering_model=ht.filtering_model.annotate(
            snv_training_variables=[
                "AS_QD",
                "AS_MQRankSum",
                "AS_ReadPosRankSum",
                "AS_FS",
                "AS_SOR",
                "AS_MQ",
            ],
            indel_training_variables=[
                "AS_QD",
                "AS_MQRankSum",
                "AS_ReadPosRankSum",
                "AS_FS",
                "AS_SOR",
            ],
        ))
    else:
        ht = ht.annotate_globals(filtering_model=ht.filtering_model.annotate(
            snv_training_variables=ht.features,
            indel_training_variables=ht.features,
        ))

    ht.write(final_filter.path, args.overwrite)

    final_filter_ht = final_filter.ht()
    final_filter_ht.summarize()