Example #1
0
def create_aggregated_bin_ht(model_id: str) -> hl.Table:
    """
    Build the Table of per-bin aggregate statistics needed for evaluation plots.

    Starting from the un-aggregated score bin Table for `model_id`, variants are grouped by `bin_id`
    (rank, bi-allelic, etc.), contig, and `snv`, `bi_allelic`, and `singleton` status, using the bin
    annotations already present on the Table.

    Before grouping, SNV/indel counts for every row field whose name ends in "bin" are logged and
    annotated as the `bin_variant_counts` global, and each variant is flagged with `clinvar_path`
    (present in the ClinVar pathogenic Table). The aggregation adds `n_clinvar_path` on top of the
    statistics supplied by `score_bin_agg`.

    :param str model_id: Which variant QC model (RF or VQSR model ID) to group
    :return: Table of aggregate statistics by bin
    """
    bins_ht = get_score_bins(model_id, aggregated=False).ht()

    # One SNV/indel counter per "*bin" row field, restricted to variants where that bin is defined
    per_bin_counters = {}
    for field_name in bins_ht.row:
        if not field_name.endswith("bin"):
            continue
        bin_is_set = hl.is_defined(bins_ht[field_name])
        allele_class = hl.cond(
            hl.is_snp(bins_ht.alleles[0], bins_ht.alleles[1]), "snv", "indel"
        )
        per_bin_counters[field_name] = hl.agg.filter(
            bin_is_set, hl.agg.counter(allele_class)
        )

    variant_counts = bins_ht.aggregate(hl.struct(**per_bin_counters))
    logger.info(
        f"Found the following variant counts:\n {pformat(variant_counts)}")
    bins_ht = bins_ht.annotate_globals(bin_variant_counts=variant_counts)

    # Flag variants that are present in the ClinVar pathogenic Table
    pathogenic_ht = filter_to_clinvar_pathogenic(clinvar.ht())
    in_clinvar_pathogenic = hl.is_defined(pathogenic_ht[bins_ht.key])
    bins_ht = bins_ht.annotate(clinvar_path=in_clinvar_pathogenic)

    family_stats_ht = fam_stats.ht()

    logger.info(f"Creating grouped bin table...")
    checkpoint = get_checkpoint_path(f"grouped_bin_{model_id}")
    grouped_ht = compute_grouped_binned_ht(bins_ht, checkpoint_path=checkpoint)

    logger.info(f"Aggregating grouped bin table...")
    # Aggregator expressions must reference the Table underlying the grouped Table
    source_ht = grouped_ht._parent
    clinvar_path_count = hl.agg.count_where(source_ht.clinvar_path)
    bin_aggregations = score_bin_agg(grouped_ht, fam_stats_ht=family_stats_ht)

    return grouped_ht.aggregate(
        n_clinvar_path=clinvar_path_count,
        **bin_aggregations,
    )
def create_grouped_bin_ht(model_id: str, overwrite: bool = False) -> None:
    """
    Write the aggregated (grouped) quantile bin Table for `model_id`.

    Reads the un-aggregated quantile bin annotated Table, groups it by bin_id (rank, bi-allelic, etc.),
    contig, snv, bi_allelic and singleton, and aggregates the information needed for evaluation plots.
    SNV/indel counts for every row field whose name ends in "bin" are logged and annotated as the
    `bin_variant_counts` global before grouping. The result is written to the path of the aggregated
    quantile bin resource.

    :param str model_id: Which data/run hash is being created
    :param bool overwrite: Should output files be overwritten if present
    :return: None
    :rtype: None
    """
    quantile_ht = get_score_quantile_bins(model_id, aggregated=False).ht()

    # One SNV/indel counter per "*bin" row field, restricted to variants where that bin is defined
    per_bin_counters = {}
    for field_name in quantile_ht.row:
        if not field_name.endswith("bin"):
            continue
        bin_is_set = hl.is_defined(quantile_ht[field_name])
        allele_class = hl.cond(
            hl.is_snp(quantile_ht.alleles[0], quantile_ht.alleles[1]),
            "snv",
            "indel",
        )
        per_bin_counters[field_name] = hl.agg.filter(
            bin_is_set, hl.agg.counter(allele_class)
        )

    variant_counts = quantile_ht.aggregate(hl.struct(**per_bin_counters))
    logger.info(
        f"Found the following variant counts:\n {pformat(variant_counts)}")
    quantile_ht = quantile_ht.annotate_globals(bin_variant_counts=variant_counts)

    family_stats_ht = fam_stats.ht()

    logger.info(f"Creating grouped bin table...")
    checkpoint = get_checkpoint_path(f"grouped_bin_{model_id}")
    grouped_ht = compute_grouped_binned_ht(quantile_ht, checkpoint_path=checkpoint)

    logger.info(f"Aggregating grouped bin table...")
    bin_aggregations = score_bin_agg(grouped_ht, fam_stats_ht=family_stats_ht)
    aggregated_ht = grouped_ht.aggregate(**bin_aggregations)

    output_path = get_score_quantile_bins(model_id, aggregated=True).path
    aggregated_ht.write(output_path, overwrite=overwrite)