def create_aggregated_bin_ht(model_id: str) -> hl.Table:
    """
    Build the Table of per-bin aggregate statistics for one variant QC model.

    Starting from the un-aggregated score bin Table of `model_id`, this:

    - counts SNVs and indels for every row field whose name ends in "bin" and
      stores the result in the `bin_variant_counts` global,
    - flags variants found in the ClinVar pathogenic Table as `clinvar_path`,
    - groups the variants by `bin_id` (rank, bi-allelic, etc.), contig, and `snv`,
      `bi_allelic`, and `singleton` status using the previously annotated bin
      information,
    - aggregates, for each group, the statistics needed for evaluation plots.

    :param str model_id: Which variant QC model (RF or VQSR model ID) to group
    :return: Table of aggregate statistics by bin
    """
    bin_ht = get_score_bins(model_id, aggregated=False).ht()

    # Count variants for ranking: one SNV/indel counter per bin annotation,
    # restricted to the rows where that bin annotation is defined.
    per_bin_counters = {}
    for field in bin_ht.row:
        if not field.endswith("bin"):
            continue
        bin_is_defined = hl.is_defined(bin_ht[field])
        variant_type = hl.cond(
            hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]), "snv", "indel"
        )
        per_bin_counters[field] = hl.agg.filter(
            bin_is_defined, hl.agg.counter(variant_type)
        )

    bin_variant_counts = bin_ht.aggregate(hl.struct(**per_bin_counters))
    logger.info(
        f"Found the following variant counts:\n {pformat(bin_variant_counts)}"
    )
    bin_ht = bin_ht.annotate_globals(bin_variant_counts=bin_variant_counts)

    # Flag the variants that are present in the ClinVar pathogenic Table.
    pathogenic_ht = filter_to_clinvar_pathogenic(clinvar.ht())
    is_pathogenic = hl.is_defined(pathogenic_ht[bin_ht.key])
    bin_ht = bin_ht.annotate(clinvar_path=is_pathogenic)

    family_stats_ht = fam_stats.ht()

    logger.info(f"Creating grouped bin table...")
    checkpoint = get_checkpoint_path(f"grouped_bin_{model_id}")
    grouped_ht = compute_grouped_binned_ht(bin_ht, checkpoint_path=checkpoint)

    logger.info(f"Aggregating grouped bin table...")
    # The ClinVar flag is read from the table underlying the grouped table.
    ungrouped_ht = grouped_ht._parent
    return grouped_ht.aggregate(
        n_clinvar_path=hl.agg.count_where(ungrouped_ht.clinvar_path),
        **score_bin_agg(grouped_ht, fam_stats_ht=family_stats_ht),
    )
def create_grouped_bin_ht(model_id: str, overwrite: bool = False) -> None:
    """
    Write the aggregated quantile-bin Table used for evaluation plots.

    Loads the un-aggregated quantile bin annotated Table of `model_id`, records the
    SNV/indel counts of every row field whose name ends in "bin" in the
    `bin_variant_counts` global, groups the variants by bin_id (rank, bi-allelic,
    etc.), contig, snv, bi_allelic and singleton, aggregates the per-group
    statistics and writes the result to the aggregated quantile bin resource path.

    :param str model_id: Which data/run hash is being created
    :param bool overwrite: Should output files be overwritten if present
    :return: None
    """
    bin_ht = get_score_quantile_bins(model_id, aggregated=False).ht()

    # Count variants for ranking: one SNV/indel counter per bin annotation,
    # restricted to the rows where that bin annotation is defined.
    per_bin_counters = {}
    for field in bin_ht.row:
        if not field.endswith("bin"):
            continue
        bin_is_defined = hl.is_defined(bin_ht[field])
        variant_type = hl.cond(
            hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]), "snv", "indel"
        )
        per_bin_counters[field] = hl.agg.filter(
            bin_is_defined, hl.agg.counter(variant_type)
        )

    bin_variant_counts = bin_ht.aggregate(hl.struct(**per_bin_counters))
    logger.info(
        f"Found the following variant counts:\n {pformat(bin_variant_counts)}"
    )
    bin_ht = bin_ht.annotate_globals(bin_variant_counts=bin_variant_counts)

    family_stats_ht = fam_stats.ht()

    logger.info(f"Creating grouped bin table...")
    checkpoint = get_checkpoint_path(f"grouped_bin_{model_id}")
    grouped_ht = compute_grouped_binned_ht(bin_ht, checkpoint_path=checkpoint)

    logger.info(f"Aggregating grouped bin table...")
    aggregated_ht = grouped_ht.aggregate(
        **score_bin_agg(grouped_ht, fam_stats_ht=family_stats_ht)
    )

    # Persist the result at the aggregated resource location for this model.
    output_path = get_score_quantile_bins(model_id, aggregated=True).path
    aggregated_ht.write(output_path, overwrite=overwrite)