Ejemplo n.º 1
0
def compute_stratified_metrics_filter(ht: hl.Table,
                                      qc_metrics: List[str],
                                      strata: List[str] = None) -> hl.Table:
    """
    Compute median, MAD, and upper and lower thresholds for each metric used in pop- and platform-specific outlier filtering

    :param MatrixTable ht: HT containing relevant sample QC metric annotations
    :param list qc_metrics: list of metrics for which to compute the critical values for filtering outliers
    :param list of str strata: List of annotations used for stratification. These metrics should be discrete types!
    :return: Table grouped by pop and platform, with upper and lower threshold values computed for each sample QC metric
    :rtype: Table
    """
    def make_pop_filters_expr(ht: hl.Table,
                              qc_metrics: List[str]) -> hl.expr.SetExpression:
        return hl.set(
            hl.filter(lambda x: hl.is_defined(x), [
                hl.or_missing(ht[f'fail_{metric}'], metric)
                for metric in qc_metrics
            ]))

    ht = ht.select(*strata,
                   **ht.sample_qc.select(*qc_metrics)).key_by('s').persist()

    def get_metric_expr(ht, metric):
        metric_values = hl.agg.collect(ht[metric])
        metric_median = hl.median(metric_values)
        metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
        return hl.struct(median=metric_median,
                         mad=metric_mad,
                         upper=metric_median +
                         4 * metric_mad if metric != 'callrate' else 1,
                         lower=metric_median -
                         4 * metric_mad if metric != 'callrate' else 0.99)

    agg_expr = hl.struct(
        **{metric: get_metric_expr(ht, metric)
           for metric in qc_metrics})
    if strata:
        ht = ht.annotate_globals(metrics_stats=ht.aggregate(
            hl.agg.group_by(hl.tuple([ht[x] for x in strata]), agg_expr)))
    else:
        ht = ht.annotate_globals(metrics_stats={(): ht.aggregate(agg_expr)})

    strata_exp = hl.tuple([ht[x] for x in strata]) if strata else hl.tuple([])

    fail_exprs = {
        f'fail_{metric}':
        (ht[metric] >= ht.metrics_stats[strata_exp][metric].upper) |
        (ht[metric] <= ht.metrics_stats[strata_exp][metric].lower)
        for metric in qc_metrics
    }
    ht = ht.transmute(**fail_exprs)
    pop_platform_filters = make_pop_filters_expr(ht, qc_metrics)
    return ht.annotate(pop_platform_filters=pop_platform_filters)
Ejemplo n.º 2
0
def annotate_relatedness(
    relatedness_ht: hl.Table,
    first_degree_kin_thresholds: float = (0.1767767, 0.4),
    second_degree_kin_cutoff: float = 0.1,
    ibd0_0_max: float = 0.05,
) -> hl.Table:
    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
            first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
            second_degree_min_kin=second_degree_kin_cutoff,
            ibd0_0_max=ibd0_0_max,
        )
    )
    relatedness_ht = relatedness_ht.annotate_globals(
        min_individual_maf=0.01,
        min_emission_kinship=0.05,
        ibd0_0_max=ibd0_0_max,
        second_degree_kin_cutoff=second_degree_kin_cutoff,
        first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
    )
    return relatedness_ht
Ejemplo n.º 3
0
def generate_final_rf_ht(
    ht: hl.Table,
    snp_cutoff: Union[int, float],
    indel_cutoff: Union[int, float],
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    determine_cutoff_from_bin: bool = False,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[hl.expr.Int32Expression] = None,
) -> hl.Table:
    """
    Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering.
    If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF
    probabilities to use as cutoffs from an aggregated quantile bin Table like one created by
    `compute_grouped_binned_ht` in combination with `score_bin_agg`.
    :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table
    :param ac0_filter_expr: Expression that indicates if a variant should be filtered as allele count 0 (AC0)
    :param ts_ac_filter_expr: Expression in `ht` that indicates if a variant is a transmitted singleton
    :param mono_allelic_fiter_expr: Expression indicating if a variant is mono-allelic
    :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter
    :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht`
    :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :return: Finalized random forest Table annotated with variant filters
    """
    # Determine SNP and indel RF cutoffs if given bin instead of RF probability

    snp_cutoff_global = hl.struct(min_score=snp_cutoff)
    indel_cutoff_global = hl.struct(min_score=indel_cutoff)

    # Add filters to RF HT
    filters = dict()

    if ht.any(hl.is_missing(ht.rf_probability["TP"])):
        raise ValueError("Missing RF probability!")

    filters["RF"] = (
        hl.is_snp(ht.alleles[0], ht.alleles[1])
        & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | (
            ~hl.is_snp(ht.alleles[0], ht.alleles[1])
            & (ht.rf_probability["TP"] < indel_cutoff_global.min_score))

    # Fix annotations for release
    annotations_expr = {
        "rf_positive_label": hl.or_else(ht.tp, False),
        "rf_negative_label": ht.fail_hard_filters,
        "rf_probability": ht.rf_probability["TP"],
    }

    ht = ht.transmute(filters=add_filters_expr(filters=filters),
                      **annotations_expr)

    ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global,
                             rf_indel_cutoff=indel_cutoff_global)

    return ht
Ejemplo n.º 4
0
def filter_ht_for_plink(ht: hl.Table,
                        n_samples: int,
                        min_call_rate: float = 0.95,
                        variants_per_mac_category: int = 2000,
                        variants_per_maf_category: int = 10000):
    from gnomad.utils.filtering import filter_to_autosomes
    ht = filter_to_autosomes(ht)
    ht = ht.filter((ht.call_stats.AN >= n_samples * 2 * min_call_rate)
                   & (ht.call_stats.AC > 0))
    ht = ht.annotate(mac_category=mac_category_case_builder(ht.call_stats))
    category_counter = ht.aggregate(hl.agg.counter(ht.mac_category))
    print(category_counter)
    ht = ht.annotate_globals(category_counter=category_counter)
    return ht.filter(
        hl.rand_unif(
            0, 1) < hl.cond(ht.mac_category >= 1, variants_per_mac_category,
                            variants_per_maf_category) /
        ht.category_counter[ht.mac_category])
def create_grouped_bin_ht(ht: hl.Table,
                          model_id: str,
                          overwrite: bool = False) -> None:
    """
    Creates binned data from a quantile bin annotated Table grouped by bin_id (rank, bi-allelic, etc.), contig, snv,
    bi_allelic and singleton containing the information needed for evaluation plots.
    :param str model_id: Which data/run hash is being created
    :param bool overwrite: Should output files be overwritten if present
    :return: None
    :rtype: None
    """
    trio_stats_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), "snv",
                        "indel")),
        )
        for x in ht.row if x.endswith("bin")
    }
    bin_variant_counts = ht.aggregate(hl.struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(bin_variant_counts)}")
    ht = ht.annotate_globals(bin_variant_counts=bin_variant_counts)

    logger.info(f"Creating grouped bin table...")
    grouped_binned_ht = compute_grouped_binned_ht(
        ht,
        checkpoint_path=(f'{tmp_dir}/ddd-elgh-ukbb/{model_id}_grouped_bin.ht'),
    )

    logger.info(f"Aggregating grouped bin table...")
    agg_ht = grouped_binned_ht.aggregate(
        **score_bin_agg(grouped_binned_ht, fam_stats_ht=trio_stats_ht))

    return agg_ht
Ejemplo n.º 6
0
def liftover_intervals(t: hl.Table,
                       keep_missing_interval: bool = False) -> hl.Table:
    """
    Liftover locus in intervals from one coordinate system (hg37) to another (hg38)

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------


    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted) intervals in the output Table.
    :return: Table with intervals lifted over GRCh38 added.
    """

    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz',
            rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )

    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()

    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # bad intervals
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    logger.info(
        f"Number of missing intervals: {missing[True]} out of {t.count()}...")

    # update globals annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {missing[True]} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {x: hl.agg.filter(hl.is_defined(ht[x]), hl.agg.counter(hl.cond(hl.is_snp(
        ht.alleles[0], ht.alleles[1]), 'snv', 'indel'))) for x in ht.row if x.endswith('rank')}
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length()-ht.alleles[1].length()),
        rank_bins=hl.array(
            [hl.Struct(
                rank_id=rank_name,
                bin=hl.int(hl.ceil(hl.float(ht[rank_name] + 1) / hl.floor(ht.globals.rank_variant_counts[rank_name][hl.cond(
                    hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins)))
            )
                for rank_name in rank_variant_counts]
        ),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(
        rank_id=ht.rank_bins.rank_id,
        bin=ht.rank_bins.bin
    )
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(
                ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin
        )._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(
                ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(
                ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.sum(
                ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.count_where(
                ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) & (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
Ejemplo n.º 8
0
def get_summary_counts(
    ht: hl.Table,
    freq_field: str = "freq",
    filter_field: str = "filters",
    filter_decoy: bool = False,
    index: int = 0,
) -> hl.Table:
    """
    Generate a struct with summary counts across variant categories.

    Summary counts:
        - Number of variants
        - Number of indels
        - Number of SNVs
        - Number of LoF variants
        - Number of LoF variants that pass LOFTEE (including with LoF flags)
        - Number of LoF variants that pass LOFTEE without LoF flags
        - Number of OS (other splice) variants annotated by LOFTEE
        - Number of LoF variants that fail LOFTEE filters

    Also annotates Table's globals with total variant counts.

    Before calculating summary counts, function:
        - Filters out low confidence regions
        - Filters to canonical transcripts
        - Uses the most severe consequence

    Assumes that:
        - Input HT is annotated with VEP.
        - Multiallelic variants have been split and/or input HT contains bi-allelic variants only.
        - freq_expr was calculated with `annotate_freq`.
        - (Frequency index 0 from `annotate_freq` is frequency for all pops calculated on adj genotypes only.)

    :param ht: Input Table.
    :param freq_field: Name of field in HT containing frequency annotation (array of structs). Default is "freq".
    :param filter_field: Name of field in HT containing variant filter information. Default is "filters".
    :param filter_decoy: Whether to filter decoy regions. Default is False.
    :param index: Which index of freq_expr to use for annotation. Default is 0.
    :return: Table grouped by frequency bin and aggregated across summary count categories.
    """
    logger.info("Checking if multi-allelic variants have been split...")
    max_alleles = ht.aggregate(hl.agg.max(hl.len(ht.alleles)))
    if max_alleles > 2:
        logger.info(
            "Splitting multi-allelics and VEP transcript consequences...")
        ht = hl.split_multi_hts(ht)

    logger.info("Filtering to PASS variants in high confidence regions...")
    ht = ht.filter((hl.len(ht[filter_field]) == 0))
    ht = filter_low_conf_regions(ht, filter_decoy=filter_decoy)

    logger.info(
        "Filtering to canonical transcripts and getting VEP summary annotations..."
    )
    ht = filter_vep_to_canonical_transcripts(ht)
    ht = get_most_severe_consequence_for_summary(ht)

    logger.info("Annotating with frequency bin information...")
    ht = ht.annotate(freq_bin=freq_bin_expr(ht[freq_field], index))

    logger.info(
        "Annotating HT globals with total counts/total allele counts per variant category..."
    )
    summary_counts = ht.aggregate(
        hl.struct(**get_summary_counts_dict(
            ht.locus,
            ht.alleles,
            ht.lof,
            ht.no_lof_flags,
            ht.most_severe_csq,
            prefix_str="total_",
        )))
    summary_ac_counts = ht.aggregate(
        hl.struct(**get_summary_ac_dict(
            ht[freq_field][index].AC,
            ht.lof,
            ht.no_lof_flags,
            ht.most_severe_csq,
        )))
    ht = ht.annotate_globals(summary_counts=summary_counts.annotate(
        **summary_ac_counts))
    return ht.group_by("freq_bin").aggregate(**get_summary_counts_dict(
        ht.locus,
        ht.alleles,
        ht.lof,
        ht.no_lof_flags,
        ht.most_severe_csq,
    ))
Ejemplo n.º 9
0
def generate_final_rf_ht(
    ht: hl.Table,
    ac0_filter_expr: hl.expr.BooleanExpression,
    ts_ac_filter_expr: hl.expr.BooleanExpression,
    mono_allelic_fiter_expr: hl.expr.BooleanExpression,
    snp_cutoff: Union[int, float],
    indel_cutoff: Union[int, float],
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    determine_cutoff_from_bin: bool = False,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[hl.expr.Int32Expression] = None,
) -> hl.Table:
    """
    Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering.
    If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF
    probabilities to use as cutoffs from an aggregated quantile bin Table like one created by
    `compute_grouped_binned_ht` in combination with `score_bin_agg`.
    :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table
    :param ac0_filter_expr: Expression that indicates if a variant should be filtered as allele count 0 (AC0)
    :param ts_ac_filter_expr: Expression in `ht` that indicates if a variant is a transmitted singleton
    :param mono_allelic_fiter_expr: Expression indicating if a variant is mono-allelic
    :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter
    :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht`
    :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :return: Finalized random forest Table annotated with variant filters
    """
    # Determine SNP and indel RF cutoffs if given bin instead of RF probability
    if determine_cutoff_from_bin:
        snp_rf_cutoff, indel_rf_cutoff = aggregated_bin_ht.aggregate([
            hl.agg.filter(
                snv
                & (aggregated_bin_ht.bin_id == bin_id)
                & (aggregated_bin_ht.bin == cutoff),
                hl.agg.min(aggregated_bin_ht.min_score),
            ) for snv, cutoff in [
                (aggregated_bin_ht.snv, snp_cutoff),
                (~aggregated_bin_ht.snv, indel_cutoff),
            ]
        ])
        snp_cutoff_global = hl.struct(bin=snp_cutoff, min_score=snp_rf_cutoff)
        indel_cutoff_global = hl.struct(bin=indel_cutoff,
                                        min_score=indel_rf_cutoff)

        logger.info(
            f"Using a SNP RF probability cutoff of {snp_rf_cutoff} and an indel RF probability cutoff of {indel_rf_cutoff}."
        )
    else:
        snp_cutoff_global = hl.struct(min_score=snp_cutoff)
        indel_cutoff_global = hl.struct(min_score=indel_cutoff)

    # Add filters to RF HT
    filters = dict()

    if ht.any(hl.is_missing(ht.rf_probability["TP"])):
        raise ValueError("Missing RF probability!")

    filters["RF"] = (
        hl.is_snp(ht.alleles[0], ht.alleles[1])
        & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | (
            ~hl.is_snp(ht.alleles[0], ht.alleles[1])
            & (ht.rf_probability["TP"] < indel_cutoff_global.min_score))

    filters["InbreedingCoeff"] = hl.or_else(
        ht.InbreedingCoeff < inbreeding_coeff_cutoff, False)
    filters["AC0"] = ac0_filter_expr
    filters[
        "MonoAllelic"] = mono_allelic_fiter_expr  # TODO: Do others agree that we should add this to gnomAD like we did for UKBB?

    # Fix annotations for release
    annotations_expr = {
        "rf_positive_label":
        hl.or_else(ht.tp, False),
        "rf_negative_label":
        ht.fail_hard_filters,
        "transmitted_singleton":
        hl.or_missing(ts_ac_filter_expr, ht.transmitted_singleton),
        "rf_probability":
        ht.rf_probability["TP"],
    }
    if "feature_imputed" in ht.row:
        annotations_expr.update({
            x: hl.or_missing(~ht.feature_imputed[x], ht[x])
            for x in [f for f in ht.row.feature_imputed]
        })

    ht = ht.transmute(filters=add_filters_expr(filters=filters),
                      **annotations_expr)

    ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global,
                             rf_indel_cutoff=indel_cutoff_global)

    return ht
def create_binned_data(ht: hl.Table, data: str, data_type: str,
                       n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton
    containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                        'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}"
    )
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                                'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1)
                & (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
Ejemplo n.º 11
0
def compute_stratified_metrics_filter(
    ht: hl.Table,
    qc_metrics: Dict[str, hl.expr.NumericExpression],
    strata: Optional[Dict[str, hl.expr.Expression]] = None,
    lower_threshold: float = 4.0,
    upper_threshold: float = 4.0,
    metric_threshold: Optional[Dict[str, Tuple[float, float]]] = None,
    filter_name: str = "qc_metrics_filters",
) -> hl.Table:
    """
    Compute median, MAD, and upper and lower thresholds for each metric used in outlier filtering.

    :param ht: HT containing relevant sample QC metric annotations
    :param qc_metrics: list of metrics (name and expr) for which to compute the critical values for filtering outliers
    :param strata: List of annotations used for stratification. These metrics should be discrete types!
    :param lower_threshold: Lower MAD threshold
    :param upper_threshold: Upper MAD threshold
    :param metric_threshold: Can be used to specify different (lower, upper) thresholds for one or more metrics
    :param filter_name: Name of resulting filters annotation
    :return: Table grouped by strata, with upper and lower threshold values computed for each sample QC metric
    """
    _metric_threshold = {
        metric: (lower_threshold, upper_threshold)
        for metric in qc_metrics
    }
    if metric_threshold is not None:
        _metric_threshold.update(metric_threshold)

    def make_filters_expr(ht: hl.Table,
                          qc_metrics: Iterable[str]) -> hl.expr.SetExpression:
        return hl.set(
            hl.filter(
                lambda x: hl.is_defined(x),
                [
                    hl.or_missing(ht[f"fail_{metric}"], metric)
                    for metric in qc_metrics
                ],
            ))

    if strata is None:
        strata = {}

    ht = ht.select(**qc_metrics, **strata).key_by("s").persist()

    agg_expr = hl.struct(
        **{
            metric: hl.bind(
                lambda x: x.annotate(
                    lower=x.median - _metric_threshold[metric][0] * x.mad,
                    upper=x.median + _metric_threshold[metric][1] * x.mad,
                ),
                get_median_and_mad_expr(ht[metric]),
            )
            for metric in qc_metrics
        })

    if strata:
        ht = ht.annotate_globals(qc_metrics_stats=ht.aggregate(
            hl.agg.group_by(hl.tuple([ht[x] for x in strata]), agg_expr),
            _localize=False,
        ))
        metrics_stats_expr = ht.qc_metrics_stats[hl.tuple(
            [ht[x] for x in strata])]
    else:
        ht = ht.annotate_globals(
            qc_metrics_stats=ht.aggregate(agg_expr, _localize=False))
        metrics_stats_expr = ht.qc_metrics_stats

    fail_exprs = {
        f"fail_{metric}": (ht[metric] <= metrics_stats_expr[metric].lower)
        | (ht[metric] >= metrics_stats_expr[metric].upper)
        for metric in qc_metrics
    }
    ht = ht.transmute(**fail_exprs)
    stratified_filters = make_filters_expr(ht, qc_metrics)
    return ht.annotate(**{filter_name: stratified_filters})
Ejemplo n.º 12
0
def median_impute_features(
        ht: hl.Table,
        strata: Optional[Dict[str, hl.expr.Expression]] = None) -> hl.Table:
    """
    Numerical features in the Table are median-imputed by Hail's `approx_median`.

    If a `strata` dict is given, imputation is done based on the median of of each stratification.

    The annotations that are added to the Table are
        - feature_imputed - A row annotation indicating if each numerical feature was imputed or not.
        - features_median - A global annotation containing the median of the numerical features. If `strata` is given,
          this struct will also be broken down by the given strata.
        - variants_by_strata - An additional global annotation with the variant counts by strata that will only be
          added if imputing by a given `strata`.

    :param ht: Table containing all samples and features for median imputation.
    :param strata: Whether to impute features median by specific strata (default False).
    :return: Feature Table imputed using approximate median values.
    """

    logger.info(
        "Computing feature medians for imputation of missing numeric values")
    numerical_features = [
        k for k, v in ht.row.dtype.items() if v == hl.tint or v == hl.tfloat
    ]

    median_agg_expr = hl.struct(
        **{
            feature: hl.agg.approx_median(ht[feature])
            for feature in numerical_features
        })

    if strata:
        ht = ht.annotate_globals(
            feature_medians=ht.aggregate(
                hl.agg.group_by(hl.tuple([ht[x] for x in strata]),
                                median_agg_expr),
                _localize=False,
            ),
            variants_by_strata=ht.aggregate(hl.agg.counter(
                hl.tuple([ht[x] for x in strata])),
                                            _localize=False),
        )
        feature_median_expr = ht.feature_medians[hl.tuple(
            [ht[x] for x in strata])]
        logger.info("Variant count by strata:\n{}".format("\n".join([
            "{}: {}".format(k, v)
            for k, v in hl.eval(ht.variants_by_strata).items()
        ])))

    else:
        ht = ht.annotate_globals(
            feature_medians=ht.aggregate(median_agg_expr, _localize=False))
        feature_median_expr = ht.feature_medians

    ht = ht.annotate(
        **{
            f: hl.or_else(ht[f], feature_median_expr[f])
            for f in numerical_features
        },
        feature_imputed=hl.struct(
            **{f: hl.is_missing(ht[f])
               for f in numerical_features}),
    )

    return ht
Ejemplo n.º 13
0
def generate_final_filter_ht(
    ht: hl.Table,
    model_name: str,
    score_name: str,
    ac0_filter_expr: hl.expr.BooleanExpression,
    ts_ac_filter_expr: hl.expr.BooleanExpression,
    mono_allelic_flag_expr: hl.expr.BooleanExpression,
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    snp_bin_cutoff: int = None,
    indel_bin_cutoff: int = None,
    snp_score_cutoff: float = None,
    indel_score_cutoff: float = None,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[str] = None,
    vqsr_ht: hl.Table = None,
) -> hl.Table:
    """
    Prepares finalized filtering model given a filtering HT from `rf.apply_rf_model` or VQSR and cutoffs for filtering.

    .. note::

        - `snp_bin_cutoff` and `snp_score_cutoff` are mutually exclusive, and one must be supplied.
        - `indel_bin_cutoff` and `indel_score_cutoff` are mutually exclusive, and one must be supplied.
        - If a `snp_bin_cutoff` or `indel_bin_cutoff` cutoff is supplied then an `aggregated_bin_ht` and `bin_id` must
          also be supplied to determine the SNP and indel scores to use as cutoffs from an aggregated bin Table like
          one created by `compute_grouped_binned_ht` in combination with `score_bin_agg`.

    :param ht: Filtering Table from `rf.apply_rf_model` or VQSR to prepare as the final filter Table
    :param model_name: Filtering model name to use in the 'filters' field (VQSR or RF)
    :param score_name: Name to use for the filtering score annotation. This will be used in place of 'score' in the
        release HT info struct and the INFO field of the VCF (e.g. RF or AS_VQSLOD)
    :param ac0_filter_expr: Expression that indicates if a variant should be filtered as allele count 0 (AC0)
    :param ts_ac_filter_expr: Allele count expression in `ht` to use as a filter for determining a transmitted singleton
    :param mono_allelic_flag_expr: Expression indicating if a variant is mono-allelic
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param snp_bin_cutoff: Bin cutoff to use for SNP variant QC filter. Can't be used with `snp_score_cutoff`
    :param indel_bin_cutoff: Bin cutoff to use for indel variant QC filter. Can't be used with `indel_score_cutoff`
    :param snp_score_cutoff: Score cutoff (e.g. RF probability or AS_VQSLOD) to use for SNP variant QC filter. Can't be used with `snp_bin_cutoff`
    :param indel_score_cutoff: Score cutoff (e.g. RF probability or AS_VQSLOD) to use for indel variant QC filter. Can't be used with `indel_bin_cutoff`
    :param aggregated_bin_ht: Table with aggregate counts of variants based on bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :param vqsr_ht: If a VQSR HT is supplied a 'vqsr' annotation containing AS_VQSLOD, AS_culprit, NEGATIVE_TRAIN_SITE,
        and POSITIVE_TRAIN_SITE will be included in the returned Table
    :return: Finalized random forest Table annotated with variant filters
    """
    if snp_bin_cutoff is not None and snp_score_cutoff is not None:
        raise ValueError(
            "snp_bin_cutoff and snp_score_cutoff are mutually exclusive, please only supply one SNP filtering cutoff."
        )

    if indel_bin_cutoff is not None and indel_score_cutoff is not None:
        raise ValueError(
            "indel_bin_cutoff and indel_score_cutoff are mutually exclusive, please only supply one indel filtering cutoff."
        )

    if snp_bin_cutoff is None and snp_score_cutoff is None:
        raise ValueError(
            "One (and only one) of the parameters snp_bin_cutoff and snp_score_cutoff must be supplied."
        )

    if indel_bin_cutoff is None and indel_score_cutoff is None:
        raise ValueError(
            "One (and only one) of the parameters indel_bin_cutoff and indel_score_cutoff must be supplied."
        )

    if (snp_bin_cutoff is not None or indel_bin_cutoff
            is not None) and (aggregated_bin_ht is None or bin_id is None):
        raise ValueError(
            "If using snp_bin_cutoff or indel_bin_cutoff, both aggregated_bin_ht and bin_id must be supplied"
        )

    # Determine SNP and indel score cutoffs if given bin instead of score
    if snp_bin_cutoff:
        snp_score_cutoff = aggregated_bin_ht.aggregate(
            hl.agg.filter(
                aggregated_bin_ht.snv
                & (aggregated_bin_ht.bin_id == bin_id)
                & (aggregated_bin_ht.bin == snp_bin_cutoff),
                hl.agg.min(aggregated_bin_ht.min_score),
            ))
        snp_cutoff_global = hl.struct(bin=snp_bin_cutoff,
                                      min_score=snp_score_cutoff)

    if indel_bin_cutoff:
        indel_score_cutoff = aggregated_bin_ht.aggregate(
            hl.agg.filter(
                ~aggregated_bin_ht.snv
                & (aggregated_bin_ht.bin_id == bin_id)
                & (aggregated_bin_ht.bin == indel_bin_cutoff),
                hl.agg.min(aggregated_bin_ht.min_score),
            ))
        indel_cutoff_global = hl.struct(bin=indel_bin_cutoff,
                                        min_score=indel_score_cutoff)

    min_score = ht.aggregate(hl.agg.min(ht.score))
    max_score = ht.aggregate(hl.agg.max(ht.score))

    if snp_score_cutoff:
        if snp_score_cutoff < min_score or snp_score_cutoff > max_score:
            raise ValueError(
                "snp_score_cutoff is not within the range of score.")
        snp_cutoff_global = hl.struct(min_score=snp_score_cutoff)

    if indel_score_cutoff:
        if indel_score_cutoff < min_score or indel_score_cutoff > max_score:
            raise ValueError(
                "indel_score_cutoff is not within the range of score.")
        indel_cutoff_global = hl.struct(min_score=indel_score_cutoff)

    logger.info(
        f"Using a SNP score cutoff of {snp_score_cutoff} and an indel score cutoff of {indel_score_cutoff}."
    )

    # Add filters to HT
    filters = dict()

    if ht.any(hl.is_missing(ht.score)):
        ht.filter(hl.is_missing(ht.score)).show()
        raise ValueError("Missing Score!")

    filters[model_name] = (hl.is_missing(ht.score)
                           | (hl.is_snp(ht.alleles[0], ht.alleles[1])
                              & (ht.score < snp_cutoff_global.min_score))
                           | (~hl.is_snp(ht.alleles[0], ht.alleles[1])
                              & (ht.score < indel_cutoff_global.min_score)))

    filters["InbreedingCoeff"] = hl.or_else(
        ht.InbreedingCoeff < inbreeding_coeff_cutoff, False)
    filters["AC0"] = ac0_filter_expr

    annotations_expr = dict()
    if model_name == "RF":
        # Fix annotations for release
        annotations_expr = annotations_expr.update({
            "positive_train_site":
            hl.or_else(ht.positive_train_site, False),
            "rf_tp_probability":
            ht.rf_probability["TP"],
        })
    annotations_expr.update({
        "transmitted_singleton":
        hl.or_missing(ts_ac_filter_expr, ht.transmitted_singleton)
    })
    if "feature_imputed" in ht.row:
        annotations_expr.update({
            x: hl.or_missing(~ht.feature_imputed[x], ht[x])
            for x in [f for f in ht.row.feature_imputed]
        })

    ht = ht.transmute(
        filters=add_filters_expr(filters=filters),
        monoallelic=mono_allelic_flag_expr,
        **{score_name: ht.score},
        **annotations_expr,
    )

    bin_names = [x for x in ht.row if x.endswith("bin")]
    bin_names = [(
        x,
        x.split("adj_")[0] +
        x.split("adj_")[1] if len(x.split("adj_")) == 2 else "raw_" + x,
    ) for x in bin_names]
    ht = ht.transmute(**{j: ht[i] for i, j in bin_names})

    ht = ht.annotate_globals(
        bin_stats=hl.struct(**{j: ht.bin_stats[i]
                               for i, j in bin_names}),
        filtering_model=hl.struct(
            model_name=model_name,
            score_name=score_name,
            snv_cutoff=snp_cutoff_global,
            indel_cutoff=indel_cutoff_global,
        ),
        inbreeding_coeff_cutoff=inbreeding_coeff_cutoff,
    )
    if vqsr_ht:
        vqsr = vqsr_ht[ht.key]
        ht = ht.annotate(
            vqsr=hl.struct(
                AS_VQSLOD=vqsr.info.AS_VQSLOD,
                AS_culprit=vqsr.info.AS_culprit,
                NEGATIVE_TRAIN_SITE=vqsr.info.NEGATIVE_TRAIN_SITE,
                POSITIVE_TRAIN_SITE=vqsr.info.POSITIVE_TRAIN_SITE,
            ),
            SOR=vqsr.info.
            SOR,  # NOTE: This was required for v3.1, we now compute this in `get_site_info_expr`
        )

    ht = ht.drop("AS_culprit")

    return ht