Beispiel #1
0
 def _is_dnm(
     proband_gt: hl.expr.CallExpression,
     father_gt: hl.expr.CallExpression,
     mother_gt: hl.expr.CallExpression,
     locus: hl.expr.LocusExpression,
     proband_is_female: Optional[hl.expr.BooleanExpression],
 ) -> hl.expr.BooleanExpression:
     """
     Build the expression telling whether a trio genotype combination is a de novo mutation (DNM).

     Without a proband sex expression, only autosomal loci are evaluated (het proband, both
     parents hom-ref) and the result is missing everywhere else.

     With a proband sex expression:

         - In autosomes / PAR, or in non-PAR X for a female proband: het proband with both
           parents hom-ref.
         - At any other locus: hom-var proband with a hom-ref father when the proband is not
           female, missing otherwise.

     :param proband_gt: Proband genotype call
     :param father_gt: Father genotype call
     :param mother_gt: Mother genotype call
     :param locus: Locus of the variant
     :param proband_is_female: Whether the proband is female, or None if sex is unknown
     :return: Boolean expression (possibly missing) indicating a DNM
     """
     # Shared by both code paths: het proband, both parents hom-ref.
     het_with_hom_ref_parents = (
         proband_gt.is_het() & father_gt.is_hom_ref() & mother_gt.is_hom_ref()
     )

     if proband_is_female is None:
         logger.warning(
             "Since no proband sex expression was given to generate_trio_stats_expr, only DNMs in autosomes will be counted."
         )
         return hl.or_missing(locus.in_autosome(), het_with_hom_ref_parents)

     two_copy_site = locus.in_autosome_or_par() | (
         proband_is_female & locus.in_x_nonpar()
     )
     other_locus_dnm = hl.or_missing(
         ~proband_is_female,
         proband_gt.is_hom_var() & father_gt.is_hom_ref(),
     )
     # NOTE(review): newer Hail releases deprecate hl.cond in favour of
     # hl.if_else -- confirm the pinned Hail version before switching.
     return hl.cond(two_copy_site, het_with_hom_ref_parents, other_locus_dnm)
Beispiel #2
0
 def _get_copy_state(
     locus: hl.expr.LocusExpression,
 ) -> hl.expr.Int32Expression:
     """
     Map a locus to the copy-state int used to index into the trans_count_map.

     The first matching region wins: autosome / PAR, then non-PAR X, then non-PAR Y.
     The result is missing when the locus is in none of these regions.

     :param locus: Locus to classify
     :return: Copy-state value (`auto_or_par`, `hemi_x` or `hemi_y`, taken from the
         enclosing scope -- presumably int constants; confirm against their definition)
     """
     copy_state = hl.case()
     copy_state = copy_state.when(locus.in_autosome_or_par(), auto_or_par)
     copy_state = copy_state.when(locus.in_x_nonpar(), hemi_x)
     copy_state = copy_state.when(locus.in_y_nonpar(), hemi_y)
     return copy_state.or_missing()
Beispiel #3
0
def faf_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    locus: hl.expr.LocusExpression,
    pops_to_exclude: Optional[Set[str]] = None,
    faf_thresholds: List[float] = [0.95, 0.99],
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
    """
    Calculate the filtering allele frequency (FAF) for each threshold specified in `faf_thresholds`.

    See http://cardiodb.org/allelefrequencyapp/ for more information.

    The FAF is computed for each of the following population stratifications if found in `freq_meta`:

        - All samples, with adj criteria
        - Each population, with adj criteria
        - Each sex and each sex/population combination, with adj criteria; these entries are
          only defined on the non-PAR regions of sex chromosomes (they are missing on
          autosomes and PAR regions of sex chromosomes)

    Each FAF entry is a struct with one float64 field per threshold in `faf_thresholds`. The
    field name is ``faf`` followed by the threshold's digits after ``0.`` (e.g. 0.95 -> ``faf95``).

    This returns a tuple with two elements:

        1. An array of FAF expressions as described above
        2. A list of dicts containing the metadata for each of the array elements, in the same
           format as that produced by `annotate_freq`. It is evaluated eagerly with `hl.eval`.

    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
    :param locus: locus
    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations).
        None or an empty set excludes nothing.
    :param faf_thresholds: List of FAF thresholds to compute. The default list is only read, never mutated.
    :return: (FAF expression, FAF metadata)
    """
    # A Hail set expression is needed even when nothing is excluded: the filters
    # below call `.contains` on it, which a plain Python `{}` does not provide.
    # `hl.empty_set` also covers an empty Python set, whose element type cannot
    # be imputed by `hl.literal`.
    if pops_to_exclude:
        _pops_to_exclude = hl.literal(pops_to_exclude)
    else:
        _pops_to_exclude = hl.empty_set(hl.tstr)

    def _faf_struct(i):
        """Build the struct of FAF values (one field per threshold) for `freq[i]`."""
        return hl.struct(
            **{
                f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
                    freq[i].AC, freq[i].AN, threshold
                )
                for threshold in faf_thresholds
            }
        )

    # pylint: disable=invalid-unary-operand-type
    # Indices of the adj-only entry and of the adj per-population entries.
    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (
            (freq_meta[i].size() == 1)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )
    # Indices of the adj per-sex entries and of the adj per-sex/population entries.
    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (freq_meta[i].contains("sex"))
        & (
            (freq_meta[i].size() == 2)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group", "sex"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )

    # Sex-stratified entries are only defined outside autosomes and PAR regions.
    faf_array = faf_freq_indices.map(_faf_struct).extend(
        sex_faf_freq_indices.map(
            lambda i: hl.or_missing(~locus.in_autosome_or_par(), _faf_struct(i))
        )
    )

    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(lambda i: freq_meta[i])
    return faf_array, hl.eval(faf_meta)
Beispiel #4
0
def get_summary_counts_dict(
    locus_expr: hl.expr.LocusExpression,
    allele_expr: hl.expr.ArrayExpression,
    lof_expr: hl.expr.StringExpression,
    no_lof_flags_expr: hl.expr.BooleanExpression,
    most_severe_csq_expr: hl.expr.StringExpression,
    prefix_str: str = "",
) -> Dict[str, hl.expr.Int64Expression]:
    """
    Return a dictionary of aggregation expressions counting multiple variant categories.

    Categories are:
        - Number of variants
        - Number of indels
        - Number of SNVs
        - Number of LoF variants (LOFTEE annotation is defined)
        - Number of LoF variants that pass LOFTEE ("HC")
        - Number of LoF variants that pass LOFTEE without any flags
        - Number of LoF variants annotated as 'other splice' ("OS") by LOFTEE
        - Number of LoF variants that fail LOFTEE ("LC")
        - Number of missense variants
        - Number of synonymous variants
        - Number of autosomal variants (loci in autosomes or PAR)
        - Number of allosomal variants (loci in non-PAR X or non-PAR Y)

    .. warning::
        Assumes `allele_expr` contains only two alleles (multi-allelics have been split).

    :param locus_expr: LocusExpression.
    :param allele_expr: ArrayExpression containing alleles.
    :param lof_expr: StringExpression containing LOFTEE annotation.
    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has no flags.
    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
    :param prefix_str: Desired prefix string for category names. Default is empty str.
    :return: Dict of categories and counts per category.
    """
    logger.warning(
        "This function expects that multi-allelic variants have been split!")

    # Only the first two alleles are inspected (ref / alt of a split variant).
    ref_allele = allele_expr[0]
    alt_allele = allele_expr[1]

    passes_loftee = lof_expr == "HC"
    in_autosome_or_par = locus_expr.in_autosome_or_par()
    in_allosome_nonpar = locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar()

    return {
        f"{prefix_str}num_variants": hl.agg.count(),
        f"{prefix_str}indels": hl.agg.count_where(
            hl.is_indel(ref_allele, alt_allele)
        ),
        f"{prefix_str}snps": hl.agg.count_where(
            hl.is_snp(ref_allele, alt_allele)
        ),
        f"{prefix_str}LOF": hl.agg.count_where(hl.is_defined(lof_expr)),
        f"{prefix_str}pass_loftee": hl.agg.count_where(passes_loftee),
        f"{prefix_str}pass_loftee_no_flag": hl.agg.count_where(
            passes_loftee & no_lof_flags_expr
        ),
        f"{prefix_str}loftee_os": hl.agg.count_where(lof_expr == "OS"),
        f"{prefix_str}fail_loftee": hl.agg.count_where(lof_expr == "LC"),
        f"{prefix_str}num_missense": hl.agg.count_where(
            most_severe_csq_expr == "missense_variant"
        ),
        f"{prefix_str}num_synonymous": hl.agg.count_where(
            most_severe_csq_expr == "synonymous_variant"
        ),
        f"{prefix_str}num_autosomal_variants": hl.agg.filter(
            in_autosome_or_par, hl.agg.count()
        ),
        f"{prefix_str}num_allosomal_variants": hl.agg.filter(
            in_allosome_nonpar, hl.agg.count()
        ),
    }