Example #1
0
def get_site_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
) -> hl.expr.StructExpression:
    """
    Build a site-level Struct of typical VCF INFO fields by aggregating the GVCF INFO fields held in the MT entries.

    Every aggregation except `DP` is restricted to non-ref genotypes
    (`mt.LGT.is_non_ref()`); `DP`, when requested, is annotated outside of that
    filter so that it covers both ref and non-ref genotypes. When an `SB`
    aggregation is present, `FS` and `SOR` are derived from it with
    `fs_from_sb` and `sor_from_sb`.

    .. note::

        - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
        - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
        - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as
          list of str, then they should correspond to entry fields in `mt` or in `mt.gvcf_info`.
        - Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using array sum.
    :return: Expression containing the site-level info fields
    """
    # Only the two scalar-sum field collections can carry `DP`.
    if "DP" in [*sum_agg_fields, *int32_sum_agg_fields]:
        logger.warning(
            "`DP` was included in site-level aggregation. This requires a densifying prior to running get_site_info_expr"
        )

    site_aggs = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
    )

    # FS and SOR are derived here rather than in _get_info_agg_expr because
    # the site-level and allele-specific versions treat SB differently.
    if "SB" in site_aggs:
        sb_expr = site_aggs["SB"]
        site_aggs["FS"] = fs_from_sb(sb_expr)
        site_aggs["SOR"] = sor_from_sb(sb_expr)

    # Everything but DP is aggregated over non-ref genotypes only.
    non_ref_aggs = {}
    for name, expr in site_aggs.items():
        if name != "DP":
            non_ref_aggs[name] = expr
    site_info = hl.agg.filter(mt.LGT.is_non_ref(), hl.struct(**non_ref_aggs))

    if "DP" not in site_aggs:
        return site_info

    # DP is attached outside of the non-ref filter so it spans ref and
    # non-ref genotypes alike.
    return site_info.annotate(DP=site_aggs["DP"])
Example #2
0
def get_as_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[List[str], Dict[
        str, hl.expr.NumericExpression]] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[List[str], Dict[
        str, hl.expr.NumericExpression]] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[List[str], Dict[
        str, hl.expr.NumericExpression]] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[List[str], Dict[
        str, hl.expr.ArrayNumericExpression]] = INFO_ARRAY_SUM_AGG_FIELDS,
    alt_alleles_range_array_field: str = "alt_alleles_range_array",
) -> hl.expr.StructExpression:
    """
    Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

    Each aggregation is run once per element of `mt[alt_alleles_range_array_field]`,
    restricted to the entries whose `LA` contains that allele index.

    .. note::

        - If `SB` is specified in array_sum_agg_fields, it will be aggregated as `AS_SB_TABLE`, according to GATK standard nomenclature.
        - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
        - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
        - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as list of str,
          then they should correspond to entry fields in `mt` or in `mt.gvcf_info`.
        - Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using array sum.
    :param alt_alleles_range_array_field: Annotation containing an array of the range of alternate alleles e.g., `hl.range(1, hl.len(mt.alleles))`
    :return: Expression containing the AS info fields
    :raises ValueError: If `alt_alleles_range_array_field` is not a row field of `mt` with type `array<int32>`.
    """
    if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields):
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own trailing space.
        logger.warning(
            "`DP` was included in allele-specific aggregation, "
            "however `DP` is typically not aggregated by allele; `VarDP` is. "
            "Note that the resulting `AS_DP` field will NOT include reference genotypes."
        )

    agg_expr = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
        prefix="AS_",
    )

    # Rename AS_SB to AS_SB_TABLE if present
    if "AS_SB" in agg_expr:
        agg_expr["AS_SB_TABLE"] = agg_expr.pop("AS_SB")

    # The per-allele aggregation below iterates over this row field, so it
    # must exist and have the expected type.
    if alt_alleles_range_array_field not in mt.row or mt[
            alt_alleles_range_array_field].dtype != hl.dtype("array<int32>"):
        msg = f"'get_as_info_expr' expected a row field '{alt_alleles_range_array_field}' of type array<int32>"
        logger.error(msg)
        raise ValueError(msg)

    # Modify aggregations to aggregate per allele: for each allele index, only
    # entries whose local-allele array (`LA`) contains it contribute.
    agg_expr = {
        f: hl.agg.array_agg(
            lambda ai: hl.agg.filter(mt.LA.contains(ai), expr),
            mt[alt_alleles_range_array_field],
        )
        for f, expr in agg_expr.items()
    }

    # Run aggregations
    info = hl.struct(**agg_expr)

    # Add SB Ax2 aggregation logic and FS if SB is present
    if "AS_SB_TABLE" in info:
        # First element: the first two SB values summed over all defined
        # per-allele aggregates (ref). Remaining elements: the last SB values
        # of each per-allele aggregate (one per alt).
        as_sb_table = hl.array([
            info.AS_SB_TABLE.filter(lambda x: hl.is_defined(x)).fold(
                lambda i, j: i[:2] + j[:2], [0, 0])  # ref
        ]).extend(
            info.AS_SB_TABLE.map(lambda x: x[2:])  # each alt
        )
        # FS and SOR are computed per alt allele from the ref counts
        # concatenated with that allele's counts.
        info = info.annotate(
            AS_SB_TABLE=as_sb_table,
            AS_FS=hl.range(1, hl.len(mt.alleles)).map(
                lambda i: fs_from_sb(as_sb_table[0].extend(as_sb_table[i]))),
            AS_SOR=hl.range(1, hl.len(mt.alleles)).map(
                lambda i: sor_from_sb(as_sb_table[0].extend(as_sb_table[i]))),
        )

    return info