def get_site_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
) -> hl.expr.StructExpression:
    """
    Build a site-level info Struct by aggregating GVCF INFO fields stored in the MT entries.

    .. note::

        - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for
          the `MQ` calculation and then dropped according to GATK recommendation.
        - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ`
          calculation and then dropped according to GATK recommendation.
        - When the fields to aggregate (`sum_agg_fields`, `int32_sum_agg_fields`,
          `median_agg_fields`) are passed as a list of str, they should correspond
          to entry fields in `mt` or in `mt.gvcf_info`.
        - Entry fields in `mt` take priority over those in `mt.gvcf_info` in case
          of a name clash.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using array sum.
    :return: Expression containing the site-level info fields
    """
    dp_requested = "DP" in sum_agg_fields or "DP" in int32_sum_agg_fields
    if dp_requested:
        logger.warning(
            "`DP` was included in site-level aggregation. "
            "This requires a densifying prior to running get_site_info_expr"
        )

    site_aggs = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
    )

    # FS and SOR are derived from SB here rather than in _get_info_agg_expr
    # because the site-level and allele-specific versions behave differently.
    if "SB" in site_aggs:
        strand_bias = site_aggs["SB"]
        site_aggs.update(
            FS=fs_from_sb(strand_bias),
            SOR=sor_from_sb(strand_bias),
        )

    # Everything except DP is aggregated over non-ref genotypes only.
    is_non_ref = mt.LGT.is_non_ref()
    non_ref_aggs = dict(site_aggs)
    non_ref_aggs.pop("DP", None)
    info = hl.agg.filter(is_non_ref, hl.struct(**non_ref_aggs))

    if "DP" not in site_aggs:
        return info

    # DP is computed over both ref and non-ref genotypes, so it is attached
    # outside of the non-ref filter.
    return info.annotate(DP=site_aggs["DP"])
def get_as_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
    alt_alleles_range_array_field: str = "alt_alleles_range_array",
) -> hl.expr.StructExpression:
    """
    Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

    .. note::

        - If `SB` is specified in array_sum_agg_fields, it will be aggregated as
          `AS_SB_TABLE`, according to GATK standard nomenclature.
        - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for
          the `MQ` calculation and then dropped according to GATK recommendation.
        - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ`
          calculation and then dropped according to GATK recommendation.
        - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`,
          `median_agg_fields`) are passed as list of str, then they should
          correspond to entry fields in `mt` or in `mt.gvcf_info`.
        - Priority is given to entry fields in `mt` over those in `mt.gvcf_info`
          in case of a name clash.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using array sum.
    :param alt_alleles_range_array_field: Annotation containing an array of the
        range of alternate alleles e.g., `hl.range(1, hl.len(mt.alleles))`
    :return: Expression containing the AS info fields
    :raises ValueError: If `alt_alleles_range_array_field` is not a row field of
        `mt` with type `array<int32>`.
    """
    if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields):
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own trailing space.
        logger.warning(
            "`DP` was included in allele-specific aggregation, "
            "however `DP` is typically not aggregated by allele; `VarDP` is. "
            "Note that the resulting `AS_DP` field will NOT include reference genotypes."
        )

    agg_expr = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
        prefix="AS_",
    )

    # Rename AS_SB to AS_SB_TABLE if present
    if "AS_SB" in agg_expr:
        agg_expr["AS_SB_TABLE"] = agg_expr.pop("AS_SB")

    if alt_alleles_range_array_field not in mt.row or mt[
        alt_alleles_range_array_field
    ].dtype != hl.dtype("array<int32>"):
        msg = f"'get_as_info_expr' expected a row field '{alt_alleles_range_array_field}' of type array<int32>"
        logger.error(msg)
        raise ValueError(msg)

    # Modify aggregations to aggregate per allele: for every alt allele index
    # in the range array, only entries whose local alleles (LA) contain that
    # index contribute. The lambda is invoked while `array_agg` builds the
    # expression, so capturing the comprehension variable `expr` is safe here.
    agg_expr = {
        f: hl.agg.array_agg(
            lambda ai: hl.agg.filter(mt.LA.contains(ai), expr),
            mt[alt_alleles_range_array_field],
        )
        for f, expr in agg_expr.items()
    }

    # Run aggregations
    info = hl.struct(**agg_expr)

    # Add SB Ax2 aggregation logic and FS if SB is present
    if "AS_SB_TABLE" in info:
        # At this point AS_SB_TABLE holds one aggregated array per alt allele.
        # The first two values of each defined per-alt array are summed into a
        # single leading "ref" entry, and the remaining values of each array are
        # kept as that alt's entry.
        # NOTE(review): presumably SB is laid out as
        # [ref_fwd, ref_rev, alt_fwd, alt_rev] -- confirm against the GVCF spec.
        as_sb_table = hl.array(
            [
                info.AS_SB_TABLE.filter(lambda x: hl.is_defined(x)).fold(
                    lambda i, j: i[:2] + j[:2], [0, 0]
                )  # ref
            ]
        ).extend(
            info.AS_SB_TABLE.map(lambda x: x[2:])  # each alt
        )

        # FS / SOR are computed per alt from the ref entry extended with that
        # alt's entry.
        info = info.annotate(
            AS_SB_TABLE=as_sb_table,
            AS_FS=hl.range(1, hl.len(mt.alleles)).map(
                lambda i: fs_from_sb(as_sb_table[0].extend(as_sb_table[i]))
            ),
            AS_SOR=hl.range(1, hl.len(mt.alleles)).map(
                lambda i: sor_from_sb(as_sb_table[0].extend(as_sb_table[i]))
            ),
        )

    return info