def get_site_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
) -> hl.expr.StructExpression:
    """
    Create a site-level annotation Struct aggregating typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

    Notes:

    1. If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    2. If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    3. If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as list of str, then they should correspond to entry fields in `mt` or in `mt.gvcf_info`.
       Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.
    4. If `SB` is among the aggregated fields, an `FS` field is computed from it with `fs_from_sb`. When `SB` is absent, no `FS` field is added.
    5. All fields except `DP` are aggregated over non-ref genotypes only (`mt.LGT.is_non_ref()`). `DP`, when present, is aggregated over all genotypes.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Array fields to aggregate using sum (forwarded unchanged to `_get_info_agg_expr`).
    :return: Expression containing the site-level info fields
    """
    if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields):
        logger.warning(
            "`DP` was included in site-level aggregation. "
            "This requires a densifying prior to running get_site_info_expr"
        )

    agg_expr = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
    )

    # Add FS if SB is present
    # This is done outside of _get_info_agg_expr as the behavior is different in site vs allele-specific versions
    # The guard avoids a KeyError when the caller did not request `SB`.
    if "SB" in agg_expr:
        agg_expr["FS"] = fs_from_sb(agg_expr["SB"])

    # Run aggregator on non-ref genotypes
    # `DP` is held back here because it must not be restricted to non-ref genotypes.
    info = hl.agg.filter(
        mt.LGT.is_non_ref(),
        hl.struct(**{k: v for k, v in agg_expr.items() if k != "DP"}),
    )

    # Add DP, computed over both ref and non-ref genotypes, if present
    if "DP" in agg_expr:
        info = info.annotate(DP=agg_expr["DP"])

    return info
def get_as_info_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
) -> hl.expr.StructExpression:
    """
    Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

    Notes:

    1. If `SB` is specified in array_sum_agg_fields, it will be aggregated as `AS_SB_TABLE`, according to GATK standard nomenclature.
       In that case an `AS_FS` field is also added, computed per alternate allele with `fs_from_sb`.
    2. If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    3. If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    4. If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as list of str, then they should correspond to entry fields in `mt` or in `mt.gvcf_info`.
       Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.
    5. Every field is aggregated once per alternate allele index, using only the entries whose `LA` contains that index.

    :param mt: Input Matrix Table
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Array fields to aggregate using sum (forwarded unchanged to `_get_info_agg_expr`).
    :return: Expression containing the AS info fields
    """
    if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields):
        logger.warning(
            "`DP` was included in allele-specific aggregation, "
            "however `DP` is typically not aggregated by allele; `VarDP` is. "
            "Note that the resulting `AS_DP` field will NOT include reference genotypes."
        )

    agg_expr = _get_info_agg_expr(
        mt=mt,
        sum_agg_fields=sum_agg_fields,
        int32_sum_agg_fields=int32_sum_agg_fields,
        median_agg_fields=median_agg_fields,
        array_sum_agg_fields=array_sum_agg_fields,
        prefix="AS_",
    )

    # Rename AS_SB to AS_SB_TABLE if present
    if "AS_SB" in agg_expr:
        agg_expr["AS_SB_TABLE"] = agg_expr.pop("AS_SB")

    # Modify aggregations to aggregate per allele
    # Each field becomes an array with one element per alternate allele index
    # (1 .. n_alleles - 1), aggregated over the entries whose LA contains that index.
    agg_expr = {
        f: hl.agg.array_agg(
            lambda ai: hl.agg.filter(mt.LA.contains(ai), expr),
            hl.range(1, hl.len(mt.alleles)),
        )
        for f, expr in agg_expr.items()
    }

    # Run aggregations
    info = hl.struct(**agg_expr)

    # Add SB Ax2 aggregation logic and FS if SB is present
    if "AS_SB_TABLE" in info:
        # Reshape the per-alt arrays into one row per allele:
        #   row 0     -> the first two entries of every defined per-alt array, added together (ref)
        #   row 1..n  -> the entries from index 2 onwards of each per-alt array (each alt)
        as_sb_table = hl.array(
            [
                info.AS_SB_TABLE.filter(lambda x: hl.is_defined(x)).fold(
                    lambda i, j: i[:2] + j[:2], [0, 0]
                )  # ref
            ]
        ).extend(
            info.AS_SB_TABLE.map(lambda x: x[2:])  # each alt
        )
        info = info.annotate(
            AS_SB_TABLE=as_sb_table,
            # For each alt allele, FS is computed from the ref row followed by that alt's row.
            AS_FS=hl.range(1, hl.len(mt.alleles)).map(
                lambda i: fs_from_sb(as_sb_table[0].extend(as_sb_table[i]))
            ),
        )

    return info