def _extract_fn(data_extractors: pipeline_dp.DataExtractors,
                row: DataType) -> DataType:
    """Extracts (privacy_id, partition_key, value) from a single input row.

    Args:
        data_extractors: Extractor functions for obtaining privacy_id,
            partition_key and value from the input data.
        row: The row to extract from, usually a raw input record of the
            pipeline.

    Returns:
        A (privacy_id, partition_key, value) tuple produced by applying the
        three extractors to the row.
    """
    pid = data_extractors.privacy_id_extractor(row)
    pkey = data_extractors.partition_extractor(row)
    pvalue = data_extractors.value_extractor(row)
    return pid, pkey, pvalue
def aggregate_true(self, col, params: SampleParams,
                   data_extractors: pipeline_dp.DataExtractors) -> DataType:
    """Computes raw aggregation results of the input data without adding noise.

    Aggregation means aggregating values grouped by partition_key. Both
    values and partition keys are extracted by the data extractors.

    Args:
        col: The data to aggregate. It can be local data, a Beam PCollection
            or a Spark RDD depending on the engine used.
        params: Sampling parameters; params.metrics selects which metrics
            are computed.
        data_extractors: Extractor functions for obtaining privacy_id,
            partition_key and value from the input data.

    Returns:
        True (non-noised) aggregation results per partition key.
    """
    combiner = non_private_combiners.create_compound_combiner(
        metrics=params.metrics)

    # Reuse the shared extraction helper instead of inlining the three
    # extractor calls, so extraction logic stays in one place.
    col = self._be.map(col, lambda row: _extract_fn(data_extractors, row),
                       "Extract (privacy_id, partition_key, value))")
    # col : (privacy_id, partition_key, value)
    col = self._be.map_tuple(
        col, lambda pid, pk, v: ((pid, pk), v),
        "Rekey to ( (privacy_id, partition_key), value))")
    col = self._be.group_by_key(col, "Group by pk")
    # Fold each per-(privacy_id, partition_key) group of values into one
    # accumulator.
    col = self._be.map_values(col, combiner.create_accumulator,
                              "Aggregate by (pk, pid)")
    # col : ((privacy_id, partition_key), accumulator)
    col = self._be.map_tuple(col, lambda pid_pk, v: (pid_pk[1], v),
                             "Drop privacy id")
    # col : (partition_key, accumulator)
    col = self._be.combine_accumulators_per_key(
        col, combiner, "Reduce accumulators per partition key")
    # col : (partition_key, accumulator)
    # Compute metrics. NOTE: the stage label says "DP metrics" but these are
    # the true (non-noised) metrics; label kept as-is for stage-name
    # stability.
    col = self._be.map_values(col, combiner.compute_metrics,
                              "Compute DP metrics")
    # col : (partition_key, aggregated_value)
    return col