Example #1
0
def _extract_fn(data_extractors: pipeline_dp.DataExtractors,
                row: DataType) -> DataType:
    """Converts one input row into a (pid, pkey, pvalue) tuple.

    Args:
      data_extractors: Object holding the privacy_id, partition_key and value
        extractor callables that are applied to the row.
      row: A single element to extract from, usually raw input of the
        pipeline.

    Returns:
      The tuple (pid, pkey, pvalue) produced by the three extractors.
    """
    # Extractors are invoked in the same order as the tuple is laid out.
    privacy_id = data_extractors.privacy_id_extractor(row)
    partition_key = data_extractors.partition_extractor(row)
    value = data_extractors.value_extractor(row)
    return privacy_id, partition_key, value
 def setUpClass(cls):
     """Creates the class-wide Spark context and positional extractors.

     Sets ``cls.sc`` and ``cls.data_extractors``. Rows are read positionally:
     index 0 is the privacy id, index 1 the partition key and index 2 the
     value.
     """
     conf = pyspark.SparkConf()
     # getOrCreate() reuses a SparkContext that is already active in this
     # process; calling the constructor directly raises ValueError in that
     # case (e.g. when another test class has already started one).
     cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
     cls.data_extractors = DataExtractors(
         partition_extractor=lambda x: x[1],
         privacy_id_extractor=lambda x: x[0],
         value_extractor=lambda x: x[2])
Example #3
0
 def setUpClass(cls):
     """Prepares the Spark context, extractors and RDD backend for the class.

     Sets ``cls.sc``, ``cls.data_extractors`` and ``cls.backend``. Rows are
     read positionally: index 0 is the privacy id, index 1 the partition key
     and index 2 the value.
     """
     import pyspark
     cls.sc = pyspark.SparkContext.getOrCreate(conf=pyspark.SparkConf())
     cls.data_extractors = DataExtractors(
         privacy_id_extractor=lambda row: row[0],
         partition_extractor=lambda row: row[1],
         value_extractor=lambda row: row[2])
     # The backend wraps the Spark context created above.
     cls.backend = SparkRDDBackend(cls.sc)
Example #4
0
    def aggregate_true(
            self, col, params: SampleParams,
            data_extractors: pipeline_dp.DataExtractors) -> DataType:
        """Aggregates the input per partition key without adding noise.

        Every row is turned into (privacy_id, partition_key, value) by the
        extractors. Values are first accumulated per
        (privacy_id, partition_key) pair; the privacy id is then dropped and
        the accumulators are merged per partition key.

        Args:
          col: The data to aggregate. It can be local data, beam PCollection or
            Spark RDD depending on the engine used.
          params: Sampling parameters; only params.metrics is read here, to
            select the metrics of the compound combiner.
          data_extractors: Provides the privacy_id, partition_key and value
            extractors applied to every input row.

        Returns:
          A collection of (partition_key, aggregated_value).
        """
        combiner = non_private_combiners.create_compound_combiner(
            metrics=params.metrics)
        backend = self._be

        def to_columns(row):
            return (data_extractors.privacy_id_extractor(row),
                    data_extractors.partition_extractor(row),
                    data_extractors.value_extractor(row))

        def key_by_pid_and_pk(privacy_id, partition_key, value):
            return (privacy_id, partition_key), value

        def drop_privacy_id(pid_pk, accumulator):
            return pid_pk[1], accumulator

        # extracted : (privacy_id, partition_key, value)
        extracted = backend.map(
            col, to_columns, "Extract (privacy_id, partition_key, value))")
        # rekeyed : ((privacy_id, partition_key), value)
        rekeyed = backend.map_tuple(
            extracted, key_by_pid_and_pk,
            "Rekey to ( (privacy_id, partition_key), value))")
        grouped = backend.group_by_key(rekeyed, "Group by pk")
        # per_pid_pk : ((privacy_id, partition_key), accumulator)
        per_pid_pk = backend.map_values(grouped, combiner.create_accumulator,
                                        "Aggregate by (pk, pid)")
        # per_pk : (partition_key, accumulator), possibly several per key
        per_pk = backend.map_tuple(per_pid_pk, drop_privacy_id,
                                   "Drop privacy id")
        # merged : (partition_key, accumulator), one per key
        merged = backend.combine_accumulators_per_key(
            per_pk, combiner, "Reduce accumulators per partition key")
        # Result : (partition_key, aggregated_value)
        return backend.map_values(merged, combiner.compute_metrics,
                                  "Compute DP metrics")
Example #5
0
 def setUpClass(cls):
     """Sets up the operations object and extractors shared by the class.

     ``cls.ops`` is a MultiProcLocalPipelineOperations built with n_jobs=1.
     The extractors are the ``partition_extract``, ``privacy_id_extract`` and
     ``value_extract`` attributes looked up on ``cls``.
     """
     cls.ops = MultiProcLocalPipelineOperations(n_jobs=1)
     partition_fn = cls.partition_extract
     privacy_id_fn = cls.privacy_id_extract
     value_fn = cls.value_extract
     cls.data_extractors = DataExtractors(
         privacy_id_extractor=privacy_id_fn,
         partition_extractor=partition_fn,
         value_extractor=value_fn)
Example #6
0
 def setUpClass(cls):
     """Sets up the Beam operations object and positional extractors.

     Rows are read positionally: index 0 is the privacy id, index 1 the
     partition key and index 2 the value.
     """
     cls.ops = BeamOperations()
     cls.data_extractors = DataExtractors(
         privacy_id_extractor=lambda row: row[0],
         partition_extractor=lambda row: row[1],
         value_extractor=lambda row: row[2])
Example #7
0
 def setUpClass(cls):
     """Sets up the backend and extractors shared by the class.

     ``cls.backend`` is a MultiProcLocalBackend built with n_jobs=1. The
     extractors are the ``partition_extract``, ``privacy_id_extract`` and
     ``value_extract`` attributes looked up on ``cls``.
     """
     cls.backend = MultiProcLocalBackend(n_jobs=1)
     partition_fn = cls.partition_extract
     privacy_id_fn = cls.privacy_id_extract
     value_fn = cls.value_extract
     cls.data_extractors = DataExtractors(
         privacy_id_extractor=privacy_id_fn,
         partition_extractor=partition_fn,
         value_extractor=value_fn)
Example #8
0
 def setUpClass(cls):
     """Sets up the local backend and positional extractors for the class.

     Rows are read positionally: index 0 is the privacy id, index 1 the
     partition key and index 2 the value.
     """
     cls.backend = LocalBackend()
     extractor_kwargs = dict(
         partition_extractor=lambda row: row[1],
         privacy_id_extractor=lambda row: row[0],
         value_extractor=lambda row: row[2])
     cls.data_extractors = DataExtractors(**extractor_kwargs)