Example 1
# Imports assumed by this snippet (benchmark_utils and CommonVariablesTuple
# come from the surrounding benchmark module, which is not shown here):
import tensorflow_transform as tft
from tensorflow_transform import impl_helper
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from tfx_bsl.tfxio import tf_example_record


def _get_common_variables(dataset, force_tf_compat_v1):
  """Returns metadata schema, preprocessing fn, input dataset metadata."""

  tf_metadata_schema = benchmark_utils.read_schema(
      dataset.tf_metadata_schema_path())

  preprocessing_fn = dataset.tft_preprocessing_fn()

  feature_spec = schema_utils.schema_as_feature_spec(
      tf_metadata_schema).feature_spec
  type_spec = impl_helper.get_type_specs_from_feature_specs(feature_spec)
  transform_input_columns = (
      tft.get_transform_input_columns(
          preprocessing_fn, type_spec, force_tf_compat_v1=force_tf_compat_v1))
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          feature: feature_spec[feature] for feature in transform_input_columns
      }))
  tfxio = tf_example_record.TFExampleBeamRecord(
      physical_format="tfexamples",
      schema=transform_input_dataset_metadata.schema,
      telemetry_descriptors=["TFTransformBenchmark"])

  return CommonVariablesTuple(
      tf_metadata_schema=tf_metadata_schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=transform_input_dataset_metadata,
      tfxio=tfxio)
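
A usage sketch for the returned tuple follows. This is a minimal, assumption-laden illustration: `dataset` is the same benchmark dataset object as above, while `raw_example_bytes`, the temp dir, and the exact pipeline wiring are placeholders not taken from the snippet.

import apache_beam as beam
import tensorflow_transform.beam as tft_beam

common = _get_common_variables(dataset, force_tf_compat_v1=False)
with beam.Pipeline() as pipeline, tft_beam.Context(temp_dir="/tmp/tft_bench"):
  record_batches = (
      pipeline
      | "CreateExamples" >> beam.Create(raw_example_bytes)  # placeholder input
      | "DecodeExamples" >> common.tfxio.RawRecordToRecordBatch())
  # TFT accepts a (RecordBatch PCollection, TensorAdapterConfig) pair as a
  # dataset in place of (instances, DatasetMetadata).
  _ = ((record_batches, common.tfxio.TensorAdapterConfig())
       | "AnalyzeAndTransform" >> tft_beam.AnalyzeAndTransformDataset(
           common.preprocessing_fn))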
Example 2
  def benchmarkMiniPipeline(self):
    """Benchmark a "mini" TFMA - predict, slice and compute metrics.

    Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
    taken for the whole pipeline.
    """
    self._init_model()
    pipeline = self._create_beam_pipeline()
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=benchmark_utils.read_schema(
            self._dataset.tf_metadata_schema_path()),
        raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    raw_data = (
        pipeline
        | "Examples" >> beam.Create(
            self._dataset.read_raw_dataset(
                deserialize=False, limit=MAX_NUM_EXAMPLES))
        | "BatchExamples" >> tfx_io.BeamSource()
        | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

    _ = (
        raw_data
        | "FeaturesExtractor" >> features_extractor.FeaturesExtractor(
            eval_config=self._eval_config).ptransform
        | "LabelsExtractor" >> labels_extractor.LabelsExtractor(
            eval_config=self._eval_config).ptransform
        | "ExampleWeightsExtractor" >> example_weights_extractor
        .ExampleWeightsExtractor(eval_config=self._eval_config).ptransform
        | "PredictionsExtractor" >> predictions_extractor.PredictionsExtractor(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform
        | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
        | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform
        | "ComputeMetricsPlotsAndValidations" >>
        metrics_plots_and_validations_evaluator
        .MetricsPlotsAndValidationsEvaluator(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform)

    start = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    end = time.time()
    delta = end - start

    self.report_benchmark(
        iters=1,
        wall_time=delta,
        extras={
            "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
        })
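
The method reports results through `tf.test.Benchmark.report_benchmark`. A minimal hosting sketch, with the class name and wiring assumed (the real enclosing class, `_init_model`, and `_create_beam_pipeline` are not shown in the snippet):

import tensorflow as tf


class TfmaMiniPipelineBenchmark(tf.test.Benchmark):  # hypothetical name
  # benchmarkMiniPipeline (above) plus the _init_model, _create_beam_pipeline,
  # _eval_config, _eval_shared_model, and _dataset members would live here.
  pass


if __name__ == "__main__":
  # Methods named benchmark* are run when the test binary is invoked with
  # the --benchmarks=<regex> flag.
  tf.test.main()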
Example 3
  def benchmarkMiniPipelineBatched(self):
    """Benchmark a batched "mini" TFMA - predict, slice and compute metrics.

    Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
    taken for the whole pipeline.
    """
    self._init_model()
    pipeline = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=benchmark_utils.read_schema(
            self._dataset.tf_metadata_schema_path()),
        raw_record_column_name=tfma.BATCHED_INPUT_KEY)
    raw_data = (
        pipeline
        | "Examples" >> beam.Create(
            self._dataset.read_raw_dataset(
                deserialize=False, limit=MAX_NUM_EXAMPLES))
        | "BatchExamples" >> tfx_io.BeamSource()
        | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

    _ = (
        raw_data
        | "BatchedInputExtractor" >> batched_input_extractor
        .BatchedInputExtractor(eval_config=self._eval_config).ptransform
        | "V2BatchedPredictExtractor" >>
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform
        | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
        | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform
        | "V2ComputeMetricsAndPlots" >>
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform)

    start = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    end = time.time()
    delta = end - start

    self.report_benchmark(
        iters=1,
        wall_time=delta,
        extras={
            "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
        })
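
Unlike Example 2, this batched variant constructs its pipeline directly on Beam's FnApiRunner rather than calling `self._create_beam_pipeline()`. The imports it leans on are roughly the following (assumed, since the module header is not part of the snippet):

import apache_beam as beam
from apache_beam.runners.portability import fn_api_runner
import tensorflow_model_analysis as tfma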
Example 4
    def _readDatasetIntoBatchedExtracts(self):
        """Read the raw dataset and massage examples into batched Extracts."""
        serialized_examples = list(
            self._dataset.read_raw_dataset(deserialize=False,
                                           limit=self._max_num_examples()))

        # TODO(b/153996019): Once the TFXIO interface that returns an iterator of
        # RecordBatch is available, clean this up.
        coder = example_coder.ExamplesToRecordBatchDecoder(
            serialized_schema=benchmark_utils.read_schema(
                self._dataset.tf_metadata_schema_path()).SerializeToString())
        batches = []
        for i in range(0, len(serialized_examples), _BATCH_SIZE):
            example_batch = serialized_examples[i:i + _BATCH_SIZE]
            record_batch = record_based_tfxio.AppendRawRecordColumn(
                coder.DecodeBatch(example_batch), constants.ARROW_INPUT_COLUMN,
                example_batch)
            batches.append({constants.ARROW_RECORD_BATCH_KEY: record_batch})
        return batches
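
A short consumption sketch for the returned batches (an illustrative assumption; the original benchmark feeds them through TFMA's batched PTransforms instead):

    batches = self._readDatasetIntoBatchedExtracts()
    for extracts in batches:
        # Each element maps constants.ARROW_RECORD_BATCH_KEY to a
        # pyarrow.RecordBatch holding decoded features plus the raw-record column.
        record_batch = extracts[constants.ARROW_RECORD_BATCH_KEY]
        print(record_batch.num_rows, record_batch.schema.names)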
Example 5
def _get_common_variables(dataset):
  """Returns metadata schema, preprocessing fn, input dataset metadata."""

  tf_metadata_schema = benchmark_utils.read_schema(
      dataset.tf_metadata_schema_path())

  preprocessing_fn = dataset.tft_preprocessing_fn()

  feature_spec = schema_utils.schema_as_feature_spec(
      tf_metadata_schema).feature_spec
  transform_input_columns = (
      tft.get_transform_input_columns(preprocessing_fn, feature_spec))
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          feature: feature_spec[feature] for feature in transform_input_columns
      }))

  return CommonVariablesTuple(
      tf_metadata_schema=tf_metadata_schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=transform_input_dataset_metadata)
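
To illustrate what `tft.get_transform_input_columns` prunes here, a self-contained toy (not from the benchmark):

import tensorflow as tf
import tensorflow_transform as tft


def toy_preprocessing_fn(inputs):
  # Only 'x' is consumed, so 'y' should be pruned from the transform input.
  return {"x_scaled": tft.scale_to_0_1(inputs["x"])}


toy_feature_spec = {
    "x": tf.io.FixedLenFeature([], tf.float32),
    "y": tf.io.FixedLenFeature([], tf.float32),
}
columns = tft.get_transform_input_columns(
    toy_preprocessing_fn, toy_feature_spec)
# Expected: ["x"]; unused features are dropped so less input is decoded.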
Example 6
    def _runMiniPipeline(self, multi_model):
        """Benchmark a "mini" TFMA - predict, slice and compute metrics.

    Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
    taken for the whole pipeline.

    Args:
      multi_model: True if multiple models should be used in the benchmark.
    """
        self._init_model(multi_model, validation=False)
        pipeline = self._create_beam_pipeline()
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=benchmark_utils.read_schema(
                self._dataset.tf_metadata_schema_path()),
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        raw_data = (pipeline
                    | "Examples" >> beam.Create(
                        self._dataset.read_raw_dataset(
                            deserialize=False, limit=self._max_num_examples()))
                    | "BatchExamples" >> tfx_io.BeamSource()
                    | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

        def rescale_labels(extracts):
            # Transform labels to [0, 1] so we can test metrics that require labels in
            # that range.
            result = copy.copy(extracts)
            result[constants.LABELS_KEY] = self._transform_labels(
                extracts[constants.LABELS_KEY])
            return result

        _ = (raw_data
             | "FeaturesExtractor" >> features_extractor.FeaturesExtractor(
                 eval_config=self._eval_config).ptransform
             | "LabelsExtractor" >> labels_extractor.LabelsExtractor(
                 eval_config=self._eval_config).ptransform
             | "RescaleLabels" >> beam.Map(rescale_labels)
             | "ExampleWeightsExtractor" >> example_weights_extractor.
             ExampleWeightsExtractor(eval_config=self._eval_config).ptransform
             | "PredictionsExtractor" >>
             predictions_extractor.PredictionsExtractor(
                 eval_config=self._eval_config,
                 eval_shared_model=self._eval_shared_models).ptransform
             | "UnbatchExtractor" >>
             unbatch_extractor.UnbatchExtractor().ptransform
             | "SliceKeyExtractor" >>
             tfma.extractors.SliceKeyExtractor().ptransform
             | "ComputeMetricsPlotsAndValidations" >>
             metrics_plots_and_validations_evaluator.
             MetricsPlotsAndValidationsEvaluator(
                 eval_config=self._eval_config,
                 eval_shared_model=self._eval_shared_models).ptransform)

        start = time.time()
        for _ in range(_ITERS):
            result = pipeline.run()
            result.wait_until_finish()
        end = time.time()
        delta = end - start

        self.report_benchmark(
            iters=_ITERS,
            wall_time=delta,
            extras={
                "num_examples":
                self._dataset.num_examples(limit=self._max_num_examples())
            })
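
The `_transform_labels` helper used by `rescale_labels` is not part of the snippet; a plausible, purely hypothetical sketch of the min-max rescaling its name and comment suggest:

import numpy as np


def _transform_labels(self, labels):
    # Hypothetical: rescale raw label arrays into [0, 1] so metrics that
    # require probabilistic labels can be benchmarked.
    labels = np.asarray(labels, dtype=np.float64)
    lo, hi = labels.min(), labels.max()
    return (labels - lo) / (hi - lo) if hi > lo else np.zeros_like(labels)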