def _get_common_variables(dataset, force_tf_compat_v1):
  """Returns metadata schema, preprocessing fn, input dataset metadata.

  Args:
    dataset: Benchmark dataset object providing the schema path and the TFT
      preprocessing function.
    force_tf_compat_v1: Forwarded to `tft.get_transform_input_columns` to
      select the tf.compat.v1 tracing path.

  Returns:
    A `CommonVariablesTuple` with the metadata schema, the preprocessing fn,
    the pruned transform-input dataset metadata, and a TFXIO for serialized
    tf.Examples.
  """
  schema = benchmark_utils.read_schema(dataset.tf_metadata_schema_path())
  preprocessing_fn = dataset.tft_preprocessing_fn()

  # Derive the full feature spec, then keep only the features the
  # preprocessing fn actually reads.
  feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
  type_spec = impl_helper.get_type_specs_from_feature_specs(feature_spec)
  needed_columns = tft.get_transform_input_columns(
      preprocessing_fn, type_spec, force_tf_compat_v1=force_tf_compat_v1)
  pruned_spec = {name: feature_spec[name] for name in needed_columns}
  input_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(pruned_spec))

  beam_record_tfxio = tf_example_record.TFExampleBeamRecord(
      physical_format="tfexamples",
      schema=input_metadata.schema,
      telemetry_descriptors=["TFTransformBenchmark"])
  return CommonVariablesTuple(
      tf_metadata_schema=schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=input_metadata,
      tfxio=beam_record_tfxio)
def benchmarkMiniPipeline(self):
  """Benchmark a "mini" TFMA - predict, slice and compute metrics.

  Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
  taken for the whole pipeline.
  """
  self._init_model()
  pipeline = self._create_beam_pipeline()
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=benchmark_utils.read_schema(
          self._dataset.tf_metadata_schema_path()),
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)

  # Read serialized examples, batch them into RecordBatches, and wrap them
  # as TFMA extracts.
  extracts = (
      pipeline
      | "Examples" >> beam.Create(
          self._dataset.read_raw_dataset(
              deserialize=False, limit=MAX_NUM_EXAMPLES))
      | "BatchExamples" >> tfx_io.BeamSource()
      | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

  # Apply the extractor/evaluator stages one at a time; the resulting
  # pipeline graph (and stage labels) are identical to a single chain.
  extracts = (
      extracts
      | "FeaturesExtractor" >> features_extractor.FeaturesExtractor(
          eval_config=self._eval_config).ptransform)
  extracts = (
      extracts
      | "LabelsExtractor" >> labels_extractor.LabelsExtractor(
          eval_config=self._eval_config).ptransform)
  extracts = (
      extracts
      | "ExampleWeightsExtractor" >> example_weights_extractor
      .ExampleWeightsExtractor(eval_config=self._eval_config).ptransform)
  extracts = (
      extracts
      | "PredictionsExtractor" >> predictions_extractor.PredictionsExtractor(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_model).ptransform)
  extracts = (
      extracts
      | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
      | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform)
  _ = (
      extracts
      | "ComputeMetricsPlotsAndValidations" >>
      metrics_plots_and_validations_evaluator
      .MetricsPlotsAndValidationsEvaluator(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_model).ptransform)

  # Time only the execution of the already-constructed pipeline.
  start = time.time()
  result = pipeline.run()
  result.wait_until_finish()
  end = time.time()

  self.report_benchmark(
      iters=1,
      wall_time=end - start,
      extras={
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
def benchmarkMiniPipelineBatched(self):
  """Benchmark a batched "mini" TFMA - predict, slice and compute metrics.

  Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
  taken for the whole pipeline.
  """
  self._init_model()
  pipeline = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=benchmark_utils.read_schema(
          self._dataset.tf_metadata_schema_path()),
      raw_record_column_name=tfma.BATCHED_INPUT_KEY)

  # Read serialized examples, batch them into RecordBatches, and wrap them
  # as TFMA extracts.
  extracts = (
      pipeline
      | "Examples" >> beam.Create(
          self._dataset.read_raw_dataset(
              deserialize=False, limit=MAX_NUM_EXAMPLES))
      | "BatchExamples" >> tfx_io.BeamSource()
      | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

  # Apply each stage sequentially; the pipeline graph and labels are the
  # same as a single chained expression.
  extracts = (
      extracts
      | "BatchedInputExtractor" >> batched_input_extractor
      .BatchedInputExtractor(eval_config=self._eval_config).ptransform)
  extracts = (
      extracts
      | "V2BatchedPredictExtractor" >>
      batched_predict_extractor_v2.BatchedPredictExtractor(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_model).ptransform)
  extracts = (
      extracts
      | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
      | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform)
  _ = (
      extracts
      | "V2ComputeMetricsAndPlots" >>
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_model).ptransform)

  # Time only the execution of the already-constructed pipeline.
  start = time.time()
  result = pipeline.run()
  result.wait_until_finish()
  end = time.time()

  self.report_benchmark(
      iters=1,
      wall_time=end - start,
      extras={
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
def _readDatasetIntoBatchedExtracts(self):
  """Read the raw dataset and massage examples into batched Extracts."""
  serialized = list(
      self._dataset.read_raw_dataset(
          deserialize=False, limit=self._max_num_examples()))
  # TODO(b/153996019): Once the TFXIO interface that returns an iterator of
  # RecordBatch is available, clean this up.
  decoder = example_coder.ExamplesToRecordBatchDecoder(
      serialized_schema=benchmark_utils.read_schema(
          self._dataset.tf_metadata_schema_path()).SerializeToString())
  extracts = []
  total = len(serialized)
  for begin in range(0, total, _BATCH_SIZE):
    chunk = serialized[begin:begin + _BATCH_SIZE]
    # Decode the chunk into an Arrow RecordBatch and append the raw records
    # as an extra column.
    record_batch = record_based_tfxio.AppendRawRecordColumn(
        decoder.DecodeBatch(chunk), constants.ARROW_INPUT_COLUMN, chunk)
    extracts.append({constants.ARROW_RECORD_BATCH_KEY: record_batch})
  return extracts
def _get_common_variables(dataset):
  """Returns metadata schema, preprocessing fn, input dataset metadata.

  Args:
    dataset: Benchmark dataset object providing the schema path and the TFT
      preprocessing function.

  Returns:
    A `CommonVariablesTuple` with the metadata schema, the preprocessing fn,
    and dataset metadata restricted to the transform's input columns.
  """
  schema = benchmark_utils.read_schema(dataset.tf_metadata_schema_path())
  preprocessing_fn = dataset.tft_preprocessing_fn()

  # Keep only the features that the preprocessing fn actually reads.
  feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
  needed_columns = tft.get_transform_input_columns(
      preprocessing_fn, feature_spec)
  pruned_spec = {name: feature_spec[name] for name in needed_columns}
  input_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(pruned_spec))

  return CommonVariablesTuple(
      tf_metadata_schema=schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=input_metadata)
def _runMiniPipeline(self, multi_model):
  """Benchmark a "mini" TFMA - predict, slice and compute metrics.

  Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
  taken for the whole pipeline.

  Args:
    multi_model: True if multiple models should be used in the benchmark.
  """
  self._init_model(multi_model, validation=False)
  pipeline = self._create_beam_pipeline()
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=benchmark_utils.read_schema(
          self._dataset.tf_metadata_schema_path()),
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)

  # Read serialized examples, batch them into RecordBatches, and wrap them
  # as TFMA extracts.
  batched = (
      pipeline
      | "Examples" >> beam.Create(
          self._dataset.read_raw_dataset(
              deserialize=False, limit=self._max_num_examples()))
      | "BatchExamples" >> tfx_io.BeamSource()
      | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())

  def rescale_labels(extracts):
    # Transform labels to [0, 1] so we can test metrics that require labels
    # in that range.
    result = copy.copy(extracts)
    result[constants.LABELS_KEY] = self._transform_labels(
        extracts[constants.LABELS_KEY])
    return result

  # Apply the extractor/evaluator stages one at a time; the resulting
  # pipeline graph and stage labels match the single chained expression.
  batched = (
      batched
      | "FeaturesExtractor" >> features_extractor.FeaturesExtractor(
          eval_config=self._eval_config).ptransform)
  batched = (
      batched
      | "LabelsExtractor" >> labels_extractor.LabelsExtractor(
          eval_config=self._eval_config).ptransform)
  batched = batched | "RescaleLabels" >> beam.Map(rescale_labels)
  batched = (
      batched
      | "ExampleWeightsExtractor" >> example_weights_extractor.
      ExampleWeightsExtractor(eval_config=self._eval_config).ptransform)
  batched = (
      batched
      | "PredictionsExtractor" >> predictions_extractor.PredictionsExtractor(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_models).ptransform)
  unbatched = (
      batched
      | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
      | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform)
  _ = (
      unbatched
      | "ComputeMetricsPlotsAndValidations" >>
      metrics_plots_and_validations_evaluator.
      MetricsPlotsAndValidationsEvaluator(
          eval_config=self._eval_config,
          eval_shared_model=self._eval_shared_models).ptransform)

  # Time _ITERS executions of the already-constructed pipeline.
  start = time.time()
  for _ in range(_ITERS):
    result = pipeline.run()
    result.wait_until_finish()
  end = time.time()

  self.report_benchmark(
      iters=_ITERS,
      wall_time=end - start,
      extras={
          "num_examples":
              self._dataset.num_examples(limit=self._max_num_examples())
      })