Esempio n. 1
0
def _get_common_variables(dataset, force_tf_compat_v1):
  """Builds the schema, preprocessing fn, input metadata and TFXIO.

  Args:
    dataset: Object providing `tf_metadata_schema_path()` and
      `tft_preprocessing_fn()`.
    force_tf_compat_v1: Forwarded unchanged to
      `tft.get_transform_input_columns`.

  Returns:
    A `CommonVariablesTuple` with the schema read from the dataset's schema
    path, the preprocessing function, dataset metadata limited to the columns
    returned by `tft.get_transform_input_columns`, and a `TFExampleBeamRecord`
    built on that limited schema.
  """
  schema_path = dataset.tf_metadata_schema_path()
  full_schema = benchmark_utils.read_schema(schema_path)
  preprocess = dataset.tft_preprocessing_fn()

  all_feature_specs = schema_utils.schema_as_feature_spec(
      full_schema).feature_spec
  all_type_specs = impl_helper.get_type_specs_from_feature_specs(
      all_feature_specs)

  # Restrict the feature spec to the transform's input columns.
  input_columns = tft.get_transform_input_columns(
      preprocess, all_type_specs, force_tf_compat_v1=force_tf_compat_v1)
  input_feature_specs = {}
  for column in input_columns:
    input_feature_specs[column] = all_feature_specs[column]
  input_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(input_feature_specs))

  example_tfxio = tf_example_record.TFExampleBeamRecord(
      physical_format="tfexamples",
      schema=input_metadata.schema,
      telemetry_descriptors=["TFTransformBenchmark"])

  return CommonVariablesTuple(
      tf_metadata_schema=full_schema,
      preprocessing_fn=preprocess,
      transform_input_dataset_metadata=input_metadata,
      tfxio=example_tfxio)
    def test_features_extractor_no_features(self):
        """Checks FeaturesExtractor output when the TFXIO has no schema.

        Three empty byte strings are decoded with batch_size=3 and passed
        through the extractor. The check expects a single extract holding an
        empty features entry and three inputs.
        """
        model_spec = config_pb2.ModelSpec()
        eval_config = config_pb2.EvalConfig(model_specs=[model_spec])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        tfx_io = tf_example_record.TFExampleBeamRecord(
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            physical_format='inmem',
            telemetry_descriptors=['testing'])

        with beam.Pipeline() as pipeline:
            result = (
                pipeline | 'Create' >> beam.Create([b''] * 3)
                | 'DecodeToRecordBatch' >> tfx_io.BeamSource(batch_size=3)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform)

            def check_result(got):
                # Surface assertion failures as BeamAssertException, the same
                # way the other pipeline result checks do.
                try:
                    self.assertLen(got, 1)
                    self.assertIn(constants.FEATURES_KEY, got[0])
                    self.assertEmpty(got[0][constants.FEATURES_KEY])
                    self.assertIn(constants.INPUT_KEY, got[0])
                    self.assertLen(got[0][constants.INPUT_KEY], 3)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='CheckResult')
    def testE2E(self):
        """Decodes in-memory examples into one record batch and converts it.

        The record batch schema must equal the TFXIO's Arrow schema, and the
        tensor adapter must yield exactly the three expected features.
        """
        column_name = "raw_record"
        tfxio = tf_example_record.TFExampleBeamRecord(
            physical_format="inmem",
            telemetry_descriptors=["some", "component"],
            schema=_SCHEMA,
            raw_record_column_name=column_name,
        )

        def _AssertFn(record_batches):
            self.assertLen(record_batches, 1)
            batch = record_batches[0]
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 3)
            for feature_name in ("int_feature", "float_feature",
                                 "string_feature"):
                self.assertIn(feature_name, tensors)

        with beam.Pipeline() as p:
            in_mem_records = (
                p | "CreateInMemRecords" >> beam.Create(_SERIALIZED_EXAMPLES))
            # One batch sized to hold every example, so a single record batch
            # is expected downstream.
            batches = (
                in_mem_records
                | "BeamSource" >> tfxio.BeamSource(
                    batch_size=len(_SERIALIZED_EXAMPLES)))
            beam_testing_util.assert_that(batches, _AssertFn)
Esempio n. 4
0
    def testModelSignaturesDoFn(self, save_as_keras, signature_names,
                                default_signature_names, prefer_dict_outputs,
                                use_schema, expected_num_outputs):
        """Runs ModelSignaturesDoFn on three examples and checks its outputs.

        One shared model and one ModelSpec are created for each distinct model
        name found in `signature_names`. The single resulting extract must
        contain every key of `signature_names`; when `prefer_dict_outputs` is
        set, each value must be a dict whose batch size equals
        `expected_num_outputs`.
        """
        export_path = self.createModelWithMultipleDenseInputs(save_as_keras)

        shared_models = {}
        specs = []
        for names_for_key in signature_names.values():
            for name in names_for_key:
                if name in shared_models:
                    continue
                shared_models[name] = self.createTestEvalSharedModel(
                    eval_saved_model_path=export_path,
                    model_name=name,
                    tags=[tf.saved_model.SERVING])
                specs.append(config_pb2.ModelSpec(name=name))
        eval_config = config_pb2.EvalConfig(model_specs=specs)

        if use_schema:
            schema = self.createDenseInputsSchema()
        else:
            schema = None
        tfx_io = tf_example_record.TFExampleBeamRecord(
            physical_format='text',
            schema=schema,
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)

        input_pairs = [(1.0, 2.0), (3.0, 4.0), (5.0, 6.0)]
        examples = [
            self._makeExample(input_1=first, input_2=second)
            for first, second in input_pairs
        ]

        with beam.Pipeline() as pipeline:
            signatures_fn = None
            # pylint: disable=no-value-for-parameter
            serialized = (
                pipeline
                | 'Create' >> beam.Create(
                    [example.SerializeToString() for example in examples]))
            extracts = (
                serialized
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                | 'ToExtracts' >> beam.Map(_record_batch_to_extracts))
            signatures_fn = model_util.ModelSignaturesDoFn(
                eval_config=eval_config,
                eval_shared_models=shared_models,
                signature_names=signature_names,
                default_signature_names=default_signature_names,
                prefer_dict_outputs=prefer_dict_outputs)
            result = extracts | 'ModelSignatures' >> beam.ParDo(signatures_fn)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    for key in signature_names:
                        self.assertIn(key, got[0])
                        if not prefer_dict_outputs:
                            continue
                        output = got[0][key]
                        self.assertIsInstance(output, dict)
                        self.assertEqual(tfma_util.batch_size(output),
                                         expected_num_outputs)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Esempio n. 5
0
    def testSqlSliceKeyExtractorWithEmptySqlConfig(self):
        """Runs SqlSliceKeyExtractor with a default (empty) EvalConfig.

        The check expects one extract whose slice-key-types entry equals a
        VarLenTensorValue built from three empty rows.
        """
        config = config_pb2.EvalConfig()
        features_stage = features_extractor.FeaturesExtractor(
            eval_config=config)
        slicing_stage = sql_slice_key_extractor.SqlSliceKeyExtractor(config)

        tfx_io = tf_example_record.TFExampleBeamRecord(
            physical_format='inmem',
            telemetry_descriptors=['test', 'component'],
            schema=_SCHEMA,
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        rows = [
            (1, 1.0, 'fixed_string1'),
            (1, 1.0, 'fixed_string2'),
            (2, 0.0, 'fixed_string3'),
        ]
        examples = [
            self._makeExample(fixed_int=int_value,
                              fixed_float=float_value,
                              fixed_string=string_value)
            for int_value, float_value, string_value in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized = (
                pipeline
                | 'Create' >> beam.Create(
                    [example.SerializeToString() for example in examples],
                    reshuffle=False))
            extracts = (
                serialized
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
            result = (
                extracts
                | features_stage.stage_name >> features_stage.ptransform
                | slicing_stage.stage_name >> slicing_stage.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    actual = got[0][constants.SLICE_KEY_TYPES_KEY]
                    # One empty row per input example.
                    expected = types.VarLenTensorValue.from_dense_rows(
                        [np.array([]) for _ in range(3)])
                    np.testing.assert_equal(actual, expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result)
Esempio n. 6
0
  def testSqlSliceKeyExtractor(self):
    """Runs SqlSliceKeyExtractor with a SQL slicing spec over three examples.

    The query selects `fixed_string` where `fixed_int = 1`; the check expects
    slice keys for the first two examples and an empty list for the third.
    """
    slice_sql = """
        SELECT
          STRUCT(fixed_string)
        FROM
          example.fixed_string,
          example.fixed_int
        WHERE fixed_int = 1
        """
    sql_spec = config_pb2.SlicingSpec(slice_keys_sql=slice_sql)
    config = config_pb2.EvalConfig(slicing_specs=[sql_spec])
    slicing_stage = sql_slice_key_extractor.SqlSliceKeyExtractor(config)

    tfx_io = tf_example_record.TFExampleBeamRecord(
        physical_format='inmem',
        telemetry_descriptors=['test', 'component'],
        schema=_SCHEMA,
        raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    rows = [
        (1, 1.0, 'fixed_string1'),
        (1, 1.0, 'fixed_string2'),
        (2, 0.0, 'fixed_string3'),
    ]
    examples = [
        self._makeExample(
            fixed_int=int_value,
            fixed_float=float_value,
            fixed_string=string_value)
        for int_value, float_value, string_value in rows
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      serialized = (
          pipeline
          | 'Create' >> beam.Create(
              [example.SerializeToString() for example in examples],
              reshuffle=False))
      extracts = (
          serialized
          | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
      result = extracts | slicing_stage.stage_name >> slicing_stage.ptransform

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          actual = got[0][constants.SLICE_KEY_TYPES_KEY]
          expected = [
              [(('fixed_string', 'fixed_string1'),)],
              [(('fixed_string', 'fixed_string2'),)],
              [],
          ]
          self.assertEqual(actual, expected)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result)
    def testModelSignaturesDoFnError(self):
        """Expects a ValueError when running ModelSignaturesDoFn on a bad model.

        The model comes from `createModelWithInvalidOutputShape`; running the
        pipeline must raise a ValueError whose message matches
        'First dimension does not correspond with batch size.'.
        """
        export_path = self.createModelWithInvalidOutputShape()
        signature_names = {constants.PREDICTIONS_KEY: {'': [None]}}
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_path, tags=[tf.saved_model.SERVING])
        eval_shared_models = {'': shared_model}
        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        tfx_io = tf_example_record.TFExampleBeamRecord(
            physical_format='text',
            schema=self.createDenseInputsSchema(),
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())

        input_pairs = [(1.0, 2.0), (3.0, 4.0), (5.0, 6.0)]
        examples = [
            self._makeExample(input_1=first, input_2=second)
            for first, second in input_pairs
        ]

        expected_message = (
            'First dimension does not correspond with batch size.')
        with self.assertRaisesRegex(ValueError, expected_message):
            with beam.Pipeline() as pipeline:
                # pylint: disable=no-value-for-parameter
                serialized = (
                    pipeline
                    | 'Create' >> beam.Create(
                        [example.SerializeToString() for example in examples]))
                extracts = (
                    serialized
                    | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                    | 'ToExtracts' >> beam.Map(_record_batch_to_extracts))
                signatures_fn = model_util.ModelSignaturesDoFn(
                    eval_config=eval_config,
                    eval_shared_models=eval_shared_models,
                    signature_names=signature_names,
                    default_signature_names=None,
                    prefer_dict_outputs=False,
                    tensor_adapter_config=adapter_config)
                _ = extracts | 'ModelSignatures' >> beam.ParDo(signatures_fn)
def compute_cis(scenario: str, methodology: str, num_trials: int,
                num_examples_per_trial: int, output_dir: str) -> None:
    """Evaluates each trial and the pooled population with CIs enabled.

    Args:
        scenario: Name of the scenario; must equal one of the two scenario
            constants handled below.
        methodology: Name of a `ConfidenceIntervalMethod` enum value.
        num_trials: Number of independent trials to generate and evaluate.
        num_examples_per_trial: Passed to the scenario's example generator for
            every trial.
        output_dir: Base directory. Trial `i` is written to `<output_dir>/<i>`
            and the pooled evaluation to
            `<output_dir>/<_POPULATION_OUTPUT_NAME>`.

    Raises:
        ValueError: If `scenario` is not a recognized scenario name.
    """
    if scenario == _BINARY_CLASSIFICATION_SCENARIO:
        scenario_fn = get_binary_classification_scenario
    elif scenario == _REGRESSION_SCENARIO:
        scenario_fn = get_regression_scenario
    else:
        raise ValueError(f'Unexpected scenario {scenario}. '
                         f'Expected one of {_SCENARIOS}')
    eval_config, example_gen_fn = scenario_fn()

    options = eval_config.options
    options.compute_confidence_intervals.value = True
    ci_method_enum = (
        config_pb2.ConfidenceIntervalOptions.ConfidenceIntervalMethod)
    options.confidence_intervals.method = ci_method_enum.Value(methodology)

    pipeline_args = FLAGS.pipeline_options.split(',')
    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        pipeline_args)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        tfx_io = tf_example_record.TFExampleBeamRecord(
            physical_format='generated',
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        inputs_per_trial = []
        for trial in range(num_trials):
            generated = (
                pipeline
                | f'CreateExamples[{trial}]' >> beam.Create(
                    example_gen_fn(num_examples_per_trial)))
            batched = (
                generated
                | f'Serialize[{trial}]' >>
                beam.Map(lambda ex: ex.SerializeToString())
                | f'BatchExamples[{trial}]' >> tfx_io.BeamSource())
            inputs_per_trial.append(batched)

            # Each trial gets its own evaluation output directory.
            _ = (batched
                 | f'Evaluate[{trial}]' >>
                 model_eval_lib.ExtractEvaluateAndWriteResults(
                     eval_config=eval_config,
                     output_path=os.path.join(output_dir, str(trial))))

        # The population is the union of the inputs of all trials.
        population = inputs_per_trial | 'FlattenInputs' >> beam.Flatten()
        _ = (population
             | 'EvaluatePopulation' >>
             model_eval_lib.ExtractEvaluateAndWriteResults(
                 eval_config=eval_config,
                 output_path=os.path.join(output_dir,
                                          _POPULATION_OUTPUT_NAME)))
    def test_features_extractor(self):
        """Runs FeaturesExtractor over three examples with a 4-feature schema.

        The examples set `fixed_int`, `fixed_float` and `fixed_string` but not
        `example_weight`; the check expects that feature to come back as three
        `None` values and the other three to match the inputs.
        """
        model_spec = config_pb2.ModelSpec()
        eval_config = config_pb2.EvalConfig(model_specs=[model_spec])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)

        schema_text = """
        feature {
          name: "example_weight"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        feature {
          name: "fixed_float"
          type: FLOAT
        }
        feature {
          name: "fixed_string"
          type: BYTES
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        tfx_io = tf_example_record.TFExampleBeamRecord(
            schema=schema,
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            physical_format='inmem',
            telemetry_descriptors=['testing'])

        rows = [
            (1, 1.0, 'fixed_string1'),
            (1, 1.0, 'fixed_string2'),
            (2, 0.0, 'fixed_string3'),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized_examples = [
                self._makeExample(
                    fixed_int=int_value,
                    fixed_float=float_value,
                    fixed_string=string_value).SerializeToString()
                for int_value, float_value, string_value in rows
            ]
            created = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples,
                                          reshuffle=False))
            result = (
                created
                | 'DecodeToRecordBatch' >> tfx_io.BeamSource(batch_size=3)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    extract = got[0]
                    self.assertIn(constants.FEATURES_KEY, extract)
                    features = extract[constants.FEATURES_KEY]
                    self.assertLen(features, 4)  # 4 features

                    # Object arrays cannot be compared with assertAllClose,
                    # so they are compared as plain lists.
                    self.assertIn('example_weight', features)
                    self.assertEqual(features['example_weight'].tolist(),
                                     [None, None, None])

                    self.assertIn('fixed_int', features)
                    self.assertAllClose(features['fixed_int'],
                                        np.array([1, 1, 2]))

                    self.assertIn('fixed_float', features)
                    self.assertAllClose(features['fixed_float'],
                                        np.array([1.0, 1.0, 0.0]))

                    # Byte strings also end up in an object array.
                    self.assertIn('fixed_string', features)
                    self.assertEqual(
                        features['fixed_string'].tolist(),
                        [b'fixed_string1', b'fixed_string2', b'fixed_string3'])

                    self.assertIn(constants.INPUT_KEY, extract)
                    self.assertLen(extract[constants.INPUT_KEY],
                                   3)  # 3 examples

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Esempio n. 10
0
def append_tfma_pipeline(pipeline: beam.Pipeline,
                         me_eval_config: me_proto.EvaluationConfig,
                         problem_type: constants.ProblemType,
                         tfma_format: Optional[bool] = False,
                         json_mode: Optional[bool] = False,
                         schema: Optional[Any] = None):
    """Extends a Beam pipeline with a TFMA evaluation built from a config.

    The appended stages read the configured JSONL input files, convert each
    line to a serialized example, batch the examples into record batches and
    run TFMA with a single metrics writer under the configured output path.

    Args:
        pipeline: A beam pipeline.
        me_eval_config: A ME Evaluation Configuration.
        problem_type: Defines what type of problem to expect.
        tfma_format: If true, use TFMA format, if false use Model Evaluation.
        json_mode: Output metrics in a plain text mode.
        schema: Optional tf.metadata schema. If you need to pass multi-tensor
            input to the model, you need to pass the schema.
    """
    data_spec = me_eval_config.data_spec
    input_files = data_spec.input_source_spec.jsonl_file_spec.file_names
    output_path = me_eval_config.output_spec.gcs_sink.path

    def _column_spec_if_set(field_name):
        # Wrap the named key spec in a ColumnSpec only when it is present.
        if data_spec.HasField(field_name):
            return ColumnSpec(getattr(data_spec, field_name))
        return None

    weight_column_spec = _column_spec_if_set('example_weight_key_spec')
    eval_column_specs = EvaluationColumnSpecs(
        ground_truth_column_spec=ColumnSpec(data_spec.label_key_spec),
        example_weight_column_spec=weight_column_spec,
        predicted_score_column_spec=_column_spec_if_set(
            'predicted_score_key_spec'),
        predicted_label_column_spec=_column_spec_if_set(
            'predicted_label_key_spec'),
        predicted_label_id_column_spec=_column_spec_if_set(
            'predicted_label_id_key_spec'))

    # Empty repeated fields are passed on as None rather than as empty lists.
    class_name_list = list(data_spec.labels) or None
    quantile_list = list(data_spec.quantiles) or None
    if data_spec.quantile_index >= 0:
        quantile_index = data_spec.quantile_index
    else:
        quantile_index = None

    tfma_eval_config = tfma_adapter.METoTFMA(class_name_list).eval_config(
        me_eval_config)

    metrics_file = os.path.join(output_path, constants.Pipeline.METRICS_KEY)
    # pylint:disable=no-value-for-parameter
    metrics_writer = tfma.writers.Writer(
        stage_name='WriteMetrics',
        ptransform=_write_metrics(output_file=metrics_file,
                                  problem_type=problem_type,
                                  class_labels=class_name_list,
                                  tfma_format=tfma_format,
                                  json_mode=json_mode))
    me_writers = [metrics_writer]

    coder = tf_example_record.TFExampleBeamRecord(
        physical_format='inmem',
        schema=schema,
        raw_record_column_name=tfma.ARROW_INPUT_COLUMN,
        telemetry_descriptors=None)
    if schema is None:
        tensor_adapter_config = None
    else:
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=coder.ArrowSchema(),
            tensor_representations=coder.TensorRepresentations())

    json_lines = (
        pipeline
        | 'InputFileList' >> beam.Create(input_files)
        | 'ReadText' >> beam.io.textio.ReadAllFromText())
    record_batches = (
        json_lines
        | 'ParseData' >> beam.ParDo(
            JSONToSerializedExample(eval_column_specs=eval_column_specs,
                                    class_list=class_name_list,
                                    quantile_list=quantile_list,
                                    quantile_index=quantile_index))
        | 'ExamplesToRecordBatch' >> coder.BeamSource())
    _ = (record_batches
         | 'ExtractEvaluateAndWriteResults' >>
         tfma.ExtractEvaluateAndWriteResults(
             eval_config=tfma_eval_config,
             writers=me_writers,
             tensor_adapter_config=tensor_adapter_config))