Example 1
    def testPredictionsExtractorWithRegressionModel(self):
        temp_export_dir = self._getExportDir()
        export_dir, _ = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             temp_export_dir, None))

        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        feature {
          name: "prediction"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        feature {
          name: "fixed_float"
          type: FLOAT
        }
        feature {
          name: "fixed_string"
          type: BYTES
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        examples = [
            self._makeExample(prediction=0.2,
                              label=1.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.8,
                              label=0.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string2'),
            self._makeExample(prediction=0.5,
                              label=0.0,
                              fixed_int=2,
                              fixed_float=1.0,
                              fixed_string='fixed_string3')
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    self.assertIn(constants.PREDICTIONS_KEY, got[0])
                    expected_preds = [0.2, 0.8, 0.5]
                    self.assertAllClose(got[0][constants.PREDICTIONS_KEY],
                                        expected_preds)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example 2
    def test_infer_schema_without_string_domain(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 7
          features: {
            name: 'feature1'
            type: STRING
            string_stats: {
              common_stats: {
                num_missing: 3
                num_non_missing: 4
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
              rank_histogram {
                buckets {
                  low_rank: 0
                  high_rank: 0
                  label: "a"
                  sample_count: 2.0
                }
                buckets {
                  low_rank: 1
                  high_rank: 1
                  label: "b"
                  sample_count: 1.0
                }
                buckets {
                  low_rank: 2
                  high_rank: 2
                  label: "c"
                  sample_count: 1.0
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        expected_schema = text_format.Parse(
            """
        feature {
          name: "feature1"
          value_count: {
            min: 1
            max: 1
          }
          presence: {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        validation_api._may_be_set_legacy_flag(expected_schema)

        # Infer the schema from the stats.
        actual_schema = validation_api.infer_schema(statistics,
                                                    max_string_domain_size=2)
        self.assertEqual(actual_schema, expected_schema)
Example 3
    def test_validate_stats(self):
        schema = text_format.Parse(
            """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count {
            min:1
            max:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
          domain: "MyAloneEnum"
        }
        feature {
          name: "ignore_this"
          lifecycle_stage: DEPRECATED
          value_count {
            min:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        statistics = text_format.Parse(
            """
        datasets{
          num_examples: 10
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 3
                num_non_missing: 7
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
              rank_histogram {
                buckets {
                  label: "D"
                  sample_count: 1
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
        expected_anomalies = {
            'annotated_enum':
            text_format.Parse(
                """
      description: "Examples contain values missing from the schema: D (?). "
      severity: ERROR
      short_description: "Unexpected string values"
      reason {
        type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
        short_description: "Unexpected string values"
        description: "Examples contain values missing from the schema: D (?). "
      }
            """, anomalies_pb2.AnomalyInfo())
        }

        # Validate the stats.
        anomalies = validation_api.validate_statistics(statistics, schema)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
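
Outside of tests, the same flow is usually driven through the TFDV public API
rather than validation_api directly. A minimal sketch, assuming the standard
tensorflow_data_validation entry points; the CSV paths are illustrative:

    import tensorflow_data_validation as tfdv

    # Compute statistics from training data, infer a schema, then validate
    # statistics computed from evaluation data against that schema.
    train_stats = tfdv.generate_statistics_from_csv('train.csv')
    schema = tfdv.infer_schema(train_stats)
    eval_stats = tfdv.generate_statistics_from_csv('eval.csv')
    anomalies = tfdv.validate_statistics(eval_stats, schema)
    print(anomalies)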
Example 4
    def export_tfx_schema(self) -> schema_pb2.Schema:
        """
        Create a Tensorflow metadata schema from a FeatureSet.

        Returns:
            Tensorflow metadata schema.

        """
        schema = schema_pb2.Schema()

        # List of attributes to copy from fields in the FeatureSet to features
        # in the Tensorflow metadata schema, where the attribute name is the
        # same.
        attributes_to_copy_from_field_to_feature = [
            "name",
            "presence",
            "group_presence",
            "shape",
            "value_count",
            "domain",
            "int_domain",
            "float_domain",
            "string_domain",
            "bool_domain",
            "struct_domain",
            "_natural_language_domain",
            "image_domain",
            "mid_domain",
            "url_domain",
            "time_domain",
            "time_of_day_domain",
        ]

        for _, field in self._fields.items():
            if isinstance(field, Entity):
                continue
            feature = schema_pb2.Feature()
            for attr in attributes_to_copy_from_field_to_feature:
                if getattr(field, attr) is None:
                    # This corresponds to an unset member in the proto Oneof field.
                    continue
                if issubclass(type(getattr(feature, attr)), Message):
                    # Proto message field to copy is an "embedded" field, so MergeFrom()
                    # method must be used.
                    getattr(feature, attr).MergeFrom(getattr(field, attr))
                elif issubclass(type(getattr(feature, attr)),
                                (int, str, bool)):
                    # Proto message field is a simple Python type, so setattr()
                    # can be used.
                    setattr(feature, attr, getattr(field, attr))
                else:
                    warnings.warn(
                        f"Attribute '{attr}' cannot be copied from Field "
                        f"'{field.name}' in FeatureSet '{self.name}' to a "
                        f"Feature in the Tensorflow metadata schema, because "
                        f"the type is neither a Protobuf message nor a "
                        f"Python int, str, or bool")
            # "type" attr is handled separately because the attribute name is different
            # ("dtype" in field and "type" in Feature) and "type" in Feature is only
            # a subset of "dtype".
            feature.type = field.dtype.to_tfx_schema_feature_type()
            schema.feature.append(feature)

        return schema
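
A brief consumption sketch for the schema returned above; `feature_set` stands
in for an existing FeatureSet instance and is not constructed here:

    schema = feature_set.export_tfx_schema()
    for feature in schema.feature:
        # FeatureType is the proto enum behind Feature.type.
        print(feature.name, schema_pb2.FeatureType.Name(feature.type))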
Example 5
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split."""
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Fill this in to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.as_numpy(dataset)
    for example in utils.tqdm(np_dataset, unit=" examples", leave=False):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        feature_names = sorted(example.keys())
        for feature_name in feature_names:

            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility in graph and eager mode, we can get PODs here and
            # everything may not be neatly wrapped up in numpy's ndarray.

            feature_dtype = type(feature_np)

            if isinstance(feature_np, np.ndarray):
                # If we have an empty array, then don't proceed further with computing
                # statistics on it.
                if feature_np.size == 0:
                    continue

                feature_dtype = feature_np.dtype.type

            feature_min, feature_max = None, None
            is_numeric = (np.issubdtype(feature_dtype, np.number)
                          or feature_dtype == np.bool_)
            if is_numeric:
                feature_min = np.min(feature_np)
                feature_max = np.max(feature_np)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
            # logic for that.

            # Set or update the min, max.
            if is_numeric:
                if ((feature_name not in feature_to_min)
                        or (feature_to_min[feature_name] > feature_min)):
                    feature_to_min[feature_name] = feature_min

                if ((feature_name not in feature_to_max)
                        or (feature_to_max[feature_name] < feature_max)):
                    feature_to_max[feature_name] = feature_max

    # At this point, we've processed all examples.

    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples.keys()):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the Schema
        # proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto.
            feature.shape.dim.add().size = -1 if dim is None else dim
        feature_type = output_types_dict[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = feature_to_num_examples[
            feature_name]
        common_statistics.num_missing = (statistics.num_examples -
                                         common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added to
        # the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            numeric_statistics.min = feature_to_min[feature_name]
            numeric_statistics.max = feature_to_max[feature_name]
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
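
A hypothetical call site, assuming a prepared TFDS builder (the dataset name
is illustrative):

    import tensorflow_datasets as tfds

    builder = tfds.builder('mnist')
    builder.download_and_prepare()
    statistics, schema = get_dataset_feature_statistics(builder, split='train')
    print(statistics.num_examples)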
Example 6
 def test_validate_stats_invalid_statistics_input(self):
     schema = schema_pb2.Schema()
     with self.assertRaisesRegexp(TypeError, 'statistics is of type.*'):
         _ = validation_api.validate_statistics({}, schema)
Example 7
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils_bqml._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_output_path = os.path.join(working_dir, 'transform_output')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = (
            p
            | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                os.path.join(self._testdata_path, 'csv_example_gen/train/*'),
                coder=beam.coders.BytesCoder(),
                # TODO(b/114938612): Eventually remove this override.
                validate=False)
            | 'DecodeTrainData' >> beam.Map(decoder.decode))
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, legacy_metadata)
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils_bqml.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
             transform_output_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_output/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_output_path,
                     'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    transformed_schema.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
Example 8
  def test_mi_classif_with_int_label_and_categorical_feature(self):
    batch = {}
    batch["label_key"] = [
        np.array([0]),
        np.array([2]),
        np.array([0]),
        np.array([1]),
        np.array([2]),
        np.array([1]),
        np.array([1]),
        np.array([0]),
        np.array([2]),
        np.array([1]),
        np.array([0])
    ]
    # A categorical feature that maps directly onto the label.
    batch["perfect_feature"] = [
        np.array(["Red"]),
        np.array(["Blue"]),
        np.array(["Red"]),
        np.array(["Green"]),
        np.array(["Blue"]),
        np.array(["Green"]),
        np.array(["Green"]),
        np.array(["Red"]),
        np.array(["Blue"]),
        np.array(["Green"]),
        np.array(["Red"])
    ]

    schema = text_format.Parse(
        """
        feature {
          name: "label_key"
          type: INT
          int_domain {
            is_categorical: true
          }
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "perfect_feature"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

    expected = text_format.Parse(
        """
        features {
          name: 'perfect_feature'
          custom_stats {
            name: 'sklearn_adjusted_mutual_information'
            num: 0.9297553
          }
          custom_stats {
            name: 'sklearn_mutual_information'
            num: 1.0900597
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema, "label_key")
Example 9
  def test_mi_with_imputed_numerical_feature(self):
    batch = {}
    batch["label_key"] = [
        np.array([0.1]),
        np.array([0.2]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([0.2]),
        np.array([0.3]),
        np.array([0.1]),
        np.array([0.2]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([0.2]),
        np.array([0.3])
    ]
    batch["fa"] = [
        np.array([0.1]),
        np.array([0.2]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([np.NaN]),
        None,
        np.array([0.1]),
        np.array([0.2]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([0.2]),
        np.array([0.3])
    ]

    schema = text_format.Parse(
        """
        feature {
          name: "fa"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "label_key"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

    expected = text_format.Parse(
        """
        features {
          name: "fa"
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 0.3849224
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 0.4063665
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema, "label_key")
Example 10
    def testPredictionsExtractorWithoutEvalSharedModel(self):
        model_spec1 = config_pb2.ModelSpec(name='model1',
                                           prediction_key='prediction')
        model_spec2 = config_pb2.ModelSpec(name='model2',
                                           prediction_keys={
                                               'output1': 'prediction1',
                                               'output2': 'prediction2'
                                           })
        eval_config = config_pb2.EvalConfig(
            model_specs=[model_spec1, model_spec2])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config)

        schema = text_format.Parse(
            """
        feature {
          name: "prediction"
          type: FLOAT
        }
        feature {
          name: "prediction1"
          type: FLOAT
        }
        feature {
          name: "prediction2"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)

        examples = [
            self._makeExample(prediction=1.0,
                              prediction1=1.0,
                              prediction2=0.0,
                              fixed_int=1),
            self._makeExample(prediction=1.0,
                              prediction1=1.0,
                              prediction2=1.0,
                              fixed_int=1)
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    for model_name in ('model1', 'model2'):
                        self.assertIn(model_name,
                                      got[0][constants.PREDICTIONS_KEY][0])
                    self.assertAllClose(
                        got[0][constants.PREDICTIONS_KEY][0]['model1'],
                        np.array([1.0]))
                    self.assertDictElementsAlmostEqual(
                        got[0][constants.PREDICTIONS_KEY][0]['model2'], {
                            'output1': np.array([1.0]),
                            'output2': np.array([0.0])
                        })

                    for model_name in ('model1', 'model2'):
                        self.assertIn(model_name,
                                      got[0][constants.PREDICTIONS_KEY][1])
                    self.assertAllClose(
                        got[0][constants.PREDICTIONS_KEY][1]['model1'],
                        np.array([1.0]))
                    self.assertDictElementsAlmostEqual(
                        got[0][constants.PREDICTIONS_KEY][1]['model2'], {
                            'output1': np.array([1.0]),
                            'output2': np.array([1.0])
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example 11
  def test_topk_uniques_with_categorical_feature(self):
    examples = [
        pa.Table.from_arrays(
            [pa.array([[12, 23, 34, 12], [45, 23], [12, 12, 34, 45]])], ['fa'])
    ]

    expected_result = [
        text_format.Parse(
            """
      features {
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          top_values {
            value: '12'
            frequency: 4
          }
          top_values {
            value: '45'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "12"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "45"
              sample_count: 2.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "34"
              sample_count: 2.0
            }
          }
        }
    }""", statistics_pb2.DatasetFeatureStatistics()),
        text_format.Parse(
            """
    features {
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          unique: 4
        }
      }""", statistics_pb2.DatasetFeatureStatistics()),
    ]

    schema = text_format.Parse(
        """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        """, schema_pb2.Schema())
    generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        schema=schema, num_top_values=2, num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        examples,
        generator,
        expected_result,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
Example 12
    def testBatchSizeLimit(self):
        temp_export_dir = self._getExportDir()
        _, export_dir = (batch_size_limited_classifier.
                         simple_batch_size_limited_classifier(
                             None, temp_export_dir))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        schema = text_format.Parse(
            """
        feature {
          name: "classes"
          type: BYTES
        }
        feature {
          name: "scores"
          type: FLOAT
        }
        feature {
          name: "labels"
          type: BYTES
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        examples = []
        for _ in range(4):
            examples.append(
                self._makeExample(classes='first', scores=0.0, labels='third'))

        with beam.Pipeline() as pipeline:
            predict_extracts = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for item in got:
                        self.assertIn(constants.PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
Example 13
    def testBatchSizeLimitWithKerasModel(self):
        input1 = tf.keras.layers.Input(shape=(1, ),
                                       batch_size=1,
                                       name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ),
                                       batch_size=1,
                                       name='input2')

        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)

        def add_1(tensor):
            return tf.add_n([tensor, tf.constant(1.0, shape=(1, 2))])

        assert_layer = tf.keras.layers.Lambda(add_1)(input_layer)

        model = tf.keras.models.Model(inputs, assert_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input1"
              value {
                dense_tensor {
                  column_name: "input1"
                  shape { dim { size: 1 } }
                }
              }
            }
            tensor_representation {
              key: "input2"
              value {
                dense_tensor {
                  column_name: "input2"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input1"
          type: FLOAT
        }
        feature {
          name: "input2"
          type: FLOAT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        examples = []
        for _ in range(4):
            examples.append(self._makeExample(input1=0.0, input2=1.0))

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            predict_extracts = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            # pylint: enable=no-value-for-parameter
            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for item in got:
                        self.assertIn(constants.PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
Example 14
    def testPredictionsExtractorWithSequentialKerasModel(self):
        # Note that the input will be called 'test_input'
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(1,
                                  activation=tf.nn.sigmoid,
                                  input_shape=(2, ),
                                  name='test')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        train_features = {'test_input': [[0.0, 0.0], [1.0, 1.0]]}
        labels = [[1], [0]]
        example_weights = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "test"
              value {
                dense_tensor {
                  column_name: "test"
                  shape { dim { size: 2 } }
                }
              }
            }
          }
        }
        feature {
          name: "test"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        # Notice that the features are 'test' but the model expects 'test_input'.
        # This tests that the PredictExtractor properly handles this case.
        examples = [
            self._makeExample(test=[0.0, 0.0],
                              non_model_feature=0),  # ignored by the model
            self._makeExample(test=[1.0, 1.0],
                              non_model_feature=1),  # ignored by the model
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for item in got:
                        self.assertIn(constants.PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example 15
 def test_get_multivalent_features(self):
     schema = text_format.Parse(
         """
       feature {
         name: "fa"
         shape {
           dim {
             size: 1
           }
         }
       }
       feature {
         name: "fb"
         type: BYTES
         value_count {
           min: 0
           max: 1
         }
       }
       feature {
         name: "fc"
         value_count {
           min: 1
           max: 18
         }
       }
       feature {
         name: "fd"
         value_count {
           min: 1
           max: 1
         }
       }
       feature {
         name: "fe"
         shape {
           dim {
             size: 2
           }
         }
       }
       feature {
         name: "ff"
         shape {
           dim {
             size: 1
           }
           dim {
             size: 1
           }
         }
       }
       feature {
         name: "fg"
         value_count {
           min: 2
         }
       }
       feature {
         name: "fh"
         value_count {
           min: 0
           max: 2
         }
       }""", schema_pb2.Schema())
     expected = set(['fc', 'fe', 'ff', 'fg', 'fh'])
     self.assertEqual(schema_util.get_multivalent_features(schema),
                      expected)
Example 16
  def test_mi_regression_with_float_label_and_numeric_features(self):
    batch = {}
    batch["label_key"] = [
        np.array([0.1]),
        np.array([0.2]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([0.3]),
        np.array([0.9]),
        np.array([0.4]),
        np.array([0.1]),
        np.array([0.0]),
        np.array([0.4]),
        np.array([0.6]),
        np.array([0.4]),
        np.array([0.8])
    ]
    # Maps directly onto the label key
    batch["perfect_feature"] = batch["label_key"]
    # Random floats that do not map onto the label
    batch["terrible_feature"] = [
        np.array([0.4]),
        np.array([0.1]),
        np.array([0.4]),
        np.array([0.4]),
        np.array([0.8]),
        np.array([0.7]),
        np.array([0.2]),
        np.array([0.1]),
        np.array([0.0]),
        np.array([0.4]),
        np.array([0.8]),
        np.array([0.2]),
        np.array([0.5]),
        np.array([0.1])
    ]

    schema = text_format.Parse(
        """
        feature {
          name: "perfect_feature"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "terrible_feature"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "label_key"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

    expected = text_format.Parse(
        """
        features {
          name: "perfect_feature"
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 1.0096965
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 1.1622766
          }
        }
        features {
          name: "terrible_feature"
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 0.0211485
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 0.0211485
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema, "label_key")
Example 17
    def test_validate_stats_with_environment(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 1000
          features {
            name: 'feature'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1000
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        default_environment: "TRAINING"
        default_environment: "SERVING"
        feature {
          name: "label"
          not_in_environment: "SERVING"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        feature {
          name: "feature"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        """, schema_pb2.Schema())

        expected_anomalies_training = {
            'label':
            text_format.Parse(
                """
            description: "Column is completely missing"
            severity: ERROR
            short_description: "Column dropped"
            reason {
              type: SCHEMA_MISSING_COLUMN
              short_description: "Column dropped"
              description: "Column is completely missing"
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        # Validate the stats in TRAINING environment.
        anomalies_training = validation_api.validate_statistics(
            statistics, schema, environment='TRAINING')
        self._assert_equal_anomalies(anomalies_training,
                                     expected_anomalies_training)

        # Validate the stats in SERVING environment.
        anomalies_serving = validation_api.validate_statistics(
            statistics, schema, environment='SERVING')
        self._assert_equal_anomalies(anomalies_serving, {})
Example 18
def _read_schema(proto_path):
    """Reads a TF Metadata schema from the given text proto file."""
    result = schema_pb2.Schema()
    with open(proto_path) as fp:
        text_format.Parse(fp.read(), result)
    return result
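
A round-trip sketch for the helper above; the path is illustrative:

    schema = _read_schema('testdata/schema.pbtxt')
    # Serialize back to the text format to confirm the round trip.
    print(text_format.MessageToString(schema))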
Example 19
    name: "float_feature"
    type: FLOAT
    value_count {
      min: 4
      max: 4
    }
  }
  feature {
    name: "string_feature"
    type: BYTES
    value_count {
      min: 0
      max: 2
    }
  }
""", schema_pb2.Schema())

_IS_LEGACY_SCHEMA = ("generate_legacy_feature_spec"
                     in schema_pb2.Schema.DESCRIPTOR.fields_by_name)

# Enforce a consistent behavior in inferring TensorRepresentations from the
# schema.
if _IS_LEGACY_SCHEMA:
    _SCHEMA.generate_legacy_feature_spec = False

_EXAMPLES = [
    """
  features {
    feature { key: "int_feature" value { int64_list { value: [1] } }
    }
    feature {
Example 20
  def Do(self, input_dict: Dict[Text, List[types.TfxType]],
         output_dict: Dict[Text, List[types.TfxType]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a training_fn callback function provided by
    the user via the module_file parameter.  With the tf.estimator returned by
    this function, the Trainer Executor then builds a TensorFlow model using the
    user-provided tf.estimator.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - transformed_examples: Transformed examples.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None

    Raises:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # TODO(zhitaoli): Deprecate this in a future version.
    if exec_properties.get('custom_config', None):
      cmle_args = exec_properties.get('custom_config',
                                      {}).get('cmle_training_args')
      if cmle_args:
        executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
        tf.logging.warn(
            'Passing \'cmle_training_args\' to trainer directly is deprecated, '
            'please use extension executor at '
            'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')

        return cmle_runner.start_cmle_training(input_dict, output_dict,
                                               exec_properties,
                                               executor_class_path, cmle_args)

    trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                      'trainer_fn')

    # Set up training parameters
    train_files = [
        _all_files_pattern(
            types.get_split_uri(input_dict['transformed_examples'], 'train'))
    ]
    transform_output = types.get_single_uri(input_dict['transform_output'])
    eval_files = [
        _all_files_pattern(
            types.get_split_uri(input_dict['transformed_examples'], 'eval'))
    ]
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))

    train_args = trainer_pb2.TrainArgs()
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties['train_args'], train_args)
    json_format.Parse(exec_properties['eval_args'], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
    # num_steps=None.  Conversion of the proto to python will set the default
    # value of an int as 0 so modify the value here.  Tensorflow will raise an
    # error if num_steps <= 0.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    output_path = types.get_single_uri(output_dict['output'])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    # Assemble warm start path if needed.
    warm_start_from = None
    if exec_properties.get('warm_starting') and exec_properties.get(
        'warm_start_from'):
      previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                        path_utils.SERVING_MODEL_DIR)
      if previous_model_dir and tf.gfile.Exists(
          os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
        warm_start_from = previous_model_dir

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    hparams = tf.contrib.training.HParams(
        # A list of uris for train files.
        train_files=train_files,
        # A single uri for transform graph produced by TFT.
        transform_output=transform_output,
        # A single uri for the output directory of the serving model.
        serving_model_dir=serving_model_dir,
        # A list of uris for eval files.
        eval_files=eval_files,
        # A single uri for schema file.
        schema_file=schema_file,
        # Number of train steps.
        train_steps=train_steps,
        # Number of eval steps.
        eval_steps=eval_steps,
        # A single uri for the model directory to warm start from.
        warm_start_from=warm_start_from)

    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    training_spec = trainer_fn(hparams, schema)

    # Train the model
    tf.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    tf.logging.info('Training complete.  Model written to %s',
                    serving_model_dir)

    # Export an eval savedmodel for TFMA
    tf.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
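
For orientation, a sketch of how the exec_properties described in the
docstring might look when driving this executor directly; every value below is
illustrative, with train_args/eval_args given as JSON strings to be parsed
into trainer_pb2.TrainArgs/EvalArgs by json_format.Parse:

    exec_properties = {
        'train_args': '{"num_steps": 10000}',
        'eval_args': '{"num_steps": 5000}',
        'module_file': '/path/to/taxi_utils.py',
        'warm_starting': False,
    }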
Example 21
    def testTFJSPredictExtractorWithKerasModel(self, multi_model,
                                               multi_output):
        input1 = tf.keras.layers.Input(shape=(1, ), name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ), name='input2')
        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)
        output_layers = {}
        output_layers['output1'] = (tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid, name='output1')(input_layer))
        if multi_output:
            output_layers['output2'] = (tf.keras.layers.Dense(
                1, activation=tf.nn.sigmoid, name='output2')(input_layer))

        model = tf.keras.models.Model(inputs, output_layers)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        train_features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
        labels = {'output1': [[1], [0]]}
        if multi_output:
            labels['output2'] = [[1], [0]]

        example_weights = {'output1': [1.0, 0.5]}
        if multi_output:
            example_weights['output2'] = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        src_model_path = tempfile.mkdtemp()
        model.save(src_model_path)

        dst_model_path = tempfile.mkdtemp()
        converter.convert([
            '--input_format=tf_saved_model',
            '--saved_model_tags=serve',
            '--signature_name=serving_default',
            src_model_path,
            dst_model_path,
        ])

        model_specs = [config.ModelSpec(name='model1', model_type='tf_js')]
        if multi_model:
            model_specs.append(
                config.ModelSpec(name='model2', model_type='tf_js'))

        eval_config = config.EvalConfig(model_specs=model_specs)
        eval_shared_models = [
            self.createTestEvalSharedModel(
                model_name='model1',
                eval_saved_model_path=dst_model_path,
                model_type='tf_js')
        ]
        if multi_model:
            eval_shared_models.append(
                self.createTestEvalSharedModel(
                    model_name='model2',
                    eval_saved_model_path=dst_model_path,
                    model_type='tf_js'))

        schema = text_format.Parse(
            """
        feature {
          name: "input1"
          type: FLOAT
        }
        feature {
          name: "input2"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        predictor = tfjs_predict_extractor.TFJSPredictExtractor(
            eval_config=eval_config, eval_shared_model=eval_shared_models)

        examples = [
            self._makeExample(input1=0.0, input2=1.0, non_model_feature=0),
            self._makeExample(input1=1.0, input2=0.0, non_model_feature=1),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | predictor.stage_name >> predictor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got = got[0]
                    self.assertIn(constants.PREDICTIONS_KEY, got)
                    self.assertLen(got[constants.PREDICTIONS_KEY], 2)

                    for item in got[constants.PREDICTIONS_KEY]:
                        if multi_model:
                            self.assertIn('model1', item)
                            self.assertIn('model2', item)
                            if multi_output:
                                self.assertIn('Identity', item['model1'])
                                self.assertIn('Identity_1', item['model1'])

                        elif multi_output:
                            self.assertIn('Identity', item)
                            self.assertIn('Identity_1', item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example 22
def infer_schema(
    statistics: statistics_pb2.DatasetFeatureStatisticsList,
    infer_feature_shape: bool = True,
    max_string_domain_size: int = 100,
    schema_transformations: Optional[List[
        Callable[[schema_pb2.Schema, statistics_pb2.DatasetFeatureStatistics],
                 schema_pb2.Schema]]] = None
) -> schema_pb2.Schema:
    """Infers schema from the input statistics.

  Args:
    statistics: A DatasetFeatureStatisticsList protocol buffer. Schema inference
      is currently supported only for lists with a single
      DatasetFeatureStatistics proto or lists with multiple
      DatasetFeatureStatistics protos corresponding to data slices that include
      the default slice (i.e., the slice with all examples). If a list with
      multiple DatasetFeatureStatistics protos is used, this function will infer
      the schema from the statistics corresponding to the default slice.
    infer_feature_shape: A boolean to indicate if shape of the features need to
      be inferred from the statistics.
    max_string_domain_size: Maximum size of the domain of a string feature in
        order to be interpreted as a categorical feature.
    schema_transformations: List of transformation functions to apply to the
        auto-inferred schema. Each transformation function should take the
        schema and statistics as input and should return the transformed schema.
        The transformations are applied in the order provided in the list.

  Returns:
    A Schema protocol buffer.

  Raises:
    TypeError: If the input argument is not of the expected type.
    ValueError: If the input statistics proto contains multiple datasets, none
      of which corresponds to the default slice.
  """
    if not isinstance(statistics, statistics_pb2.DatasetFeatureStatisticsList):
        raise TypeError('statistics is of type %s, should be '
                        'a DatasetFeatureStatisticsList proto.' %
                        type(statistics).__name__)

    # This will raise an exception if there are multiple datasets, none of which
    # corresponds to the default slice.
    dataset_statistics = _get_default_dataset_statistics(statistics)

    _check_for_unsupported_stats_fields(dataset_statistics, 'statistics')

    schema_proto_string = pywrap_tensorflow_data_validation.InferSchema(
        tf.compat.as_bytes(dataset_statistics.SerializeToString()),
        max_string_domain_size)

    # Parse the serialized Schema proto.
    result = schema_pb2.Schema()
    result.ParseFromString(schema_proto_string)

    _may_be_set_legacy_flag(result)

    # TODO(b/113605666): Push this shape inference logic into example validation
    # code.
    if infer_feature_shape:
        _infer_shape(result)

    if schema_transformations is not None:
        for transformation_fn in schema_transformations:
            result = transformation_fn(result, statistics.datasets[0])
    return result
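A minimal usage sketch for infer_schema via the public tensorflow_data_validation (tfdv) API; the DataFrame contents and the deprecate_feature2 transformation are illustrative assumptions, not part of the source above:

import pandas as pd
import tensorflow_data_validation as tfdv
from tensorflow_metadata.proto.v0 import schema_pb2

# Illustrative data only.
df = pd.DataFrame({'feature1': ['a', 'b', 'c'],
                   'feature2': ['x', 'x', 'y']})

stats = tfdv.generate_statistics_from_dataframe(df)

def deprecate_feature2(schema, dataset_stats):
    # Hypothetical transformation: mark 'feature2' as deprecated so that
    # downstream validation ignores it.
    tfdv.get_feature(schema, 'feature2').lifecycle_stage = (
        schema_pb2.DEPRECATED)
    return schema

schema = tfdv.infer_schema(
    stats,
    infer_feature_shape=True,
    schema_transformations=[deprecate_feature2])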
Example n. 23
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a training_fn callback function provided by
    the user via the module_file parameter.  With the tf.estimator returned by
    this function, the Trainer Executor then builds a TensorFlow model using the
    user-provided tf.estimator.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and 'eval'
          splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.
        - custom_config: Optional. Additional parameters to pass to trainer
          function.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties'.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
    trainer_fn = self._GetFn(exec_properties, 'trainer_fn')

    schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

    training_spec = trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA. Under distributed training it must
    # only be written by the chief worker, as is done for the serving
    # savedmodel.
    if _is_chief():
      absl.logging.info('Exporting eval_savedmodel for TFMA.')
      tfma.export.export_eval_savedmodel(
          estimator=training_spec['estimator'],
          export_dir_base=fn_args.eval_model_dir,
          eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

      absl.logging.info('Exported eval_savedmodel to %s.',
                        fn_args.eval_model_dir)
    else:
      absl.logging.info(
          'eval_savedmodel export for TFMA is skipped because '
          'this is not the chief worker.'
      )
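The module_file loaded by this executor must expose a trainer_fn returning the four keys the code above reads ('estimator', 'train_spec', 'eval_spec', 'eval_input_receiver_fn'). A minimal sketch, assuming fn_args carries train_steps/eval_steps and that TFMA's build_parsing_eval_input_receiver_fn helper is available; the feature columns and input_fn are illustrative placeholders:

import tensorflow as tf
import tensorflow_model_analysis as tfma

def trainer_fn(fn_args, schema):
    """Returns the training spec dict consumed by the Trainer executor."""
    # Placeholder feature column; a real module would derive these from the
    # schema and read data from fn_args.train_files / fn_args.eval_files.
    feature_columns = [tf.feature_column.numeric_column('age')]
    estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns)

    def input_fn():
        # Illustrative stand-in for a real file-based input_fn.
        features = {'age': tf.constant([[1.0], [2.0]])}
        labels = tf.constant([[0], [1]])
        return tf.data.Dataset.from_tensors((features, labels)).repeat()

    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=fn_args.train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=fn_args.eval_steps)

    feature_spec = {
        'age': tf.io.FixedLenFeature([1], tf.float32),
        'label': tf.io.FixedLenFeature([1], tf.int64),
    }
    eval_input_receiver_fn = tfma.export.build_parsing_eval_input_receiver_fn(
        feature_spec, label_key='label')

    return {
        'estimator': estimator,
        'train_spec': train_spec,
        'eval_spec': eval_spec,
        'eval_input_receiver_fn': eval_input_receiver_fn,
    }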
 def test_topk_uniques_combiner_with_categorical_feature(self):
   # fa: 4 12, 2 23, 2 34, 2 45
   batches = [
       pa.Table.from_arrays([pa.array([[12, 23, 34, 12], [45, 23]])], ['fa']),
       pa.Table.from_arrays([pa.array([[12, 12, 34, 45]])], ['fa']),
       pa.Table.from_arrays(
           [pa.array([None, None, None, None], type=pa.null())], ['fa']),
   ]
   expected_result = {
       types.FeaturePath(['fa']):
           text_format.Parse(
               """
               path {
                 step: 'fa'
               }
               type: INT
               string_stats {
                 unique: 4
                 top_values {
                   value: '12'
                   frequency: 4
                 }
                 top_values {
                   value: '45'
                   frequency: 2
                 }
                 top_values {
                   value: '34'
                   frequency: 2
                 }
                 top_values {
                   value: '23'
                   frequency: 2
                 }
                 rank_histogram {
                   buckets {
                     low_rank: 0
                     high_rank: 0
                     label: "12"
                     sample_count: 4.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "45"
                     sample_count: 2.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "34"
                     sample_count: 2.0
                   }
                 }
             }""", statistics_pb2.FeatureNameStatistics())
   }
   schema = text_format.Parse(
       """
       feature {
         name: "fa"
         type: INT
         int_domain {
           is_categorical: true
         }
       }
       """, schema_pb2.Schema())
   generator = (
       top_k_uniques_combiner_stats_generator
       .TopKUniquesCombinerStatsGenerator(
           schema=schema, num_top_values=4, num_rank_histogram_buckets=3))
   self.assertCombinerOutputEqual(batches, generator, expected_result)
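The expected frequencies can be verified by hand; a quick sketch of the same counting, independent of the generator (the all-null batch contributes nothing):

from collections import Counter

# 'fa' values flattened from the three batches above.
values = [12, 23, 34, 12, 45, 23, 12, 12, 34, 45]
counts = Counter(values)
# Counter({12: 4, 23: 2, 34: 2, 45: 2}): four unique values, matching the
# expected top_values (tie order among the 2-counts is the generator's own).
print(counts.most_common(4))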
 def test_identify_anomalous_examples_with_max_examples_per_anomaly(self):
     examples = [{
         'annotated_enum': np.array(['D'])
     }, {
         'annotated_enum': np.array(['D'])
     }, {
         'annotated_enum': np.array(['C'])
     }, {
         'feature_not_in_schema': np.array([1])
     }, {
         'feature_not_in_schema': np.array([1])
     }]
     schema = text_format.Parse(
         """
     string_domain {
       name: "MyAloneEnum"
       value: "A"
       value: "B"
       value: "C"
     }
     feature {
       name: "annotated_enum"
       value_count {
         min:1
         max:1
       }
       presence {
         min_count: 0
       }
       type: BYTES
       domain: "MyAloneEnum"
     }
     feature {
       name: "ignore_this"
       lifecycle_stage: DEPRECATED
       value_count {
         min:1
       }
       presence {
         min_count: 1
       }
       type: BYTES
     }
     """, schema_pb2.Schema())
     options = stats_options.StatsOptions(schema=schema)
     max_examples_per_anomaly = 1
     expected_result = [
         ('annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES', [{
             'annotated_enum':
             np.array(['D'])
         }]),
         ('feature_not_in_schema_SCHEMA_NEW_COLUMN', [{
             'feature_not_in_schema':
             np.array([1])
         }])
     ]
     with beam.Pipeline() as p:
         result = (p | beam.Create(examples)
                   | validation_api.IdentifyAnomalousExamples(
                       options, max_examples_per_anomaly))
         util.assert_that(result, util.equal_to(expected_result))
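Note the effect of max_examples_per_anomaly=1 here: both anomaly reasons are triggered by two input examples each ('D' twice, 'feature_not_in_schema' twice), yet only one representative example per anomaly reason is expected in the output.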
  def test_topk_struct_leaves(self):
    batches = [
        pa.Table.from_arrays([
            pa.array([[1.0], [2.0]]),
            pa.array([[{
                'f1': ['a', 'b'],
                'f2': [1, 2]
            }, {
                'f1': ['b'],
            }], [{
                'f1': ['c', 'd'],
                'f2': [2, 3]
            }, {
                'f2': [3]
            }]]),
        ], ['w', 'c']),
        pa.Table.from_arrays([
            pa.array([[3.0]]),
            pa.array([[{
                'f1': ['d'],
                'f2': [4]
            }]]),
        ], ['w', 'c']),
    ]
    schema = text_format.Parse(
        """
        feature {
          name: "c"
          type: STRUCT
          struct_domain {
            feature {
              name: "f2"
              type: INT
              int_domain {
                is_categorical: true
              }
            }
          }
        }
        """, schema_pb2.Schema())
    expected_result = {
        types.FeaturePath(['c', 'f1']):
            text_format.Parse("""
              type: STRING
              string_stats {
                unique: 4
                top_values {
                  value: "d"
                  frequency: 2.0
                }
                top_values {
                  value: "b"
                  frequency: 2.0
                }
                top_values {
                  value: "c"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "d"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "b"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "c"
                    sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values {
                    value: "d"
                    frequency: 5.0
                  }
                  top_values {
                    value: "c"
                    frequency: 2.0
                  }
                  top_values {
                    value: "b"
                    frequency: 2.0
                  }
                  rank_histogram {
                    buckets {
                      label: "d"
                      sample_count: 5.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "c"
                      sample_count: 2.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "b"
                      sample_count: 2.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f1"
              }""", statistics_pb2.FeatureNameStatistics()),
        types.FeaturePath(['c', 'f2']):
            text_format.Parse("""
              string_stats {
                unique: 4
                top_values {
                  value: "3"
                  frequency: 2.0
                }
                top_values {
                  value: "2"
                  frequency: 2.0
                }
                top_values {
                  value: "4"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "3"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "2"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "4"
                    sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values {
                    value: "3"
                    frequency: 4.0
                  }
                  top_values {
                    value: "4"
                    frequency: 3.0
                  }
                  top_values {
                    value: "2"
                    frequency: 3.0
                  }
                  rank_histogram {
                    buckets {
                      label: "3"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "4"
                      sample_count: 3.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "2"
                      sample_count: 3.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f2"
              }""", statistics_pb2.FeatureNameStatistics()),
    }
    generator = (
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=schema,
            weight_feature='w',
            num_top_values=3,
            num_rank_histogram_buckets=3))

    self.assertCombinerOutputEqual(batches, generator, expected_result)
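The weighted frequencies follow from scaling each leaf value's count by its row's 'w'; a small sketch of that arithmetic for the ['c', 'f1'] path (the rows and weights mirror the batches above):

from collections import Counter

# (weight, flattened f1 values) per row of the batches above.
rows = [(1.0, ['a', 'b', 'b']),  # row 1: f1 = [a, b] and [b]
        (2.0, ['c', 'd']),       # row 2: f1 = [c, d]; second struct has no f1
        (3.0, ['d'])]            # row 3: f1 = [d]

weighted = Counter()
for w, values in rows:
    for v in values:
        weighted[v] += w
# Counter({'d': 5.0, 'b': 2.0, 'c': 2.0, 'a': 1.0}), matching
# weighted_string_stats.top_values for ['c', 'f1'].
print(weighted.most_common(3))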
    def test_infer_schema_with_infer_shape(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 7
          features: {
            name: 'feature1'
            type: STRING
            string_stats: {
              common_stats: {
                num_missing: 0
                num_non_missing: 7
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
            }
          }
          features: {
            name: 'feature2'
            type: STRING
            string_stats: {
              common_stats: {
                num_missing: 0
                num_non_missing: 7
                min_num_values: 3
                max_num_values: 3
              }
              unique: 5
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        expected_schema = text_format.Parse(
            """
        feature {
          name: "feature1"
          shape { dim { size: 1 } }
          presence: {
            min_fraction: 1.0
            min_count: 1
          }
          type: BYTES
        }
        feature {
          name: "feature2"
          value_count: { min: 1 }
          presence: {
            min_fraction: 1.0
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        validation_api._may_be_set_legacy_flag(expected_schema)

        # Infer the schema from the stats.
        actual_schema = validation_api.infer_schema(statistics,
                                                    infer_feature_shape=True)
        self.assertEqual(actual_schema, expected_schema)
Example n. 28
 def test_set_domain_invalid_domain(self):
     with self.assertRaisesRegex(TypeError, 'domain is of type'):
         schema_util.set_domain(schema_pb2.Schema(), 'feature', {})
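For contrast, a sketch of a valid call, assuming set_domain accepts a domain proto (the feature name and domain bounds are illustrative):

from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_data_validation.utils import schema_util

schema = schema_pb2.Schema()
schema.feature.add(name='feature', type=schema_pb2.INT)
# A proper domain proto succeeds where the dict above raised TypeError.
schema_util.set_domain(schema, 'feature', schema_pb2.IntDomain(min=0, max=9))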
    def test_validate_stats_with_serving_stats(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 1 }
                buckets { label: "b" sample_count: 2 }
                buckets { label: "c" sample_count: 7 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        serving_statistics = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
                buckets { label: "c" sample_count: 6 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        feature {
          name: 'bar'
          type: BYTES
          skew_comparator {
            infinity_norm { threshold: 0.1}
          }
        }""", schema_pb2.Schema())

        expected_anomalies = {
            'bar':
            text_format.Parse(self._bar_anomaly_info,
                              anomalies_pb2.AnomalyInfo())
        }
        # Validate the stats.
        anomalies = validation_api.validate_statistics(
            statistics, schema, serving_statistics=serving_statistics)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
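The skew anomaly comes from the L-infinity distance between the two normalized 'bar' distributions exceeding the 0.1 threshold; a quick check of the arithmetic from the histograms above:

training = {'a': 1 / 10, 'b': 2 / 10, 'c': 7 / 10}
serving = {'a': 3 / 10, 'b': 1 / 10, 'c': 6 / 10}

# Infinity norm: the largest absolute difference across values.
linf = max(abs(training[k] - serving[k]) for k in training)
print(linf)  # 0.2 (at value 'a'), which exceeds the 0.1 threshold.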
Example n. 30
    def testPredictionsExtractorWithMultiModels(self):
        temp_export_dir = self._getExportDir()
        export_dir1, _ = multi_head.simple_multi_head(temp_export_dir, None)
        export_dir2, _ = multi_head.simple_multi_head(temp_export_dir, None)

        eval_config = config_pb2.EvalConfig(model_specs=[
            config_pb2.ModelSpec(name='model1'),
            config_pb2.ModelSpec(name='model2')
        ])
        eval_shared_model1 = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir1, tags=[tf.saved_model.SERVING])
        eval_shared_model2 = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir2, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        feature {
          name: "age"
          type: FLOAT
        }
        feature {
          name: "langauge"
          type: BYTES
        }
        feature {
          name: "english_label"
          type: FLOAT
        }
        feature {
          name: "chinese_label"
          type: FLOAT
        }
        feature {
          name: "other_label"
          type: FLOAT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        prediction_extractor = predictions_extractor.PredictionsExtractor(
            eval_config=eval_config,
            eval_shared_model={
                'model1': eval_shared_model1,
                'model2': eval_shared_model2
            },
            tensor_adapter_config=tensor_adapter_config)

        examples = [
            self._makeExample(age=1.0,
                              language='english',
                              english_label=1.0,
                              chinese_label=0.0,
                              other_label=0.0),
            self._makeExample(age=1.0,
                              language='chinese',
                              english_label=0.0,
                              chinese_label=1.0,
                              other_label=0.0),
            self._makeExample(age=2.0,
                              language='english',
                              english_label=1.0,
                              chinese_label=0.0,
                              other_label=0.0),
            self._makeExample(age=2.0,
                              language='other',
                              english_label=0.0,
                              chinese_label=1.0,
                              other_label=1.0)
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=4)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | prediction_extractor.stage_name >>
                prediction_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    for item in got:
                        # We can't verify the actual predictions, but we can verify the keys
                        self.assertIn(constants.PREDICTIONS_KEY, item)
                        for pred in item[constants.PREDICTIONS_KEY]:
                            for model_name in ('model1', 'model2'):
                                self.assertIn(model_name, pred)
                                for output_name in ('chinese_head',
                                                    'english_head',
                                                    'other_head'):
                                    for pred_key in ('logistic',
                                                     'probabilities',
                                                     'all_classes'):
                                        self.assertIn(
                                            output_name + '/' + pred_key,
                                            pred[model_name])

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
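Closing note: the '<head>/<key>' prediction keys checked above follow the tf.estimator multi-head convention of prefixing each head's output tensors with the head name; the exact set of keys is specific to the simple_multi_head export used here, so treat the tuples in the loop as model-specific rather than general TFMA behavior.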