Example #1
    def compare_stats_for_examples(self,
                                   examples_id,
                                   other_examples_id,
                                   name='',
                                   other_name=''):
        """Compares stats for `examples_id` and `other_examples_id`.

    Args:
      examples_id: A `int` indicating the id of one `TFXArtifactTypes.EXAMPLES`
          artifact.
      other_examples_id: A `int` indicating the id of another
          `TFXArtifactTypes.EXAMPLES` artifact.
      name: (Optional) A `str` indicating the label to use for stats of
          `examples_id`.
      other_name: (Optional) A `str` indicating the label to use for stats of
          `other_examples_id`.
    """
        stats_artifact, other_stats_artifact = (
            self.get_dest_artifact_of_type(examples_id,
                                           TFXArtifactTypes.EXAMPLE_STATS),
            self.get_dest_artifact_of_type(other_examples_id,
                                           TFXArtifactTypes.EXAMPLE_STATS))
        if stats_artifact and other_stats_artifact:
            tfdv.visualize_statistics(tfdv.load_statistics(stats_artifact.uri),
                                      rhs_statistics=tfdv.load_statistics(
                                          other_stats_artifact.uri),
                                      lhs_name=name,
                                      rhs_name=other_name)
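Example #1 drives the comparison through a lineage-helper class; a minimal standalone sketch of the same side-by-side view, using only public TFDV calls and hypothetical stats paths, could look like this:

import tensorflow_data_validation as tfdv

# Load two previously materialized DatasetFeatureStatisticsList protos.
# Both paths are placeholders, not part of the example above.
train_stats = tfdv.load_statistics('/tmp/train_stats.tfrecord')
eval_stats = tfdv.load_statistics('/tmp/eval_stats.tfrecord')

# Render both statistics sets side by side in a notebook.
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=eval_stats,
    lhs_name='TRAIN',
    rhs_name='EVAL')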
Example #2
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    for split in artifact_utils.decode_split_names(stats_artifact.split_names):
      if split in exclude_splits:
        continue

      logging.info('Processing schema from statistics for split %s.', split)
      stats_uri = io_utils.get_only_uri_in_dir(
          os.path.join(stats_artifact.uri, split))
      if not schema:
        schema = tfdv.infer_schema(
            tfdv.load_statistics(stats_uri), infer_feature_shape)
      else:
        schema = tfdv.update_schema(schema, tfdv.load_statistics(stats_uri),
                                    infer_feature_shape)

    output_uri = os.path.join(
        artifact_utils.get_single_uri(output_dict[SCHEMA_KEY]),
        _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
Example #3
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.

    Returns:
      None
    """
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict['stats'], 'train'))
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        infer_feature_shape = exec_properties['infer_feature_shape']
        absl.logging.info('Inferring schema from statistics.')
        schema = tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                   infer_feature_shape)
        io_utils.write_pbtxt_file(output_uri, schema)
        absl.logging.info('Schema written to %s.' % output_uri)
Example #4
 def _validate_stats_output(self, stats_path):
     self.assertTrue(tf.io.gfile.exists(stats_path))
     stats = tfdv.load_statistics(stats_path)
     self.assertLen(stats.datasets, 1)
     data_set = stats.datasets[0]
     self.assertGreater(data_set.num_examples, 0)
     self.assertNotEmpty(data_set.features)
Example #5
    def test_e2e(self, stats_options, expected_stats_pbtxt,
                 expected_schema_pbtxt):
        tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
            self._input_file, ['tfdv', 'test'])
        stats_file = os.path.join(self._output_dir, 'stats')
        with beam.Pipeline() as p:
            _ = (p
                 | 'TFXIORead' >> tfxio.BeamSource()
                 | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
                 | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))

        actual_stats = tfdv.load_statistics(stats_file)
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self,
            text_format.Parse(
                expected_stats_pbtxt,
                statistics_pb2.DatasetFeatureStatisticsList()))([actual_stats])
        actual_schema = tfdv.infer_schema(actual_stats,
                                          infer_feature_shape=True)

        if hasattr(actual_schema, 'generate_legacy_feature_spec'):
            actual_schema.ClearField('generate_legacy_feature_spec')
        self._assert_schema_equal(
            actual_schema,
            text_format.Parse(expected_schema_pbtxt, schema_pb2.Schema()))
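The test above wires tfdv.GenerateStatistics into an explicit Beam pipeline over sequence examples; for plain tf.Example TFRecords, roughly the same stats-then-schema flow can be sketched with TFDV's convenience helper (paths are placeholders, not from the original test):

import tensorflow_data_validation as tfdv

# Compute statistics over a TFRecord of serialized tf.Example protos and
# materialize them; the resulting proto is also returned for immediate use.
stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/tmp/examples.tfrecord',   # assumed input path
    output_path='/tmp/stats.tfrecord')        # assumed output path

# Infer a schema from the freshly computed statistics.
schema = tfdv.infer_schema(stats, infer_feature_shape=True)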
Example #6
    def _provide_schema(self, input_dict,
                        exec_properties) -> schema_pb2.Schema:
        """Generates schema from either schema or statistics."""
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        stats = input_dict.get('stats') or input_dict.get('statistics')
        schema = input_dict.get('schema')

        if bool(stats) == bool(schema):
            raise ValueError(
                'Exactly one of schema or stats must be provided')

        if schema:
            schema_uri = artifact_utils.get_single_uri(schema)
            absl.logging.info('Schema is provided. Reading from %s.' %
                              schema_uri)
            schema_reader = io_utils.SchemaReader()
            try:
                return schema_reader.read(
                    os.path.join(schema_uri, _DEFAULT_FILE_NAME))

            except tf.errors.NotFoundError:
                raise ValueError(
                    'Schema is provided, but failed to read from %s.' %
                    schema_uri)

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(stats, 'train'))
        infer_feature_shape = exec_properties['infer_feature_shape']
        return tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                 infer_feature_shape)
Example #7
 def _validate_stats_output(self, stats_path):
     self.assertTrue(tf.gfile.Exists(stats_path))
     stats = tfdv.load_statistics(stats_path)
     self.assertEqual(1, len(stats.datasets))
     data_set = stats.datasets[0]
     self.assertGreater(data_set.num_examples, 0)
     self.assertNotEqual(0, len(data_set.features))
Example #8
 def _validate_stats_output(self, stats_path):
   self.assertTrue(tf.gfile.Exists(stats_path))
   stats = tfdv.load_statistics(stats_path)
   self.assertEqual(1, len(stats.datasets))
   data_set = stats.datasets[0]
   self.assertGreater(data_set.num_examples, 0)
   self.assertNotEqual(0, len(data_set.features))
Example #9
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)
        logging.info('Validating schema against the computed statistics.')

        split_uris: List[Text] = []
        for artifact in input_dict[executor.STATISTICS_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_uris.append(split)

        label_inputs = {
            labels.STATS:
            tfdv.load_statistics(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_split_uri(
                        input_dict[executor.STATISTICS_KEY], split_uris[0]))),
            labels.SCHEMA:
            io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(
                        input_dict[executor.SCHEMA_KEY])))
        }
        output_uri = artifact_utils.get_single_uri(
            output_dict[executor.ANOMALIES_KEY])
        label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
        self._Validate(label_inputs, label_outputs)
        logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #10
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'train'. Stats on other splits are ignored.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'SchemaPath' artifact of size one.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_stats_uri = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'train'))
    output_uri = os.path.join(
        types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

    infer_feature_shape = False
    tf.logging.info('Inferring schema from statistics.')
    schema = tfdv.infer_schema(
        tfdv.load_statistics(train_stats_uri), infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    tf.logging.info('Schema written to {}.'.format(output_uri))
Example #11
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                types.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = types.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        tf.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #12
def parse_statistics(
    split_name: Text,
    statistics: List[Artifact]) -> statistics_pb2.DatasetFeatureStatisticsList:
    stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri(statistics, split_name))

    stats = tfdv.load_statistics(stats_uri)

    return stats
Example #13
    def display_stats_for_examples(self, examples_id):
        """Displays stats for `examples_id`.

    Args:
      examples_id: A `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
          artifact.
    """
        stats_artifact = self.get_dest_artifact_of_type(
            examples_id, TFXArtifactTypes.EXAMPLE_STATS)
        if stats_artifact:
            tfdv.visualize_statistics(
                tfdv.load_statistics(
                    os.path.join(stats_artifact.uri, 'stats_tfrecord')))
            print("display shema")
            tfdv.display_schema(
                tfdv.infer_schema(statistics=tfdv.load_statistics(
                    os.path.join(stats_artifact.uri, 'stats_tfrecord'))))
Example #14
 def display(self, artifact: types.Artifact):
     from IPython.core.display import display  # pylint: disable=g-import-not-at-top
     from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
     for split in artifact_utils.decode_split_names(artifact.split_names):
         display(HTML('<div><b>%r split:</b></div><br/>' % split))
         stats_path = os.path.join(artifact.uri, split, 'stats_tfrecord')
         stats = tfdv.load_statistics(stats_path)
         tfdv.visualize_statistics(stats)
Example #15
 def validate_stats_against_schema(self):  # type: () -> bool
     stats = tfdv.load_statistics(self.stats_path)
     self.anomalies = tfdv.validate_statistics(stats, self.schema)
     if len(self.anomalies.anomaly_info.items()) > 0:
         logger.error("Anomalies found in training dataset...")
         logger.error(str(self.anomalies.anomaly_info.items()))
         self.upload_anomalies()
         return False
     else:
         logger.info("No anomalies found")
         return True
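Example #15 returns a boolean and uploads the anomalies elsewhere; for interactive inspection, a minimal sketch using only public TFDV calls (both paths are placeholders) might be:

import tensorflow_data_validation as tfdv

# Load materialized statistics and a schema stored as a text proto.
stats = tfdv.load_statistics('/tmp/train_stats.tfrecord')   # assumed path
schema = tfdv.load_schema_text('/tmp/schema.pbtxt')         # assumed path

# Validate and, if anything was flagged, render the anomalies table.
anomalies = tfdv.validate_statistics(stats, schema)
if anomalies.anomaly_info:
    tfdv.display_anomalies(anomalies)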
Example #16
    def parse_schema_from_stats(cls, stats_path):
        # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
        """
        Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given tf.metadata
        DatasetFeatureStatisticsList.

        :param stats_path: tf.metadata DatasetFeatureStatisticsList path
        """
        import tensorflow_data_validation as tfdv
        stats = tfdv.load_statistics(stats_path)
        schema = tfdv.infer_schema(stats)
        return schema_to_feature_spec(schema), schema
Example #17
def infer_schema(stats_path, schema_path):
    """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
    print('Inferring schema from statistics.')
    schema = tfdv.infer_schema(tfdv.load_statistics(stats_path),
                               infer_feature_shape=False)
    print(text_format.MessageToString(schema))
    print('## Statistics ##')
    stats = tfdv.load_statistics(stats_path)
    # print(text_format.MessageToString(stats))
    for d in stats.datasets:
        for f in d.features:
            print(f.name)

    print('Writing schema to output path.')
    file_io.write_string_to_file(schema_path,
                                 text_format.MessageToString(schema))
Example #18
 def display(self, artifact: types.Artifact):
   from IPython.core.display import display  # pylint: disable=g-import-not-at-top
   from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
   for split in artifact_utils.decode_split_names(artifact.split_names):
     display(HTML('<div><b>%r split:</b></div><br/>' % split))
     stats_path = io_utils.get_only_uri_in_dir(
         artifact_utils.get_split_uri([artifact], split))
     if artifact_utils.is_artifact_version_older_than(
         artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
       stats = tfdv.load_statistics(stats_path)
     else:
       stats = tfdv.load_stats_binary(stats_path)
     tfdv.visualize_statistics(stats)
Example #19
def infer_schema(stats_path, schema_path):
    """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
    print('Inferring schema from statistics.')
    schema = tfdv.infer_schema(tfdv.load_statistics(stats_path),
                               infer_feature_shape=False)
    print(text_format.MessageToString(schema))

    print('Writing schema to output path.')
    tfdv.write_schema_text(schema, schema_path)
Example #20
def infer_schema(stats_path, schema_path):
    """Infers a schema from stats in stats_path.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location where the inferred schema is materialized.
    """
    # Inferring schema from statistics
    schema = tfdv.infer_schema(tfdv.load_statistics(stats_path),
                               infer_feature_shape=False)

    # Writing schema to output path
    file_io.write_string_to_file(schema_path,
                                 text_format.MessageToString(schema))
Example #21
    def display_stats_for_examples(self, examples_id, split='train'):
        """Displays stats for `examples_id`.

    Args:
      examples_id: A `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
        artifact.
      split: A `string` specifying the split name, by default 'train' is used.
    """
        stats_artifact = self.get_dest_artifact_of_type(
            examples_id, TFXArtifactTypes.EXAMPLE_STATS)
        if stats_artifact:
            tfdv.visualize_statistics(
                tfdv.load_statistics(
                    os.path.join(stats_artifact.uri, split, 'stats_tfrecord')))
Example #22
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  file_io.write_string_to_file(schema_path, text_format.MessageToString(schema))
Example #23
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):

  import logging
  import time

  import tensorflow_data_validation as tfdv
  import tensorflow_data_validation.statistics.stats_impl

  logging.getLogger().setLevel(logging.INFO)
  logging.info('stats_older_path: %s', stats_older_path)
  logging.info('stats_new_path: %s', stats_new_path)

  if stats_older_path == 'none':
    return ('true', )

  stats1 = tfdv.load_statistics(stats_older_path)
  stats2 = tfdv.load_statistics(stats_new_path)

  schema1 = tfdv.infer_schema(statistics=stats1)
  tfdv.get_feature(schema1, 'duration').drift_comparator.jensen_shannon_divergence.threshold = 0.01
  drift_anomalies = tfdv.validate_statistics(
      statistics=stats2, schema=schema1, previous_statistics=stats1)
  logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

  from google.protobuf.json_format import MessageToDict
  d = MessageToDict(drift_anomalies)
  val = d['driftSkewInfo'][0]['driftMeasurements'][0]['value']
  thresh = d['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
  logging.info('value %s and threshold %s', val, thresh)
  res = 'true'
  if val < thresh:
    res = 'false'
  logging.info('train decision: %s', res)
  return (res, )
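A hypothetical standalone call of the drift check above, outside a Kubeflow pipeline (both GCS paths are placeholders; the data is assumed to contain the 'duration' feature referenced in the component):

# Compare an older statistics snapshot against a new one; 'true' means the
# Jensen-Shannon divergence on 'duration' reached the 0.01 threshold set above.
drift, = tfdv_detect_drift(
    stats_older_path='gs://my-bucket/stats/run1/stats.tfrecord',
    stats_new_path='gs://my-bucket/stats/run2/stats.tfrecord')
print(drift)  # 'true' or 'false'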
Example #24
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location of the schema to be used for validation.
      anomalies_path: Location where the detected anomalies are materialized.
    """
    # Validating schema against the computed statistics
    schema = my_metadata.read_schema(schema_path)

    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)

    # Writing anomalies to anomalies path
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
Example #25
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
    print('Validating schema against the computed statistics.')
    schema = tfdv.load_schema_text(schema_path)
    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)
    print('Detected following anomalies:')
    print(text_format.MessageToString(anomalies))

    print('Writing anomalies to anomalies path.')
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
Example #26
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  print('Validating schema against the computed statistics.')
  schema = taxi.read_schema(schema_path)

  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)
  print('Detected following anomalies:')
  print(text_format.MessageToString(anomalies))

  print('Writing anomalies to anomalies path.')
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
Example #27
def group_stats_and_examples(
    input_dict: Dict[Text, List[types.Artifact]]
) -> List[Tuple[types.Artifact, Dict[Text, DatasetFeatureStatistics]]]:
    result = []
    examples_list = input_dict[EXAMPLES_KEY]
    if len(examples_list) > 1:
        raise ValueError('only one examples artifact expected')

    examples = examples_list[0]
    group = {}
    for split in artifact_utils.decode_split_names(examples.split_names):
        statistics = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                                   split)))
        if len(statistics.datasets) != 1:
            raise ValueError('one statistics set expected')
        group[split] = statistics.datasets[0]
    result.append((examples, group))
    return result
Example #28
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of type `standard_artifacts.ExampleStatistics` which
          should contain the 'eval' split. Stats on other splits are ignored.
        - schema: A list of type `standard_artifacts.Schema` which should
          contain a single schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        absl.logging.info('Validating schema against the computed statistics.')
        label_inputs = {
            labels.STATS:
            tfdv.load_statistics(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                                 'eval'))),
            labels.SCHEMA:
            io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
        }
        output_uri = artifact_utils.get_single_uri(output_dict[ANOMALIES_KEY])
        label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
        self._Validate(label_inputs, label_outputs)
        absl.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #29
 def validate_stats_against_schema(self,
                                   environment=None,
                                   previous_statistics=None,
                                   serving_statistics=None,
                                   ):
     # type: (str, DatasetFeatureStatisticsList, DatasetFeatureStatisticsList) -> bool
     stats = tfdv.load_statistics(self.stats_path)
     self.anomalies = tfdv.validate_statistics(
         stats,
         self.schema,
         environment=environment,
         previous_statistics=previous_statistics,
         serving_statistics=serving_statistics,
     )
     if len(self.anomalies.anomaly_info.items()) > 0:
         logger.error("Anomalies found in training dataset...")
         logger.error(str(self.anomalies.anomaly_info.items()))
         self.upload_anomalies()
         return False
     else:
         logger.info("No anomalies found")
         return True
Example #30
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    tf.logging.info('Validating schema against the computed statistics.')
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema'])))
    stats = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.get_split_uri(input_dict['stats'], 'eval')))
    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #31
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'statistics': A list of 'ExampleStatistics' type which must contain
          split 'train'.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
        infer_feature_shape = bool(
            exec_properties.get(
                standard_component_specs.INFER_FEATURE_SHAPE_KEY, True))

        # Load and deserialize exclude splits from execution properties.
        exclude_splits = json_utils.loads(
            exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                                'null')) or []
        if not isinstance(exclude_splits, list):
            raise ValueError(
                'exclude_splits in execution properties needs to be a '
                'list. Got %s instead.' % type(exclude_splits))

        # Only one schema is generated for all splits.
        schema = None
        stats_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.STATISTICS_KEY])
        for split in artifact_utils.decode_split_names(
                stats_artifact.split_names):
            if split in exclude_splits:
                continue

            logging.info('Processing schema from statistics for split %s.',
                         split)
            stats_uri = io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri([stats_artifact], split))
            if artifact_utils.is_artifact_version_older_than(
                    stats_artifact,
                    artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
                stats = tfdv.load_statistics(stats_uri)
            else:
                stats = tfdv.load_stats_binary(stats_uri)
            if not schema:
                schema = tfdv.infer_schema(stats, infer_feature_shape)
            else:
                schema = tfdv.update_schema(schema, stats, infer_feature_shape)

        output_uri = os.path.join(
            artifact_utils.get_single_uri(
                output_dict[standard_component_specs.SCHEMA_KEY]),
            DEFAULT_FILE_NAME)
        io_utils.write_pbtxt_file(output_uri, schema)
        logging.info('Schema written to %s.', output_uri)
Example #32
    def Do(self, input_dict: Dict[Text, List[Artifact]],
           output_dict: Dict[Text, List[Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Generate MetaFeatures for meta training datasets.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - statistics: output from StatisticsGen component.
      output_dict: Output dict from key to a list of artifacts.
      exec_properties: A dict of execution properties
    """

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict[STATISTICS_KEY], 'train'))

        stats = tfdv.load_statistics(train_stats_uri)

        if len(stats.datasets) != 1:
            raise ValueError(
                'DatasetFeatureStatisticsList proto contains multiple datasets. Only '
                'one dataset is currently supported.')
        stats = stats.datasets[0]

        num_float_features = 0
        num_int_features = 0
        num_categorical_features = 0
        for feature in stats.features:

            name = feature.name

            # For structured fields, name is set by path and is not in the name
            # attribute.
            if not name:
                name = feature.path.step[0]
            logging.info('Feature Name: %s', name)

            if statistics_pb2.FeatureNameStatistics.FLOAT == feature.type:
                num_float_features += 1
            elif statistics_pb2.FeatureNameStatistics.INT == feature.type:
                num_int_features += 1
            else:
                num_categorical_features += 1

        metafeature_dict = {
            'num_examples': stats.num_examples,
            'num_int_features': num_int_features,
            'num_float_features': num_float_features,
            'num_categorical_features': num_categorical_features,
        }

        metafeature_dict['metafeature'] = [
            stats.num_examples, num_int_features, num_float_features,
            num_categorical_features
        ]

        metafeature_path = os.path.join(
            artifact_utils.get_single_uri(output_dict[METAFEATURES_KEY]),
            artifacts.MetaFeatures.DEFAULT_FILE_NAME)

        io_utils.write_string_file(metafeature_path,
                                   json.dumps(metafeature_dict))
        logging.info('MetaFeature saved at %s', metafeature_path)
Example #33
 def display(self, artifact: types.Artifact):
   stats_path = os.path.join(artifact.uri, 'stats_tfrecord')
   stats = tfdv.load_statistics(stats_path)
   tfdv.visualize_statistics(stats)
Example #34
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """TensorFlow ExampleValidator executor entrypoint.

    This validates statistics against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - statistics: A list of type `standard_artifacts.ExampleStatistics`
          generated by StatisticsGen.
        - schema: A list of type `standard_artifacts.Schema` which should
          contain a single schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'standard_artifacts.ExampleAnomalies' of size one.
          It will include a single binary proto file which contains all
          anomalies found.
      exec_properties: A dict of execution properties.
        - exclude_splits: JSON-serialized list of names of splits that the
          example validator should not validate.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Load and deserialize exclude splits from execution properties.
        exclude_splits = json_utils.loads(
            exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                                'null')) or []
        if not isinstance(exclude_splits, list):
            raise ValueError(
                'exclude_splits in execution properties needs to be a '
                'list. Got %s instead.' % type(exclude_splits))
        # Setup output splits.
        stats_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.STATISTICS_KEY])
        stats_split_names = artifact_utils.decode_split_names(
            stats_artifact.split_names)
        split_names = [
            split for split in stats_split_names if split not in exclude_splits
        ]
        anomalies_artifact = artifact_utils.get_single_instance(
            output_dict[standard_component_specs.ANOMALIES_KEY])
        anomalies_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[standard_component_specs.SCHEMA_KEY])))

        for split in artifact_utils.decode_split_names(
                stats_artifact.split_names):
            if split in exclude_splits:
                continue

            logging.info(
                'Validating schema against the computed statistics for '
                'split %s.', split)
            stats_uri = io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri([stats_artifact], split))
            if artifact_utils.is_artifact_version_older_than(
                    stats_artifact,
                    artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
                stats = tfdv.load_statistics(stats_uri)
            else:
                stats = tfdv.load_stats_binary(stats_uri)
            label_inputs = {
                standard_component_specs.STATISTICS_KEY: stats,
                standard_component_specs.SCHEMA_KEY: schema
            }
            output_uri = artifact_utils.get_split_uri(
                output_dict[standard_component_specs.ANOMALIES_KEY], split)
            label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
            self._Validate(label_inputs, label_outputs)
            logging.info(
                'Validation complete for split %s. Anomalies written to '
                '%s.', split, output_uri)