Beispiel #1
0
    def __init__(self,
                 statistics: types.Channel = None,
                 schema: types.Channel = None,
                 output: Optional[types.Channel] = None,
                 stats: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct an ExampleValidator component.

    Args:
      statistics: A Channel of 'ExampleStatisticsPath' type. This should contain
        at least 'eval' split. Other splits are ignored currently.
      schema: A Channel of 'SchemaPath' type. _required_
      output: Output channel of 'ExampleValidationPath' type.
      stats: Backwards compatibility alias for the 'statistics' argument.
      instance_name: Optional name assigned to this specific instance of
        ExampleValidator. Required only if multiple ExampleValidator components
        are declared in the same pipeline.

    Either `stats` or `statistics` must be present in the arguments.
    """
        # 'stats' is the legacy spelling; prefer 'statistics' when both given.
        statistics = statistics or stats
        # Default the output channel to a fresh ExampleAnomalies artifact.
        output = output or types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[standard_artifacts.ExampleAnomalies()])
        spec = ExampleValidatorSpec(stats=statistics,
                                    schema=schema,
                                    output=output)
        super(ExampleValidator, self).__init__(spec=spec,
                                               instance_name=instance_name)
Beispiel #2
0
    def __init__(self,
                 statistics: types.Channel = None,
                 schema: types.Channel = None,
                 output: Optional[types.Channel] = None,
                 stats: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct an ExampleValidator component.

    Args:
      statistics: A Channel of type `standard_artifacts.ExampleStatistics`. This
        should contain at least 'eval' split. Other splits are currently
        ignored.
      schema: A Channel of type `standard_artifacts.Schema`. _required_
      output: Output channel of type `standard_artifacts.ExampleAnomalies`.
      stats: Backwards compatibility alias for the 'statistics' argument.
      instance_name: Optional name assigned to this specific instance of
        ExampleValidator. Required only if multiple ExampleValidator components
        are declared in the same pipeline.  Either `stats` or `statistics` must
        be present in the arguments.
    """
        if stats:
            # 'stats' is the pre-rename spelling of this argument; warn and
            # fall through to the new name. (Fixed: the message previously
            # named the wrong component, "StatisticsGen".)
            absl.logging.warning(
                'The "stats" argument to the ExampleValidator component has '
                'been renamed to "statistics" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            statistics = stats
        # Default the output channel to a fresh ExampleAnomalies artifact.
        anomalies = output or types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[standard_artifacts.ExampleAnomalies()])
        spec = ExampleValidatorSpec(statistics=statistics,
                                    schema=schema,
                                    anomalies=anomalies)
        super(ExampleValidator, self).__init__(spec=spec,
                                               instance_name=instance_name)
Beispiel #3
0
    def testDo(self):
        """Runs the executor with an excluded split and checks the outputs."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Statistics input covering three splits; 'test' will be excluded.
        stats_input = standard_artifacts.ExampleStatistics()
        stats_input.uri = os.path.join(testdata_dir, 'statistics_gen')
        stats_input.split_names = artifact_utils.encode_split_names(
            ['train', 'eval', 'test'])

        schema_input = standard_artifacts.Schema()
        schema_input.uri = os.path.join(testdata_dir, 'schema_gen')

        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        anomalies_out = standard_artifacts.ExampleAnomalies()
        anomalies_out.uri = os.path.join(out_dir, 'output')

        input_dict = {
            STATISTICS_KEY: [stats_input],
            SCHEMA_KEY: [schema_input],
        }
        output_dict = {
            ANOMALIES_KEY: [anomalies_out],
        }
        # List needs to be serialized before being passed into Do function.
        exec_properties = {EXCLUDE_SPLITS_KEY: json_utils.dumps(['test'])}

        executor.Executor().Do(input_dict, output_dict, exec_properties)

        # Only the non-excluded splits survive on the output artifact.
        self.assertEqual(artifact_utils.encode_split_names(['train', 'eval']),
                         anomalies_out.split_names)

        # Each validated split gets its own SchemaDiff.pb under the output
        # uri, and neither one reports anomalies on the golden data.
        for split in ('train', 'eval'):
            diff_path = os.path.join(anomalies_out.uri, 'Split-%s' % split,
                                     'SchemaDiff.pb')
            self.assertTrue(fileio.exists(diff_path))
            split_anomalies = anomalies_pb2.Anomalies()
            split_anomalies.ParseFromString(
                io_utils.read_bytes_file(diff_path))
            self.assertEqual(0, len(split_anomalies.anomaly_info))

        # Assert 'test' split is excluded.
        self.assertFalse(
            fileio.exists(
                os.path.join(anomalies_out.uri, 'Split-test',
                             'SchemaDiff.pb')))
Beispiel #4
0
    def __init__(self,
                 statistics: types.Channel = None,
                 schema: types.Channel = None,
                 output: Optional[types.Channel] = None,
                 stats: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """An ExampleValidator component for examples.

        TFX has its ExampleValidator component, and this one uses the same
        executor. The reason for this one to exist, is that the TFX
        component does not allow to specify the splits to use, it just
        assumes `train` and `eval`. This component will be unnecessary
        once TFX Transform allows to set the input and output splits
        as other components do.

        Args:
          statistics: A Channel of type `standard_artifacts.ExampleStatistics`.
          schema: A Channel of type `standard_artifacts.Schema`. _required_
          output: Output channel of type `standard_artifacts.ExampleAnomalies`.
          stats: Backwards compatibility alias for the 'statistics' argument.
          instance_name: Optional name assigned to this specific instance of
            ExampleValidator. Required only if multiple ExampleValidator
            components are declared in the same pipeline.
        """
        if stats:
            # 'stats' is the pre-rename spelling of this argument; warn and
            # fall through to the new name. (Fixed: the message previously
            # named the wrong component, "StatisticsGen".)
            logging.warning(
                'The "stats" argument to the ExampleValidator component has '
                'been renamed to "statistics" and is deprecated. Please update'
                ' your usage as support for this argument will be removed'
                ' soon.')
            statistics = stats
        # Default the output channel to a fresh ExampleAnomalies artifact.
        anomalies = output or types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[standard_artifacts.ExampleAnomalies()])
        spec = ExampleValidatorSpec(statistics=statistics,
                                    schema=schema,
                                    anomalies=anomalies)
        super(ExampleValidator, self).__init__(spec=spec,
                                               instance_name=instance_name)
Beispiel #5
0
 def testShow(self, *unused_mocks):
     """Verifies context.show routes an anomalies artifact to its display."""
     # Patch the visualization's display hook with a mock so the call can
     # be observed without rendering anything.
     ctx = interactive_context.InteractiveContext()
     display_mock = mock.MagicMock()
     standard_visualizations.ExampleAnomaliesVisualization.display = display_mock
     display_mock.assert_not_called()
     anomalies_artifact = standard_artifacts.ExampleAnomalies()
     channel = types.Channel(
         type=standard_artifacts.ExampleAnomalies,
         artifacts=[anomalies_artifact])
     ctx.show(channel)
     display_mock.assert_called_with(anomalies_artifact)
Beispiel #6
0
  def __init__(self,
               statistics: types.Channel = None,
               schema: types.Channel = None,
               exclude_splits: Optional[List[Text]] = None,
               output: Optional[types.Channel] = None,
               stats: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Construct an ExampleValidator component.

    Args:
      statistics: A Channel of type `standard_artifacts.ExampleStatistics`. This
        should contain at least 'eval' split. Other splits are currently
        ignored.
      schema: A Channel of type `standard_artifacts.Schema`. _required_
      exclude_splits: Names of splits that the example validator should not
        validate. Default behavior (when exclude_splits is set to None)
        is excluding no splits.
      output: Output channel of type `standard_artifacts.ExampleAnomalies`.
      stats: Backwards compatibility alias for the 'statistics' argument.
      instance_name: Optional name assigned to this specific instance of
        ExampleValidator. Required only if multiple ExampleValidator components
        are declared in the same pipeline.  Either `stats` or `statistics` must
        be present in the arguments.
    """
    if stats:
      # 'stats' is the pre-rename spelling of this argument; warn and fall
      # through to the new name. (Fixed: the message previously named the
      # wrong component, "StatisticsGen".)
      logging.warning(
          'The "stats" argument to the ExampleValidator component has '
          'been renamed to "statistics" and is deprecated. Please update your '
          'usage as support for this argument will be removed soon.')
      statistics = stats
    if exclude_splits is None:
      exclude_splits = []
      logging.info('Excluding no splits because exclude_splits is not set.')
    anomalies = output
    if not anomalies:
      # Pre-populate the output artifact's split names with the statistics
      # splits minus the excluded ones.
      anomalies_artifact = standard_artifacts.ExampleAnomalies()
      statistics_split_names = artifact_utils.decode_split_names(
          artifact_utils.get_single_instance(list(
              statistics.get())).split_names)
      split_names = [
          split for split in statistics_split_names
          if split not in exclude_splits
      ]
      anomalies_artifact.split_names = artifact_utils.encode_split_names(
          split_names)
      anomalies = types.Channel(
          type=standard_artifacts.ExampleAnomalies,
          artifacts=[anomalies_artifact])
    # exclude_splits is serialized because component specs carry primitives.
    spec = ExampleValidatorSpec(
        statistics=statistics,
        schema=schema,
        exclude_splits=json_utils.dumps(exclude_splits),
        anomalies=anomalies)
    super(ExampleValidator, self).__init__(
        spec=spec, instance_name=instance_name)
Beispiel #7
0
  def testGetStatusOutputPathsEntries(self):
    """Checks path-label entries produced from stats output artifacts."""
    # disabled: no entries at all.
    self.assertEmpty(executor_utils.GetStatsOutputPathEntries(True, {}))

    # enabled: every stats/schema/anomalies artifact contributes its uri.
    def _artifact(cls, uri):
      # Tiny helper: construct an artifact and point it at the given uri.
      instance = cls()
      instance.uri = uri
      return instance

    output_dict = {
        standard_component_specs.PRE_TRANSFORM_STATS_KEY: [
            _artifact(standard_artifacts.ExampleStatistics,
                      '/pre_transform_stats')
        ],
        standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY: [
            _artifact(standard_artifacts.Schema, '/pre_transform_schema')
        ],
        standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY: [
            _artifact(standard_artifacts.ExampleAnomalies,
                      '/post_transform_anomalies')
        ],
        standard_component_specs.POST_TRANSFORM_STATS_KEY: [
            _artifact(standard_artifacts.ExampleStatistics,
                      '/post_transform_stats')
        ],
        standard_component_specs.POST_TRANSFORM_SCHEMA_KEY: [
            _artifact(standard_artifacts.Schema, '/post_transform_schema')
        ],
    }

    result = executor_utils.GetStatsOutputPathEntries(False, output_dict)

    expected = {
        labels.PRE_TRANSFORM_OUTPUT_STATS_PATH_LABEL:
            '/pre_transform_stats',
        labels.PRE_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL:
            '/pre_transform_schema',
        labels.POST_TRANSFORM_OUTPUT_ANOMALIES_PATH_LABEL:
            '/post_transform_anomalies',
        labels.POST_TRANSFORM_OUTPUT_STATS_PATH_LABEL:
            '/post_transform_stats',
        labels.POST_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL:
            '/post_transform_schema',
    }
    self.assertEqual(expected, result)
Beispiel #8
0
    def __init__(self,
                 stats: types.Channel,
                 schema: types.Channel,
                 output: Optional[types.Channel] = None,
                 name: Optional[Text] = None):
        """Construct an ExampleValidator component.

    Args:
      stats: A Channel of 'ExampleStatisticsPath' type. This should contain at
        least 'eval' split. Other splits are ignored currently.
      schema: A Channel of 'SchemaPath' type.
      output: Optional output channel of 'ExampleValidationPath' type.
      name: Optional unique name. Necessary iff multiple ExampleValidator
        components are declared in the same pipeline.
    """
        # Default the output channel to a fresh ExampleAnomalies artifact.
        output = output or types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[standard_artifacts.ExampleAnomalies()])
        spec = ExampleValidatorSpec(stats=stats, schema=schema, output=output)
        super(ExampleValidator, self).__init__(spec=spec, name=name)
Beispiel #9
0
    def testDo(self):
        """Runs the executor on golden test data and expects anomalies."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        stats_input = standard_artifacts.ExampleStatistics()
        stats_input.uri = os.path.join(testdata_dir, 'statistics_gen')
        stats_input.split_names = artifact_utils.encode_split_names(['eval'])

        schema_input = standard_artifacts.Schema()
        schema_input.uri = os.path.join(testdata_dir, 'schema_gen')

        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        anomalies_out = standard_artifacts.ExampleAnomalies()
        anomalies_out.uri = os.path.join(out_dir, 'output')

        input_dict = {
            executor.STATISTICS_KEY: [stats_input],
            executor.SCHEMA_KEY: [schema_input],
        }
        output_dict = {
            executor.ANOMALIES_KEY: [anomalies_out],
        }

        executor.Executor().Do(input_dict, output_dict, exec_properties={})

        # The executor writes a single text-format Anomalies proto, and the
        # golden data is expected to produce at least one anomaly.
        self.assertEqual(['anomalies.pbtxt'],
                         tf.io.gfile.listdir(anomalies_out.uri))
        anomalies = io_utils.parse_pbtxt_file(
            os.path.join(anomalies_out.uri, 'anomalies.pbtxt'),
            anomalies_pb2.Anomalies())
        self.assertNotEqual(0, len(anomalies.anomaly_info))
Beispiel #10
0
  def testDo(self):
    """Validates golden eval statistics and expects reported anomalies."""
    testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    stats_input = types.Artifact('ExampleStatsPath', split='eval')
    stats_input.uri = os.path.join(testdata_dir, 'statistics_gen/eval/')

    schema_input = standard_artifacts.Schema()
    schema_input.uri = os.path.join(testdata_dir, 'schema_gen/')

    out_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    anomalies_out = standard_artifacts.ExampleAnomalies()
    anomalies_out.uri = os.path.join(out_dir, 'output')

    input_dict = {
        'stats': [stats_input],
        'schema': [schema_input],
    }
    output_dict = {
        'output': [anomalies_out],
    }

    executor.Executor().Do(input_dict, output_dict, exec_properties={})

    # A single text-format Anomalies proto is written to the output uri,
    # and the golden data is expected to produce at least one anomaly.
    self.assertEqual(['anomalies.pbtxt'],
                     tf.gfile.ListDirectory(anomalies_out.uri))
    anomalies = io_utils.parse_pbtxt_file(
        os.path.join(anomalies_out.uri, 'anomalies.pbtxt'),
        anomalies_pb2.Anomalies())
    self.assertNotEqual(0, len(anomalies.anomaly_info))
Beispiel #11
0
    def _make_base_do_params(self, source_data_dir, output_data_dir):
        """Builds the shared input/output/exec-property fixtures for Do() tests.

        Args:
          source_data_dir: Directory holding checked-in test inputs (schema).
          output_data_dir: Directory under which output artifact uris are
            rooted.
        """
        # Create input dict.
        example1 = standard_artifacts.Examples()
        example1.uri = self._ARTIFACT1_URI
        example1.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        # Second examples artifact shares splits with the first but points at
        # a different uri.
        example2 = copy.deepcopy(example1)
        example2.uri = self._ARTIFACT2_URI

        self._example_artifacts = [example1, example2]

        schema_artifact = standard_artifacts.Schema()
        schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')

        # Only the first examples artifact is fed as input by default.
        self._input_dict = {
            standard_component_specs.EXAMPLES_KEY: self._example_artifacts[:1],
            standard_component_specs.SCHEMA_KEY: [schema_artifact],
        }

        # Create output dict.
        self._transformed_output = standard_artifacts.TransformGraph()
        self._transformed_output.uri = os.path.join(output_data_dir,
                                                    'transformed_graph')
        transformed1 = standard_artifacts.Examples()
        transformed1.uri = os.path.join(output_data_dir,
                                        'transformed_examples', '0')
        transformed2 = standard_artifacts.Examples()
        transformed2.uri = os.path.join(output_data_dir,
                                        'transformed_examples', '1')

        self._transformed_example_artifacts = [transformed1, transformed2]

        # Scratch directory for the executor's temporary files.
        temp_path_output = _TempPath()
        temp_path_output.uri = tempfile.mkdtemp()
        self._updated_analyzer_cache_artifact = standard_artifacts.TransformCache(
        )
        # NOTE(review): this reads self._output_data_dir while every other
        # path in this method uses the output_data_dir parameter -- confirm
        # the attribute is set and intentionally identical to the parameter.
        self._updated_analyzer_cache_artifact.uri = os.path.join(
            self._output_data_dir, 'CACHE')

        # Pre/post-transform statistics, schema, and anomalies outputs, each
        # under its own subdirectory.
        self._pre_transform_schema = standard_artifacts.Schema()
        self._pre_transform_schema.uri = os.path.join(output_data_dir,
                                                      'pre_transform_schema',
                                                      '0')
        self._pre_transform_stats = standard_artifacts.ExampleStatistics()
        self._pre_transform_stats.uri = os.path.join(output_data_dir,
                                                     'pre_transform_stats',
                                                     '0')
        self._post_transform_schema = standard_artifacts.Schema()
        self._post_transform_schema.uri = os.path.join(
            output_data_dir, 'post_transform_schema', '0')
        self._post_transform_stats = standard_artifacts.ExampleStatistics()
        self._post_transform_stats.uri = os.path.join(output_data_dir,
                                                      'post_transform_stats',
                                                      '0')
        self._post_transform_anomalies = standard_artifacts.ExampleAnomalies()
        self._post_transform_anomalies.uri = os.path.join(
            output_data_dir, 'post_transform_anomalies', '0')

        self._output_dict = {
            standard_component_specs.TRANSFORM_GRAPH_KEY:
            [self._transformed_output],
            standard_component_specs.TRANSFORMED_EXAMPLES_KEY:
            self._transformed_example_artifacts[:1],
            executor.TEMP_PATH_KEY: [temp_path_output],
            standard_component_specs.UPDATED_ANALYZER_CACHE_KEY:
            [self._updated_analyzer_cache_artifact],
            standard_component_specs.PRE_TRANSFORM_STATS_KEY:
            [self._pre_transform_stats],
            standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY:
            [self._pre_transform_schema],
            standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY:
            [self._post_transform_anomalies],
            standard_component_specs.POST_TRANSFORM_STATS_KEY:
            [self._post_transform_stats],
            standard_component_specs.POST_TRANSFORM_SCHEMA_KEY:
            [self._post_transform_schema],
        }

        # Create exec properties skeleton.
        self._exec_properties = {}