Example no. 1
0
    def testDo(self):
        """Runs the StatisticsGen executor and validates stats for both splits."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.io.gfile.makedirs(out_dir)

        # Input artifact: examples produced by CsvExampleGen, with two splits.
        example_artifact = standard_artifacts.Examples()
        example_artifact.uri = os.path.join(testdata_dir, 'csv_example_gen')
        example_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        # Output artifact: statistics written under one sub-directory per split.
        stats_artifact = standard_artifacts.ExampleStatistics()
        stats_artifact.uri = out_dir
        stats_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        input_dict = {executor.EXAMPLES_KEY: [example_artifact]}
        output_dict = {executor.STATISTICS_KEY: [stats_artifact]}

        # Execute with default properties.
        executor.Executor().Do(input_dict, output_dict, exec_properties={})

        # Check that a stats file was produced for each split.
        for split in ('train', 'eval'):
            self._validate_stats_output(
                os.path.join(stats_artifact.uri, split, 'stats_tfrecord'))
Example no. 2
0
  def testDo(self):
    """Runs the executor with one Examples/Statistics artifact per split."""
    testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    out_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    tf.io.gfile.makedirs(out_dir)

    # One Examples input artifact per split, pointing at CsvExampleGen testdata.
    split_examples = {}
    for split in ('train', 'eval'):
      artifact = standard_artifacts.Examples(split=split)
      artifact.uri = os.path.join(testdata_dir, 'csv_example_gen/%s/' % split)
      split_examples[split] = artifact

    # Matching per-split ExampleStatistics output artifacts.
    split_stats = {}
    for split in ('train', 'eval'):
      artifact = standard_artifacts.ExampleStatistics(split=split)
      artifact.uri = os.path.join(out_dir, split, '')
      split_stats[split] = artifact

    input_dict = {
        'input_data': [split_examples['train'], split_examples['eval']],
    }
    output_dict = {
        'output': [split_stats['train'], split_stats['eval']],
    }

    # Execute with default properties.
    stats_gen_executor = executor.Executor()
    stats_gen_executor.Do(input_dict, output_dict, exec_properties={})

    # Check that a stats file was produced for each split.
    for split in ('train', 'eval'):
      self._validate_stats_output(
          os.path.join(split_stats[split].uri, 'stats_tfrecord'))
Example no. 3
0
    def testDoWithSchemaAndStatsOptions(self):
        """Runs the executor with a schema artifact plus JSON StatsOptions."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.io.gfile.makedirs(out_dir)

        # Examples input with two splits.
        example_artifact = standard_artifacts.Examples()
        example_artifact.uri = os.path.join(testdata_dir, 'csv_example_gen')
        example_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        # Schema input produced by SchemaGen.
        schema_artifact = standard_artifacts.Schema()
        schema_artifact.uri = os.path.join(testdata_dir, 'schema_gen')

        input_dict = {
            executor.EXAMPLES_KEY: [example_artifact],
            executor.SCHEMA_KEY: [schema_artifact]
        }

        # StatsOptions are passed to the executor serialized as JSON.
        exec_properties = {
            executor.STATS_OPTIONS_JSON_KEY:
                tfdv.StatsOptions(label_feature='company').to_json(),
        }

        # Output artifact for the generated statistics.
        stats_artifact = standard_artifacts.ExampleStatistics()
        stats_artifact.uri = out_dir
        stats_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        output_dict = {executor.STATISTICS_KEY: [stats_artifact]}

        # Execute with the schema and stats-options inputs.
        executor.Executor().Do(
            input_dict, output_dict, exec_properties=exec_properties)

        # Check that a stats file was produced for each split.
        for split in ('train', 'eval'):
            self._validate_stats_output(
                os.path.join(stats_artifact.uri, split, 'stats_tfrecord'))
Example no. 4
0
    def testDo(self):
        """Verifies that splits listed in EXCLUDE_SPLITS_KEY are skipped."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        fileio.makedirs(out_dir)

        # Examples input with three splits; 'test' is excluded below.
        example_artifact = standard_artifacts.Examples()
        example_artifact.uri = os.path.join(testdata_dir, 'csv_example_gen')
        example_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval', 'test'])
        input_dict = {
            standard_component_specs.EXAMPLES_KEY: [example_artifact],
        }

        # The exclude-splits list must be JSON-serialized before Do().
        exec_properties = {
            standard_component_specs.EXCLUDE_SPLITS_KEY:
                json_utils.dumps(['test']),
        }

        # Output artifact; split_names are filled in by the executor.
        stats_artifact = standard_artifacts.ExampleStatistics()
        stats_artifact.uri = out_dir
        output_dict = {
            standard_component_specs.STATISTICS_KEY: [stats_artifact],
        }

        executor.Executor().Do(input_dict, output_dict, exec_properties)

        # The output artifact should record only the non-excluded splits.
        self.assertEqual(artifact_utils.encode_split_names(['train', 'eval']),
                         stats_artifact.split_names)

        # Stats exist for train/eval...
        for split in ('train', 'eval'):
            self._validate_stats_output(
                os.path.join(stats_artifact.uri, split, 'stats_tfrecord'))

        # ...but not for the excluded 'test' split.
        self.assertFalse(
            fileio.exists(
                os.path.join(stats_artifact.uri, 'test', 'stats_tfrecord')))
Example no. 5
0
    def testDoWithTwoSchemas(self):
        """A schema artifact plus a schema inside StatsOptions must raise."""
        testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        out_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.io.gfile.makedirs(out_dir)

        # Examples input with two splits.
        example_artifact = standard_artifacts.Examples()
        example_artifact.uri = os.path.join(testdata_dir, 'csv_example_gen')
        example_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        # First schema: supplied as an input artifact.
        schema_artifact = standard_artifacts.Schema()
        schema_artifact.uri = os.path.join(testdata_dir, 'schema_gen')

        input_dict = {
            executor.EXAMPLES_KEY: [example_artifact],
            executor.SCHEMA_KEY: [schema_artifact]
        }

        # Second schema: embedded in the serialized StatsOptions, conflicting
        # with the schema artifact above.
        exec_properties = {
            executor.STATS_OPTIONS_JSON_KEY:
                tfdv.StatsOptions(
                    label_feature='company',
                    schema=schema_pb2.Schema()).to_json(),
            executor.EXCLUDE_SPLITS_KEY:
                json_utils.dumps([])
        }

        # Output artifact for the generated statistics.
        stats_artifact = standard_artifacts.ExampleStatistics()
        stats_artifact.uri = out_dir
        stats_artifact.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        output_dict = {executor.STATISTICS_KEY: [stats_artifact]}

        # The executor must reject the two conflicting schemas.
        stats_gen_executor = executor.Executor()
        with self.assertRaises(ValueError):
            stats_gen_executor.Do(input_dict, output_dict, exec_properties)