def testDo(self):
  """Runs the executor on 'train'/'eval' examples and validates each split.

  The Examples artifact points at the 'csv_example_gen' test data and the
  ExampleStatistics artifact writes under a per-test output directory. After
  Do() returns, each split's 'stats_tfrecord' output is passed to
  self._validate_stats_output.
  """
  testdata_root = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_root = os.environ.get(
      'TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  stats_output_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(stats_output_dir)

  split_list = ['train', 'eval']

  # Input artifact: examples covering both splits.
  input_examples = standard_artifacts.Examples()
  input_examples.uri = os.path.join(testdata_root, 'csv_example_gen')
  input_examples.split_names = artifact_utils.encode_split_names(split_list)

  # Output artifact: statistics written under the per-test directory.
  output_stats = standard_artifacts.ExampleStatistics()
  output_stats.uri = stats_output_dir
  output_stats.split_names = artifact_utils.encode_split_names(split_list)

  inputs = {executor.EXAMPLES_KEY: [input_examples]}
  outputs = {executor.STATISTICS_KEY: [output_stats]}

  # Run executor.
  executor.Executor().Do(inputs, outputs, exec_properties={})

  # Every split must have produced a stats file.
  for split in split_list:
    self._validate_stats_output(
        os.path.join(output_stats.uri, split, 'stats_tfrecord'))
def testDo(self):
  """Runs the executor with one Examples/ExampleStatistics pair per split.

  Separate 'train' and 'eval' artifacts are passed under the 'input_data' and
  'output' keys. After Do() returns, the 'stats_tfrecord' file under each
  statistics artifact's uri is passed to self._validate_stats_output.
  """
  testdata_root = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_root = os.environ.get(
      'TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  stats_output_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(stats_output_dir)

  # Input artifacts, one per split.
  examples_train = standard_artifacts.Examples(split='train')
  examples_train.uri = os.path.join(testdata_root, 'csv_example_gen/train/')
  examples_eval = standard_artifacts.Examples(split='eval')
  examples_eval.uri = os.path.join(testdata_root, 'csv_example_gen/eval/')

  # Output artifacts, one per split; the trailing '' keeps a trailing
  # separator on each uri.
  stats_train = standard_artifacts.ExampleStatistics(split='train')
  stats_train.uri = os.path.join(stats_output_dir, 'train', '')
  stats_eval = standard_artifacts.ExampleStatistics(split='eval')
  stats_eval.uri = os.path.join(stats_output_dir, 'eval', '')

  inputs = {'input_data': [examples_train, examples_eval]}
  outputs = {'output': [stats_train, stats_eval]}

  # Run executor.
  stats_gen_executor = executor.Executor()
  stats_gen_executor.Do(inputs, outputs, exec_properties={})

  # Check statistics_gen outputs, train first, then eval.
  for stats_artifact in (stats_train, stats_eval):
    self._validate_stats_output(
        os.path.join(stats_artifact.uri, 'stats_tfrecord'))
def testDoWithSchemaAndStatsOptions(self):
  """Runs the executor with a Schema artifact and serialized StatsOptions.

  The StatsOptions (label_feature='company') are passed as JSON through the
  exec properties, and a Schema artifact is supplied alongside the examples.
  After Do() returns, each split's 'stats_tfrecord' output is passed to
  self._validate_stats_output.
  """
  testdata_root = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_root = os.environ.get(
      'TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  stats_output_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(stats_output_dir)

  split_list = ['train', 'eval']

  # Input artifacts: examples plus the schema.
  input_examples = standard_artifacts.Examples()
  input_examples.uri = os.path.join(testdata_root, 'csv_example_gen')
  input_examples.split_names = artifact_utils.encode_split_names(split_list)

  input_schema = standard_artifacts.Schema()
  input_schema.uri = os.path.join(testdata_root, 'schema_gen')

  inputs = {
      executor.EXAMPLES_KEY: [input_examples],
      executor.SCHEMA_KEY: [input_schema],
  }

  # Stats options travel through exec properties as a JSON string.
  stats_options_json = tfdv.StatsOptions(label_feature='company').to_json()
  properties = {executor.STATS_OPTIONS_JSON_KEY: stats_options_json}

  # Output artifact.
  output_stats = standard_artifacts.ExampleStatistics()
  output_stats.uri = stats_output_dir
  output_stats.split_names = artifact_utils.encode_split_names(split_list)
  outputs = {executor.STATISTICS_KEY: [output_stats]}

  # Run executor.
  executor.Executor().Do(inputs, outputs, exec_properties=properties)

  # Every split must have produced a stats file.
  for split in split_list:
    self._validate_stats_output(
        os.path.join(output_stats.uri, split, 'stats_tfrecord'))
def testDo(self):
  """Runs the executor with the 'test' split excluded and checks the outputs.

  The examples declare 'train', 'eval' and 'test' splits, and the exec
  properties exclude 'test'. After Do() returns, the statistics artifact must
  list only 'train' and 'eval', each of those splits' 'stats_tfrecord' output
  is validated, and no 'stats_tfrecord' path may exist under 'test'.
  """
  testdata_root = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_root = os.environ.get(
      'TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  stats_output_dir = os.path.join(output_root, self._testMethodName)
  fileio.makedirs(stats_output_dir)

  kept_splits = ['train', 'eval']

  # Input artifact declaring all three splits.
  input_examples = standard_artifacts.Examples()
  input_examples.uri = os.path.join(testdata_root, 'csv_example_gen')
  input_examples.split_names = artifact_utils.encode_split_names(
      ['train', 'eval', 'test'])
  inputs = {standard_component_specs.EXAMPLES_KEY: [input_examples]}

  # The exclusion list must be serialized before it is handed to Do().
  excluded_json = json_utils.dumps(['test'])
  properties = {standard_component_specs.EXCLUDE_SPLITS_KEY: excluded_json}

  # Output artifact; split_names is left unset here and checked after Do().
  output_stats = standard_artifacts.ExampleStatistics()
  output_stats.uri = stats_output_dir
  outputs = {standard_component_specs.STATISTICS_KEY: [output_stats]}

  # Run executor.
  executor.Executor().Do(inputs, outputs, properties)

  self.assertEqual(
      artifact_utils.encode_split_names(kept_splits),
      output_stats.split_names)

  # Stats must exist for every split that was kept.
  for split in kept_splits:
    self._validate_stats_output(
        os.path.join(output_stats.uri, split, 'stats_tfrecord'))

  # Assert 'test' split is excluded.
  excluded_stats_path = os.path.join(
      output_stats.uri, 'test', 'stats_tfrecord')
  self.assertFalse(fileio.exists(excluded_stats_path))
def testDoWithTwoSchemas(self):
  """Expects ValueError when a schema is supplied in two places.

  A Schema artifact is passed in the input dict while the serialized
  StatsOptions in the exec properties also carry a schema. Do() is expected
  to raise ValueError for this combination.
  """
  testdata_root = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_root = os.environ.get(
      'TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  stats_output_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(stats_output_dir)

  split_list = ['train', 'eval']

  # Input artifacts: examples plus the schema artifact (first schema).
  input_examples = standard_artifacts.Examples()
  input_examples.uri = os.path.join(testdata_root, 'csv_example_gen')
  input_examples.split_names = artifact_utils.encode_split_names(split_list)

  input_schema = standard_artifacts.Schema()
  input_schema.uri = os.path.join(testdata_root, 'schema_gen')

  inputs = {
      executor.EXAMPLES_KEY: [input_examples],
      executor.SCHEMA_KEY: [input_schema],
  }

  # Stats options that embed their own schema (second schema).
  options_with_schema = tfdv.StatsOptions(
      label_feature='company', schema=schema_pb2.Schema())
  properties = {
      executor.STATS_OPTIONS_JSON_KEY: options_with_schema.to_json(),
      executor.EXCLUDE_SPLITS_KEY: json_utils.dumps([]),
  }

  # Output artifact.
  output_stats = standard_artifacts.ExampleStatistics()
  output_stats.uri = stats_output_dir
  output_stats.split_names = artifact_utils.encode_split_names(split_list)
  outputs = {executor.STATISTICS_KEY: [output_stats]}

  # Run executor; the conflicting schemas must be rejected.
  stats_gen_executor = executor.Executor()
  with self.assertRaises(ValueError):
    stats_gen_executor.Do(inputs, outputs, properties)