def testDoOutputSplit(self):
    """Runs the executor on one input split with a 2:1 train/eval output config."""
    # Create exec properties: one input split, two hash-bucketed output splits.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='single', pattern='single/*'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': json_format.MessageToJson(
            input_config, preserving_proto_field_name=True),
        'output_config': json_format.MessageToJson(output_config),
    }

    # Run executor.
    executor_under_test = TestExampleGenExecutor()
    executor_under_test.Do({}, self._output_dict, exec_properties)

    # Check example gen outputs: both split files must have been written.
    train_path = self._train_output_file
    eval_path = self._eval_output_file
    self.assertTrue(tf.io.gfile.exists(train_path))
    self.assertTrue(tf.io.gfile.exists(eval_path))

    # Output split ratio is train:eval=2:1, so the train file must be larger.
    train_size = tf.io.gfile.GFile(train_path).size()
    eval_size = tf.io.gfile.GFile(eval_path).size()
    self.assertGreater(train_size, eval_size)
def testFeatureBasedPartition(self):
    """Runs the executor with a 2:1 split partitioned on feature 'i'."""
    # Add an output config that partitions on feature 'i' to the exec properties.
    split_config = example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ],
        partition_feature_name='i')
    self._exec_properties[utils.OUTPUT_CONFIG_KEY] = json_format.MessageToJson(
        example_gen_pb2.Output(split_config=split_config))
    self._exec_properties['has_empty'] = False

    # Run executor.
    executor_under_test = TestExampleGenExecutor()
    executor_under_test.Do({}, self._output_dict, self._exec_properties)

    # Check example gen outputs: both split files must have been written.
    train_path = self._train_output_file
    eval_path = self._eval_output_file
    self.assertTrue(tf.io.gfile.exists(train_path))
    self.assertTrue(tf.io.gfile.exists(eval_path))

    # Output split ratio is train:eval=2:1, so the train file must be larger.
    train_size = tf.io.gfile.GFile(train_path).size()
    eval_size = tf.io.gfile.GFile(eval_path).size()
    self.assertGreater(train_size, eval_size)
def setUp(self):
    """Prepares the output artifact, expected file paths and exec properties."""
    super(BaseExampleGenExecutorTest, self).setUp()
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    self._output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict holding a single Examples artifact.
    self._examples = standard_artifacts.Examples()
    self._examples.uri = self._output_data_dir
    self._output_dict = {utils.EXAMPLES_KEY: [self._examples]}

    # Files the executor is expected to write for each split.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    self._train_output_file = os.path.join(
        self._examples.uri, 'train', record_file)
    self._eval_output_file = os.path.join(
        self._examples.uri, 'eval', record_file)

    # Create exec properties for output splits: one input, 2:1 train/eval.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='single', pattern='single/*'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    self._exec_properties = {
        utils.INPUT_CONFIG_KEY: proto_utils.proto_to_json(input_config),
        utils.OUTPUT_CONFIG_KEY: proto_utils.proto_to_json(output_config),
    }
def Do(self, input_dict, output_dict, exec_properties):
    """Reads the input source and writes tf examples for each output split.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Its
        contents depend on the concrete example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: artifacts that receive the generated splits.
      exec_properties: A dict of execution properties. Its contents depend on
        the concrete example gen implementation.
        - output: JSON string of an example_gen_pb2.Output instance that
          provides the output configuration.

    Returns:
      None

    Raises:
      RuntimeError: if output split config is not specified.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Parse the output split information out of the exec properties.
    output_config = example_gen_pb2.Output()
    json_format.Parse(exec_properties['output'], output_config)
    self._check_split_config(output_config.split_config)
    splits = output_config.split_config.splits

    # Cumulative hash bucket boundaries, one entry per split.
    buckets = []
    running_total = 0
    for split in splits:
        running_total += split.hash_buckets
        buckets.append(running_total)

    tf.logging.info('Generating examples.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
        input_to_example = self.GetInputSourceToExamplePTransform()
        serialized_examples = (
            pipeline
            | 'InputSourceToExample' >> input_to_example(
                input_dict, exec_properties)
            # Returns deterministic string as partition is based on it.
            | 'SerializeDeterministically' >> beam.Map(
                lambda x: x.SerializeToString(deterministic=True)))
        example_splits = (
            serialized_examples
            | 'SplitData' >> beam.Partition(
                _partition_fn, len(buckets), buckets))

        # TODO(jyzhao): make shuffle optional.
        # pylint: disable=expression-not-assigned
        for index, example_split in enumerate(example_splits):
            split_name = splits[index].name
            output_prefix = os.path.join(
                types.get_split_uri(output_dict['examples'], split_name),
                DEFAULT_FILE_NAME)
            shuffle_label = 'ShuffleSplit' + split_name
            write_label = 'OutputSplit' + split_name
            (example_split
             | shuffle_label >> beam.transforms.Reshuffle()
             | write_label >> beam.io.WriteToTFRecord(
                 output_prefix, file_name_suffix='.gz'))
        # pylint: enable=expression-not-assigned

    tf.logging.info('Examples generated.')
def get_default_output_config():
    """Returns an Output config with 'train' and 'eval' splits sized 2:1."""
    train_split = example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2)
    eval_split = example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
    split_config = example_gen_pb2.SplitConfig(splits=[train_split, eval_split])
    return example_gen_pb2.Output(split_config=split_config)
def testDoInputSplit(self):
    """Runs the executor with train/eval input splits and an empty output config."""
    # Create exec properties: splits come from the input, output config is empty.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train/*'),
        example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    ])
    exec_properties = {
        'input_config': json_format.MessageToJson(
            input_config, preserving_proto_field_name=True),
        'output_config': json_format.MessageToJson(
            example_gen_pb2.Output(), preserving_proto_field_name=True),
    }

    # Run executor.
    executor_under_test = TestExampleGenExecutor()
    executor_under_test.Do({}, self._output_dict, exec_properties)

    # Check example gen outputs: both split files must have been written.
    train_path = self._train_output_file
    eval_path = self._eval_output_file
    self.assertTrue(tf.io.gfile.exists(train_path))
    self.assertTrue(tf.io.gfile.exists(eval_path))

    # Input train split is bigger than eval split.
    train_size = tf.io.gfile.GFile(train_path).size()
    eval_size = tf.io.gfile.GFile(eval_path).size()
    self.assertGreater(train_size, eval_size)
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    module_file: Text, serving_model_dir: Text,
                    metadata_path: Text,
                    direct_num_workers: int) -> pipeline.Pipeline:
    """Assembles the example pipeline from ExampleGen through Pusher.

    Args:
      pipeline_name: Name passed to the resulting pipeline.
      pipeline_root: Root passed to the resulting pipeline.
      data_root: Location handed to `tfrecord_input` for ImportExampleGen.
      module_file: Module file given to both Transform and Trainer.
      serving_model_dir: Base directory of the Pusher filesystem destination.
      metadata_path: Path used for the sqlite metadata connection config.
      direct_num_workers: Value for the `--direct_num_workers` Beam argument.

    Returns:
      A `pipeline.Pipeline` with caching enabled that contains all components.
    """
    # Ingest the data with a 3:1 train/eval hash split.
    split_config = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
    ])
    output = example_gen_pb2.Output(split_config=split_config)
    examples = tfrecord_input(data_root)
    example_gen = ImportExampleGen(input=examples, output_config=output)

    # Statistics, schema inference and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=True)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Feature transformation and training share the same module file.
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=module_file)
    trainer = Trainer(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=infer_schema.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=100),
        eval_args=trainer_pb2.EvalArgs(num_steps=50))

    # Model evaluation and validation.
    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        eval_config=eval_config)
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])

    # Push the model to the serving directory, using the Evaluator's blessing.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_analyzer.outputs['blessing'],
        push_destination=push_destination)

    components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        model_analyzer,
        model_validator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def testConstructWithOutputConfig(self):
    """Checks the component stores the given three-way output config unchanged."""
    split_specs = [
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
    ]
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=split_specs))

    example_gen = TestFileBasedExampleGenComponent(
        input_base='path', output_config=output_config)

    # The output channel must carry Examples artifacts.
    examples_channel = example_gen.outputs['examples']
    self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                     examples_channel.type_name)

    # The JSON exec property must parse back to the original proto.
    stored_output_config = example_gen_pb2.Output()
    serialized_config = example_gen.exec_properties['output_config']
    json_format.Parse(serialized_config, stored_output_config)
    self.assertEqual(output_config, stored_output_config)
def testConstructWithOutputConfig(self):
    """Checks the component stores the given three-way output config unchanged."""
    split_specs = [
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
    ]
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=split_specs))

    example_gen = TestFileBasedExampleGenComponent(
        input_base='path', output_config=output_config)

    # The output channel must carry Examples artifacts.
    examples_channel = example_gen.outputs[standard_component_specs.EXAMPLES_KEY]
    self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                     examples_channel.type_name)

    # The JSON exec property must convert back to the original proto.
    stored_output_config = example_gen_pb2.Output()
    serialized_config = example_gen.exec_properties[
        standard_component_specs.OUTPUT_CONFIG_KEY]
    proto_utils.json_to_proto(serialized_config, stored_output_config)
    self.assertEqual(output_config, stored_output_config)
def testMakeOutputSplitNames(self):
    """Checks split names for an input-split config and an output-split config."""
    # Case 1: two input splits and an empty output config.
    two_split_input = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train/*'),
        example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    ])
    split_names = utils.generate_output_split_names(
        input_config=two_split_input, output_config=example_gen_pb2.Output())
    self.assertListEqual(['train', 'eval'], split_names)

    # Case 2: a single input split with a train/eval output split config.
    single_split_input = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='single', pattern='single/*'),
    ])
    hashed_output = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    split_names = utils.generate_output_split_names(
        input_config=single_split_input, output_config=hashed_output)
    self.assertListEqual(['train', 'eval'], split_names)
def make_default_output_config(
    input_config: Union[example_gen_pb2.Input, Dict[Text, Any]]
) -> example_gen_pb2.Output:
    """Returns the default output config for the given input config.

    Args:
      input_config: Either an `example_gen_pb2.Input` proto or its dict form,
        which must contain a 'splits' entry.

    Returns:
      An empty `example_gen_pb2.Output` when the input has more than one split,
      otherwise an Output with 'train' and 'eval' splits sized 2:1.
    """
    # Normalize a proto into its dict form so both input kinds share one path.
    if isinstance(input_config, example_gen_pb2.Input):
        input_config = json_format.MessageToDict(
            input_config, including_default_value_fields=True)

    if len(input_config['splits']) > 1:
        # Returns empty output split config as output split will be same as
        # input.
        return example_gen_pb2.Output()

    # Returns 'train' and 'eval' splits with size 2:1.
    train_split = example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2)
    eval_split = example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
    return example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(
            splits=[train_split, eval_split]))
def data_split(file_path):
    """Builds a CsvExampleGen that hashes the data into train/eval/test at 6:2:2.

    Args:
      file_path: Value passed to CsvExampleGen as `input_base`.

    Returns:
      The configured CsvExampleGen component.
    """
    bucket_plan = [
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=6),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=2),
    ]
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=bucket_plan))
    return CsvExampleGen(input_base=file_path, output_config=output_config)
def setUp(self):
    """Builds a two-component pipeline and wraps each component for Kubeflow."""
    super(BaseComponentWithPipelineParamTest, self).setUp()

    # Runtime parameters: a pipeline root param and a parameterized split name.
    test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
    example_gen_output_name = runtime_string_parameter.RuntimeStringParameter(
        name='example-gen-output-name', default='default-to-be-discarded')

    # TFX components under test.
    examples = standard_artifacts.ExternalArtifact()
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(
                name=example_gen_output_name, hash_buckets=10),
        ]))
    example_gen = csv_example_gen_component.CsvExampleGen(
        input=channel_utils.as_channel([examples]),
        output_config=output_config)
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    self._metadata_config.mysql_db_service_host.environment_variable = (
        'MYSQL_SERVICE_HOST')

    def wrap_component(tfx_component):
        # Every wrapper shares the same settings; a fresh empty dependency set
        # is created per call so the wrappers do not share a mutable object.
        return base_component.BaseComponent(
            component=tfx_component,
            component_launcher_class=(
                in_process_component_launcher.InProcessComponentLauncher),
            depends_on=set(),
            pipeline=pipeline,
            pipeline_name=self._test_pipeline_name,
            pipeline_root=test_pipeline_root,
            tfx_image='container_image',
            kubeflow_metadata_config=self._metadata_config,
            component_config=None)

    with dsl.Pipeline('test_pipeline'):
        self.example_gen = wrap_component(example_gen)
        self.statistics_gen = wrap_component(statistics_gen)

    self.tfx_example_gen = example_gen
    self.tfx_statistics_gen = statistics_gen
def testDo(self, mock_client):
    """Runs the BigQuery executor against a mocked client and checks outputs."""
    # Mock query result schema for _BigQueryConverter.
    query_result = (
        mock_client.return_value.query.return_value.result.return_value)
    query_result.schema = self._schema

    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {'examples': [examples]}

    # Create exec properties: one query input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(
            name='bq', pattern='SELECT i, b, f, s FROM `fake`'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': proto_utils.proto_to_json(input_config),
        'output_config': proto_utils.proto_to_json(output_config),
    }

    # Run executor.
    executor_context = base_beam_executor.BaseBeamExecutor.Context(
        beam_pipeline_args=['--project=test-project'])
    big_query_example_gen = executor.Executor(executor_context)
    big_query_example_gen.Do({}, output_dict, exec_properties)

    mock_client.assert_called_with(project='test-project')
    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check BigQuery example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(examples.uri, 'Split-train', record_file)
    eval_output_file = os.path.join(examples.uri, 'Split-eval', record_file)
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    train_size = fileio.open(train_output_file).size()
    eval_size = fileio.open(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def _testFeatureBasedPartition(self, partition_feature_name):
    """Stores a 2:1 output config partitioned on the given feature name.

    Args:
      partition_feature_name: Value used as the split config's
        `partition_feature_name`.
    """
    split_config = example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ],
        partition_feature_name=partition_feature_name)
    output_config = example_gen_pb2.Output(split_config=split_config)
    self._exec_properties[utils.OUTPUT_CONFIG_KEY] = (
        proto_utils.proto_to_json(output_config))
def testConstructWithOutputConfig(self):
    """Checks a BigQueryExampleGen built with an output config emits Examples."""
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    big_query_example_gen = component.BigQueryExampleGen(
        query='query', output_config=output_config)

    examples_channel = big_query_example_gen.outputs['examples']
    self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                     examples_channel.type_name)
def testDo(self, mock_client):
    """Runs the BigQuery executor against a mocked client and checks outputs."""
    # Mock query result schema for _BigQueryConverter.
    query_result = (
        mock_client.return_value.query.return_value.result.return_value)
    query_result.schema = self._schema

    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {'examples': [examples]}

    # Create exec properties: one query input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(
            name='bq', pattern='SELECT i, b, f, s FROM `fake`'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': json_format.MessageToJson(
            input_config, preserving_proto_field_name=True),
        'output_config': json_format.MessageToJson(
            output_config, preserving_proto_field_name=True),
    }

    # Run executor.
    big_query_example_gen = executor.Executor()
    big_query_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check BigQuery example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(examples.uri, 'train', record_file)
    eval_output_file = os.path.join(examples.uri, 'eval', record_file)
    self.assertTrue(tf.io.gfile.exists(train_output_file))
    self.assertTrue(tf.io.gfile.exists(eval_output_file))
    train_size = tf.io.gfile.GFile(train_output_file).size()
    eval_size = tf.io.gfile.GFile(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def testDo(self):
    """Runs the Avro executor and checks the train/eval outputs it writes."""
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {utils.EXAMPLES_KEY: [examples]}

    # Create exec properties: one Avro input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='avro', pattern='avro/*.avro'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        utils.INPUT_BASE_KEY: self._input_data_dir,
        utils.INPUT_CONFIG_KEY: json_format.MessageToJson(
            input_config, preserving_proto_field_name=True),
        utils.OUTPUT_CONFIG_KEY: json_format.MessageToJson(
            output_config, preserving_proto_field_name=True),
    }

    # Run executor.
    avro_example_gen = avro_executor.Executor()
    avro_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check Avro example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(examples.uri, 'train', record_file)
    eval_output_file = os.path.join(examples.uri, 'eval', record_file)
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    train_size = fileio.open(train_output_file).size()
    eval_size = fileio.open(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def testDo(self, mock_client):
    """Runs the BigQuery executor with per-split artifacts and checks outputs."""
    # Mock query result schema for _BigQueryConverter.
    query_result = (
        mock_client.return_value.query.return_value.result.return_value)
    query_result.schema = self._schema

    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict with one artifact per split.
    train_examples = types.TfxArtifact(type_name='ExamplesPath', split='train')
    train_examples.uri = os.path.join(output_data_dir, 'train')
    eval_examples = types.TfxArtifact(type_name='ExamplesPath', split='eval')
    eval_examples.uri = os.path.join(output_data_dir, 'eval')
    output_dict = {'examples': [train_examples, eval_examples]}

    # Create exec properties: one query input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(
            name='bq', pattern='SELECT i, f, s FROM `fake`'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input': json_format.MessageToJson(input_config),
        'output': json_format.MessageToJson(output_config),
    }

    # Run executor.
    big_query_example_gen = executor.Executor()
    big_query_example_gen.Do({}, output_dict, exec_properties)

    # Check BigQuery example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(train_examples.uri, record_file)
    eval_output_file = os.path.join(eval_examples.uri, record_file)
    self.assertTrue(tf.gfile.Exists(train_output_file))
    self.assertTrue(tf.gfile.Exists(eval_output_file))
    train_size = tf.gfile.GFile(train_output_file).size()
    eval_size = tf.gfile.GFile(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def testDo(self):
    """Runs the Presto executor with per-split artifacts and checks outputs."""
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict with one artifact per split.
    train_examples = types.TfxArtifact(type_name='ExamplesPath', split='train')
    train_examples.uri = os.path.join(output_data_dir, 'train')
    eval_examples = types.TfxArtifact(type_name='ExamplesPath', split='eval')
    eval_examples.uri = os.path.join(output_data_dir, 'eval')
    output_dict = {'examples': [train_examples, eval_examples]}

    # Create exec properties: query input, empty custom config, 2:1 output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(
            name='bq', pattern='SELECT i, f, s FROM `fake`'),
    ])
    custom_config = example_gen_pb2.CustomConfig()
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': json_format.MessageToJson(input_config),
        'custom_config': json_format.MessageToJson(custom_config),
        'output_config': json_format.MessageToJson(output_config),
    }

    # Run executor.
    presto_example_gen = executor.Executor()
    presto_example_gen.Do({}, output_dict, exec_properties)

    # Check Presto example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(train_examples.uri, record_file)
    eval_output_file = os.path.join(eval_examples.uri, record_file)
    self.assertTrue(tf.gfile.Exists(train_output_file))
    self.assertTrue(tf.gfile.Exists(eval_output_file))
    train_size = tf.gfile.GFile(train_output_file).size()
    eval_size = tf.gfile.GFile(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def test_construct_with_output_config(self):
    """Checks a three-way output config yields train/eval/test artifacts."""
    input_base = types.TfxArtifact(type_name='ExternalPath')
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
        ]))
    example_gen = TestFileBasedExampleGenComponent(
        input_base=channel.as_channel([input_base]),
        output_config=output_config)

    self.assertEqual('ExamplesPath', example_gen.outputs.examples.type_name)

    # Artifacts are expected in the same order as the configured splits.
    artifact_collection = example_gen.outputs.examples.get()
    self.assertEqual('train', artifact_collection[0].split)
    self.assertEqual('eval', artifact_collection[1].split)
    self.assertEqual('test', artifact_collection[2].split)
def testDoInputSplit(self):
    """Runs `_testDo` with train/eval input splits and an empty output config."""
    # Create exec properties for input split.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train/*'),
        example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    ])
    self._exec_properties = {
        standard_component_specs.INPUT_CONFIG_KEY:
            proto_utils.proto_to_json(input_config),
        standard_component_specs.OUTPUT_CONFIG_KEY:
            proto_utils.proto_to_json(example_gen_pb2.Output()),
    }

    self._testDo()
def testConstructWithOutputConfig(self):
    """Checks a three-way output config yields train/eval/test artifacts."""
    input_base = standard_artifacts.ExternalArtifact()
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
        ]))
    example_gen = TestFileBasedExampleGenComponent(
        input_base=channel_utils.as_channel([input_base]),
        output_config=output_config)

    self.assertEqual('ExamplesPath', example_gen.outputs['examples'].type_name)

    # Artifacts are expected in the same order as the configured splits.
    artifact_collection = example_gen.outputs['examples'].get()
    self.assertEqual('train', artifact_collection[0].split)
    self.assertEqual('eval', artifact_collection[1].split)
    self.assertEqual('test', artifact_collection[2].split)
def testDoInputSplit(self):
    """Runs `_testDo` with train/eval input splits and an empty output config."""
    # Create exec properties for input split.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train/*'),
        example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    ])
    self._exec_properties = {
        utils.INPUT_CONFIG_KEY: json_format.MessageToJson(
            input_config, preserving_proto_field_name=True),
        utils.OUTPUT_CONFIG_KEY: json_format.MessageToJson(
            example_gen_pb2.Output(), preserving_proto_field_name=True),
    }

    self._testDo()
def test_construct_with_output_config(self):
    """Checks a three-way output config yields train/eval/test artifacts."""
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
        ]))
    big_query_example_gen = component.BigQueryExampleGen(
        query='', output_config=output_config)

    self.assertEqual('ExamplesPath',
                     big_query_example_gen.outputs.examples.type_name)

    # Artifacts are expected in the same order as the configured splits.
    artifact_collection = big_query_example_gen.outputs.examples.get()
    self.assertEqual('train', artifact_collection[0].split)
    self.assertEqual('eval', artifact_collection[1].split)
    self.assertEqual('test', artifact_collection[2].split)
def testDo(self):
    """Runs the Presto executor and checks the train/eval outputs it writes."""
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {'examples': [examples]}

    # Create exec properties: query input, empty custom config, 2:1 output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(
            name='bq', pattern='SELECT i, f, s FROM `fake`'),
    ])
    custom_config = example_gen_pb2.CustomConfig()
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': proto_utils.proto_to_json(input_config),
        'custom_config': proto_utils.proto_to_json(custom_config),
        'output_config': proto_utils.proto_to_json(output_config),
    }

    # Run executor.
    presto_example_gen = executor.Executor()
    presto_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check Presto example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(examples.uri, 'Split-train', record_file)
    eval_output_file = os.path.join(examples.uri, 'Split-eval', record_file)
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    train_size = fileio.open(train_output_file).size()
    eval_size = fileio.open(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def testDo(self):
    """Runs the Parquet executor and checks the train/eval outputs it writes."""
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {standard_component_specs.EXAMPLES_KEY: [examples]}

    # Create exec properties: one Parquet input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='parquet', pattern='parquet/*'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        standard_component_specs.INPUT_BASE_KEY: self._input_data_dir,
        standard_component_specs.INPUT_CONFIG_KEY:
            proto_utils.proto_to_json(input_config),
        standard_component_specs.OUTPUT_CONFIG_KEY:
            proto_utils.proto_to_json(output_config),
    }

    # Run executor.
    parquet_example_gen = parquet_executor.Executor()
    parquet_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check Parquet example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(examples.uri, 'Split-train', record_file)
    eval_output_file = os.path.join(examples.uri, 'Split-eval', record_file)
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    train_size = fileio.open(train_output_file).size()
    eval_size = fileio.open(eval_output_file).size()
    self.assertGreater(train_size, eval_size)
def testConstructWithOutputConfig(self):
    """Checks one Examples artifact carries the train/eval/test split names."""
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1),
        ]))
    big_query_example_gen = component.BigQueryExampleGen(
        query='query', output_config=output_config)

    self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                     big_query_example_gen.outputs['examples'].type_name)

    # A single artifact is expected, with all split names encoded on it.
    artifact_collection = big_query_example_gen.outputs['examples'].get()
    self.assertEqual(1, len(artifact_collection))
    decoded_names = artifact_utils.decode_split_names(
        artifact_collection[0].split_names)
    self.assertEqual(['train', 'eval', 'test'], decoded_names)
def setUp(self):
    """Locates the test data directory and prepares JSON input/output configs."""
    super(ExecutorTest, self).setUp()

    # The test data lives three directory levels above this file.
    package_root = os.path.dirname(
        os.path.dirname(os.path.dirname(__file__)))
    self._input_data_dir = os.path.join(package_root, 'testdata', 'external')

    # Create values in exec_properties: one tfrecord input, 2:1 train/eval.
    input_proto = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='tfrecord', pattern='tfrecord/*'),
    ])
    self._input_config = proto_utils.proto_to_json(input_proto)

    output_proto = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    self._output_config = proto_utils.proto_to_json(output_proto)
def testDo(self):
    """Runs the Avro executor with per-split artifacts and checks outputs."""
    outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                  self.get_temp_dir())
    output_data_dir = os.path.join(outputs_root, self._testMethodName)

    # Create output dict with one artifact per split.
    train_examples = standard_artifacts.Examples(split='train')
    train_examples.uri = os.path.join(output_data_dir, 'train')
    eval_examples = standard_artifacts.Examples(split='eval')
    eval_examples.uri = os.path.join(output_data_dir, 'eval')
    output_dict = {'examples': [train_examples, eval_examples]}

    # Create exec properties: one Avro input split, 2:1 train/eval output.
    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='avro', pattern='avro/*.avro'),
    ])
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
        ]))
    exec_properties = {
        'input_config': json_format.MessageToJson(input_config),
        'output_config': json_format.MessageToJson(output_config),
    }

    # Run executor.
    avro_example_gen = avro_executor.Executor()
    avro_example_gen.Do(self._input_dict, output_dict, exec_properties)

    # Check Avro example gen outputs.
    record_file = 'data_tfrecord-00000-of-00001.gz'
    train_output_file = os.path.join(train_examples.uri, record_file)
    eval_output_file = os.path.join(eval_examples.uri, record_file)
    self.assertTrue(tf.gfile.Exists(train_output_file))
    self.assertTrue(tf.gfile.Exists(eval_output_file))
    train_size = tf.gfile.GFile(train_output_file).size()
    eval_size = tf.gfile.GFile(eval_output_file).size()
    self.assertGreater(train_size, eval_size)