def testDoOutputSplit(self):
    # Create exec properties.
    exec_properties = {
        'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='single', pattern='single/*'),
                ]),
                preserving_proto_field_name=True),
        'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(
                            name='train', hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(
                            name='eval', hash_buckets=1)
                    ])))
    }

    # Run executor.
    example_gen = TestExampleGenExecutor()
    example_gen.Do({}, self._output_dict, exec_properties)

    # Check example gen outputs.
    self.assertTrue(tf.io.gfile.exists(self._train_output_file))
    self.assertTrue(tf.io.gfile.exists(self._eval_output_file))
    # Output split ratio: train:eval=2:1.
    self.assertGreater(
        tf.io.gfile.GFile(self._train_output_file).size(),
        tf.io.gfile.GFile(self._eval_output_file).size())
Example #2
    def testFeatureBasedPartition(self):
        # Add output config to exec properties.
        self._exec_properties[
            utils.OUTPUT_CONFIG_KEY] = json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(
                        splits=[
                            example_gen_pb2.SplitConfig.Split(name='train',
                                                              hash_buckets=2),
                            example_gen_pb2.SplitConfig.Split(name='eval',
                                                              hash_buckets=1)
                        ],
                        partition_feature_name='i')))
        self._exec_properties['has_empty'] = False

        # Run executor.
        example_gen = TestExampleGenExecutor()
        example_gen.Do({}, self._output_dict, self._exec_properties)

        # Check example gen outputs.
        self.assertTrue(tf.io.gfile.exists(self._train_output_file))
        self.assertTrue(tf.io.gfile.exists(self._eval_output_file))

        # Output split ratio: train:eval=2:1.
        self.assertGreater(
            tf.io.gfile.GFile(self._train_output_file).size(),
            tf.io.gfile.GFile(self._eval_output_file).size())
  def setUp(self):
    super(BaseExampleGenExecutorTest, self).setUp()
    self._output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create output dict.
    self._examples = standard_artifacts.Examples()
    self._examples.uri = self._output_data_dir
    self._output_dict = {utils.EXAMPLES_KEY: [self._examples]}

    self._train_output_file = os.path.join(self._examples.uri, 'train',
                                           'data_tfrecord-00000-of-00001.gz')
    self._eval_output_file = os.path.join(self._examples.uri, 'eval',
                                          'data_tfrecord-00000-of-00001.gz')

    # Create exec properties for output splits.
    self._exec_properties = {
        utils.INPUT_CONFIG_KEY:
            proto_utils.proto_to_json(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='single', pattern='single/*'),
                ])),
        utils.OUTPUT_CONFIG_KEY:
            proto_utils.proto_to_json(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(
                            name='train', hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(
                            name='eval', hash_buckets=1)
                    ])))
    }
    def Do(self, input_dict, output_dict, exec_properties):
        """Take input data source and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Depends on
        detailed example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation.
        - output: JSON string of example_gen_pb2.Output instance, providing
          output configuration.

    Returns:
      None

    Raises:
      RuntimeError: if output split config is not specified.
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Get output split information.
        output_config = example_gen_pb2.Output()
        json_format.Parse(exec_properties['output'], output_config)
        self._check_split_config(output_config.split_config)
        splits = output_config.split_config.splits
        # Calculate split buckets.
        buckets = []
        total_buckets = 0
        for split in splits:
            total_buckets += split.hash_buckets
            buckets.append(total_buckets)
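        # buckets now holds cumulative upper boundaries, e.g. hash_buckets of
        # 2 and 1 yield [2, 3]; _partition_fn uses them below to assign each
        # hashed record to a split index.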

        tf.logging.info('Generating examples.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            input_to_example = self.GetInputSourceToExamplePTransform()
            example_splits = (
                pipeline
                | 'InputSourceToExample' >> input_to_example(
                    input_dict, exec_properties)
                # Serializes to a deterministic string, since partitioning is based on it.
                | 'SerializeDeterministically' >>
                beam.Map(lambda x: x.SerializeToString(deterministic=True))
                | 'SplitData' >> beam.Partition(_partition_fn, len(buckets),
                                                buckets))
            # TODO(jyzhao): make shuffle optional.
            # pylint: disable=expression-not-assigned
            for index, example_split in enumerate(example_splits):
                (example_split
                 | 'ShuffleSplit' + splits[index].name >>
                 beam.transforms.Reshuffle()
                 | 'OutputSplit' + splits[index].name >>
                 beam.io.WriteToTFRecord(os.path.join(
                     types.get_split_uri(output_dict['examples'],
                                         splits[index].name),
                     DEFAULT_FILE_NAME),
                                         file_name_suffix='.gz'))
            # pylint: enable=expression-not-assigned

        tf.logging.info('Examples generated.')
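The executor above relies on a _partition_fn that is not included in this excerpt. Below is a minimal sketch of what such a hash-bucket partitioner could look like, compatible with the beam.Partition call above; the signature and hashing scheme here are assumptions, not the actual TFX implementation.

# Minimal sketch (assumption, not the actual TFX _partition_fn): hash the
# deterministically serialized record into the total bucket range, then use
# the cumulative boundaries to pick a split index for beam.Partition.
import bisect
import hashlib


def _partition_fn(record, num_partitions, buckets):
  """Assigns a serialized record to a split index via its hash."""
  assert num_partitions == len(buckets)
  bucket = int(hashlib.sha256(record).hexdigest(), 16) % buckets[-1]
  # buckets is cumulative, e.g. [2, 3] for hash_buckets 2:1, so bisect
  # returns 0 ('train') for hashes in [0, 2) and 1 ('eval') for hash 2.
  return bisect.bisect(buckets, bucket)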
Example #5
def get_default_output_config():
    """Default config contains 'train' and 'eval' splits with size 2:1."""
    return example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))
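For illustration, this default config could be wired directly into a file-based ExampleGen; the CsvExampleGen call below is a hypothetical usage sketch (the import and the input path are assumptions, not part of the original snippet).

# Hypothetical usage sketch: feed the default 2:1 split config into a
# CsvExampleGen component (import and data path are illustrative).
from tfx.components import CsvExampleGen

example_gen = CsvExampleGen(
    input_base='/path/to/csv_data',  # placeholder path
    output_config=get_default_output_config())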
  def testDoInputSplit(self):
    # Create exec properties.
    exec_properties = {
        'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='train', pattern='train/*'),
                    example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
                ]),
                preserving_proto_field_name=True),
        'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(), preserving_proto_field_name=True)
    }

    # Run executor.
    example_gen = TestExampleGenExecutor()
    example_gen.Do({}, self._output_dict, exec_properties)

    # Check example gen outputs.
    self.assertTrue(tf.io.gfile.exists(self._train_output_file))
    self.assertTrue(tf.io.gfile.exists(self._eval_output_file))
    # Input train split is bigger than eval split.
    self.assertGreater(
        tf.io.gfile.GFile(self._train_output_file).size(),
        tf.io.gfile.GFile(self._eval_output_file).size())
Example #7
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    module_file: Text, serving_model_dir: Text,
                    metadata_path: Text,
                    direct_num_workers: int) -> pipeline.Pipeline:
    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))
    examples = tfrecord_input(data_root)
    example_gen = ImportExampleGen(input=examples, output_config=output)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)
    trainer = Trainer(module_file=module_file,
                      examples=transform.outputs['transformed_examples'],
                      transform_graph=transform.outputs['transform_graph'],
                      schema=infer_schema.outputs['schema'],
                      train_args=trainer_pb2.TrainArgs(num_steps=100),
                      eval_args=trainer_pb2.EvalArgs(num_steps=50))

    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])

    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'],
                               eval_config=eval_config)

    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_analyzer.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            infer_schema,
            validate_stats,
            transform,
            trainer,
            model_analyzer,
            model_validator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
Example #8
    def testConstructWithOutputConfig(self):
        output_config = example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=2),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
                example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
            ]))
        example_gen = TestFileBasedExampleGenComponent(
            input_base='path', output_config=output_config)
        self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                         example_gen.outputs['examples'].type_name)

        stored_output_config = example_gen_pb2.Output()
        json_format.Parse(example_gen.exec_properties['output_config'],
                          stored_output_config)
        self.assertEqual(output_config, stored_output_config)
Example #9
  def testConstructWithOutputConfig(self):
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
        ]))
    example_gen = TestFileBasedExampleGenComponent(
        input_base='path', output_config=output_config)
    self.assertEqual(
        standard_artifacts.Examples.TYPE_NAME,
        example_gen.outputs[standard_component_specs.EXAMPLES_KEY].type_name)

    stored_output_config = example_gen_pb2.Output()
    proto_utils.json_to_proto(
        example_gen.exec_properties[standard_component_specs.OUTPUT_CONFIG_KEY],
        stored_output_config)
    self.assertEqual(output_config, stored_output_config)
Example #10
  def testMakeOutputSplitNames(self):
    split_names = utils.generate_output_split_names(
        input_config=example_gen_pb2.Input(splits=[
            example_gen_pb2.Input.Split(name='train', pattern='train/*'),
            example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
        ]),
        output_config=example_gen_pb2.Output())
    self.assertListEqual(['train', 'eval'], split_names)

    split_names = utils.generate_output_split_names(
        input_config=example_gen_pb2.Input(splits=[
            example_gen_pb2.Input.Split(name='single', pattern='single/*')
        ]),
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ])))
    self.assertListEqual(['train', 'eval'], split_names)
Example #11
def make_default_output_config(
    input_config: Union[example_gen_pb2.Input, Dict[Text, Any]]
) -> example_gen_pb2.Output:
    """Returns default output config based on input config."""
    if isinstance(input_config, example_gen_pb2.Input):
        input_config = json_format.MessageToDict(
            input_config, including_default_value_fields=True)

    if len(input_config['splits']) > 1:
        # Returns empty output split config as output splits mirror the input splits.
        return example_gen_pb2.Output()
    else:
        # Returns 'train' and 'eval' splits with size 2:1.
        return example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
            splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=2),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ]))
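A small illustrative check of both branches of make_default_output_config (the split names and patterns below are placeholders):

# Illustrative check of both branches of make_default_output_config.
single_split = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(name='single', pattern='single/*'),
])
multi_split = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(name='train', pattern='train/*'),
    example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
])

# One input split: defaults to a 'train'/'eval' hash split with ratio 2:1.
assert len(make_default_output_config(single_split).split_config.splits) == 2
# Multiple input splits: empty output config, splits pass through unchanged.
assert not make_default_output_config(multi_split).HasField('split_config')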
def data_split(file_path):
    """Splits data before feeding into CsvExampleGen."""
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=6),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=2)
        ]))
    split_example = CsvExampleGen(
        input_base=file_path, output_config=output_config)
    return split_example
Example #13
    def setUp(self):
        super(BaseComponentWithPipelineParamTest, self).setUp()

        test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
        example_gen_output_name = runtime_string_parameter.RuntimeStringParameter(
            name='example-gen-output-name', default='default-to-be-discarded')

        examples = standard_artifacts.ExternalArtifact()
        example_gen = csv_example_gen_component.CsvExampleGen(
            input=channel_utils.as_channel([examples]),
            output_config=example_gen_pb2.Output(
                split_config=example_gen_pb2.SplitConfig(splits=[
                    example_gen_pb2.SplitConfig.Split(
                        name=example_gen_output_name, hash_buckets=10)
                ])))
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples'], instance_name='foo')

        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
        with dsl.Pipeline('test_pipeline'):
            self.example_gen = base_component.BaseComponent(
                component=example_gen,
                component_launcher_class=in_process_component_launcher.
                InProcessComponentLauncher,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_name=self._test_pipeline_name,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                component_config=None)
            self.statistics_gen = base_component.BaseComponent(
                component=statistics_gen,
                component_launcher_class=in_process_component_launcher.
                InProcessComponentLauncher,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_name=self._test_pipeline_name,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                component_config=None,
            )

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Example #14
  def testDo(self, mock_client):
    # Mock query result schema for _BigQueryConverter.
    mock_client.return_value.query.return_value.result.return_value.schema = self._schema

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {'examples': [examples]}

    # Create exec properties.
    exec_properties = {
        'input_config':
            proto_utils.proto_to_json(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, b, f, s FROM `fake`'),
                ])),
        'output_config':
            proto_utils.proto_to_json(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(
                            name='train', hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(
                            name='eval', hash_buckets=1)
                    ])))
    }

    # Run executor.
    big_query_example_gen = executor.Executor(
        base_beam_executor.BaseBeamExecutor.Context(
            beam_pipeline_args=['--project=test-project']))
    big_query_example_gen.Do({}, output_dict, exec_properties)

    mock_client.assert_called_with(project='test-project')

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check BigQuery example gen outputs.
    train_output_file = os.path.join(examples.uri, 'Split-train',
                                     'data_tfrecord-00000-of-00001.gz')
    eval_output_file = os.path.join(examples.uri, 'Split-eval',
                                    'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    self.assertGreater(
        fileio.open(train_output_file).size(),
        fileio.open(eval_output_file).size())
 def _testFeatureBasedPartition(self, partition_feature_name):
   self._exec_properties[utils.OUTPUT_CONFIG_KEY] = proto_utils.proto_to_json(
       example_gen_pb2.Output(
           split_config=example_gen_pb2.SplitConfig(
               splits=[
                   example_gen_pb2.SplitConfig.Split(
                       name='train', hash_buckets=2),
                   example_gen_pb2.SplitConfig.Split(
                       name='eval', hash_buckets=1)
               ],
               partition_feature_name=partition_feature_name)))
Example #16
 def testConstructWithOutputConfig(self):
     big_query_example_gen = component.BigQueryExampleGen(
         query='query',
         output_config=example_gen_pb2.Output(
             split_config=example_gen_pb2.SplitConfig(splits=[
                 example_gen_pb2.SplitConfig.Split(name='train',
                                                   hash_buckets=2),
                 example_gen_pb2.SplitConfig.Split(name='eval',
                                                   hash_buckets=1),
             ])))
     self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                      big_query_example_gen.outputs['examples'].type_name)
Example #17
  def testDo(self, mock_client):
    # Mock query result schema for _BigQueryConverter.
    mock_client.return_value.query.return_value.result.return_value.schema = self._schema

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {'examples': [examples]}

    # Create exec properties.
    exec_properties = {
        'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, b, f, s FROM `fake`'),
                ]),
                preserving_proto_field_name=True),
        'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(
                            name='train', hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(
                            name='eval', hash_buckets=1)
                    ])),
                preserving_proto_field_name=True)
    }

    # Run executor.
    big_query_example_gen = executor.Executor()
    big_query_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check BigQuery example gen outputs.
    train_output_file = os.path.join(examples.uri, 'train',
                                     'data_tfrecord-00000-of-00001.gz')
    eval_output_file = os.path.join(examples.uri, 'eval',
                                    'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(tf.io.gfile.exists(train_output_file))
    self.assertTrue(tf.io.gfile.exists(eval_output_file))
    self.assertGreater(
        tf.io.gfile.GFile(train_output_file).size(),
        tf.io.gfile.GFile(eval_output_file).size())
Example #18
  def testDo(self):
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create output dict.
    examples = standard_artifacts.Examples()
    examples.uri = output_data_dir
    output_dict = {utils.EXAMPLES_KEY: [examples]}

    # Create exec properties.
    exec_properties = {
        utils.INPUT_BASE_KEY:
            self._input_data_dir,
        utils.INPUT_CONFIG_KEY:
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='avro', pattern='avro/*.avro'),
                ]),
                preserving_proto_field_name=True),
        utils.OUTPUT_CONFIG_KEY:
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(
                            name='train', hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(
                            name='eval', hash_buckets=1)
                    ])),
                preserving_proto_field_name=True)
    }

    # Run executor.
    avro_example_gen = avro_executor.Executor()
    avro_example_gen.Do({}, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        examples.split_names)

    # Check Avro example gen outputs.
    train_output_file = os.path.join(examples.uri, 'train',
                                     'data_tfrecord-00000-of-00001.gz')
    eval_output_file = os.path.join(examples.uri, 'eval',
                                    'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(fileio.exists(train_output_file))
    self.assertTrue(fileio.exists(eval_output_file))
    self.assertGreater(
        fileio.open(train_output_file).size(),
        fileio.open(eval_output_file).size())
Example #19
    def testDo(self, mock_client):
        # Mock query result schema for _BigQueryConverter.
        mock_client.return_value.query.return_value.result.return_value.schema = self._schema

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        train_examples = types.TfxArtifact(type_name='ExamplesPath',
                                           split='train')
        train_examples.uri = os.path.join(output_data_dir, 'train')
        eval_examples = types.TfxArtifact(type_name='ExamplesPath',
                                          split='eval')
        eval_examples.uri = os.path.join(output_data_dir, 'eval')
        output_dict = {'examples': [train_examples, eval_examples]}

        # Create exec properties.
        exec_properties = {
            'input':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, f, s FROM `fake`'),
                ])),
            'output':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ])))
        }

        # Run executor.
        big_query_example_gen = executor.Executor()
        big_query_example_gen.Do({}, output_dict, exec_properties)

        # Check BigQuery example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())
Example #20
    def testDo(self):
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        train_examples = types.TfxArtifact(type_name='ExamplesPath',
                                           split='train')
        train_examples.uri = os.path.join(output_data_dir, 'train')
        eval_examples = types.TfxArtifact(type_name='ExamplesPath',
                                          split='eval')
        eval_examples.uri = os.path.join(output_data_dir, 'eval')
        output_dict = {'examples': [train_examples, eval_examples]}

        # Create exec properties.
        exec_properties = {
            'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, f, s FROM `fake`'),
                ])),
            'custom_config':
            json_format.MessageToJson(example_gen_pb2.CustomConfig()),
            'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ]))),
        }

        # Run executor.
        presto_example_gen = executor.Executor()
        presto_example_gen.Do({}, output_dict, exec_properties)

        # Check Presto example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())
Example #21
 def test_construct_with_output_config(self):
   input_base = types.TfxArtifact(type_name='ExternalPath')
   example_gen = TestFileBasedExampleGenComponent(
       input_base=channel.as_channel([input_base]),
       output_config=example_gen_pb2.Output(
           split_config=example_gen_pb2.SplitConfig(splits=[
               example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
               example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
               example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
           ])))
   self.assertEqual('ExamplesPath', example_gen.outputs.examples.type_name)
   artifact_collection = example_gen.outputs.examples.get()
   self.assertEqual('train', artifact_collection[0].split)
   self.assertEqual('eval', artifact_collection[1].split)
   self.assertEqual('test', artifact_collection[2].split)
    def testDoInputSplit(self):
        # Create exec properties for input split.
        self._exec_properties = {
            standard_component_specs.INPUT_CONFIG_KEY:
            proto_utils.proto_to_json(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(name='train',
                                                pattern='train/*'),
                    example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
                ])),
            standard_component_specs.OUTPUT_CONFIG_KEY:
            proto_utils.proto_to_json(example_gen_pb2.Output())
        }

        self._testDo()
Example #23
 def testConstructWithOutputConfig(self):
   input_base = standard_artifacts.ExternalArtifact()
   example_gen = TestFileBasedExampleGenComponent(
       input_base=channel_utils.as_channel([input_base]),
       output_config=example_gen_pb2.Output(
           split_config=example_gen_pb2.SplitConfig(splits=[
               example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
               example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
               example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
           ])))
   self.assertEqual('ExamplesPath', example_gen.outputs['examples'].type_name)
   artifact_collection = example_gen.outputs['examples'].get()
   self.assertEqual('train', artifact_collection[0].split)
   self.assertEqual('eval', artifact_collection[1].split)
   self.assertEqual('test', artifact_collection[2].split)
Example #24
    def testDoInputSplit(self):
        # Create exec properties for input split.
        self._exec_properties = {
            utils.INPUT_CONFIG_KEY:
            json_format.MessageToJson(example_gen_pb2.Input(splits=[
                example_gen_pb2.Input.Split(name='train', pattern='train/*'),
                example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
            ]),
                                      preserving_proto_field_name=True),
            utils.OUTPUT_CONFIG_KEY:
            json_format.MessageToJson(example_gen_pb2.Output(),
                                      preserving_proto_field_name=True)
        }

        self._testDo()
Example #25
 def test_construct_with_output_config(self):
   big_query_example_gen = component.BigQueryExampleGen(
       query='',
       output_config=example_gen_pb2.Output(
           split_config=example_gen_pb2.SplitConfig(splits=[
               example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
               example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
               example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
           ])))
   self.assertEqual('ExamplesPath',
                    big_query_example_gen.outputs.examples.type_name)
   artifact_collection = big_query_example_gen.outputs.examples.get()
   self.assertEqual('train', artifact_collection[0].split)
   self.assertEqual('eval', artifact_collection[1].split)
   self.assertEqual('test', artifact_collection[2].split)
Example #26
    def testDo(self):
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        examples = standard_artifacts.Examples()
        examples.uri = output_data_dir
        output_dict = {'examples': [examples]}

        # Create exec properties.
        exec_properties = {
            'input_config':
            proto_utils.proto_to_json(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, f, s FROM `fake`'),
                ])),
            'custom_config':
            proto_utils.proto_to_json(example_gen_pb2.CustomConfig()),
            'output_config':
            proto_utils.proto_to_json(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ]))),
        }

        # Run executor.
        presto_example_gen = executor.Executor()
        presto_example_gen.Do({}, output_dict, exec_properties)

        self.assertEqual(artifact_utils.encode_split_names(['train', 'eval']),
                         examples.split_names)

        # Check Presto example gen outputs.
        train_output_file = os.path.join(examples.uri, 'Split-train',
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(examples.uri, 'Split-eval',
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(fileio.exists(train_output_file))
        self.assertTrue(fileio.exists(eval_output_file))
        self.assertGreater(
            fileio.open(train_output_file).size(),
            fileio.open(eval_output_file).size())
Example #27
    def testDo(self):
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        examples = standard_artifacts.Examples()
        examples.uri = output_data_dir
        output_dict = {standard_component_specs.EXAMPLES_KEY: [examples]}

        # Create exec properties.
        exec_properties = {
            standard_component_specs.INPUT_BASE_KEY:
            self._input_data_dir,
            standard_component_specs.INPUT_CONFIG_KEY:
            proto_utils.proto_to_json(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(name='parquet',
                                                pattern='parquet/*'),
                ])),
            standard_component_specs.OUTPUT_CONFIG_KEY:
            proto_utils.proto_to_json(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ])))
        }

        # Run executor.
        parquet_example_gen = parquet_executor.Executor()
        parquet_example_gen.Do({}, output_dict, exec_properties)

        self.assertEqual(artifact_utils.encode_split_names(['train', 'eval']),
                         examples.split_names)

        # Check Parquet example gen outputs.
        train_output_file = os.path.join(examples.uri, 'Split-train',
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(examples.uri, 'Split-eval',
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(fileio.exists(train_output_file))
        self.assertTrue(fileio.exists(eval_output_file))
        self.assertGreater(
            fileio.open(train_output_file).size(),
            fileio.open(eval_output_file).size())
Example #28
 def testConstructWithOutputConfig(self):
   big_query_example_gen = component.BigQueryExampleGen(
       query='query',
       output_config=example_gen_pb2.Output(
           split_config=example_gen_pb2.SplitConfig(splits=[
               example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
               example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
               example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)
           ])))
   self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                    big_query_example_gen.outputs['examples'].type_name)
   artifact_collection = big_query_example_gen.outputs['examples'].get()
   self.assertEqual(1, len(artifact_collection))
   self.assertEqual(['train', 'eval', 'test'],
                    artifact_utils.decode_split_names(
                        artifact_collection[0].split_names))
Example #29
  def setUp(self):
    super(ExecutorTest, self).setUp()
    self._input_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'testdata',
        'external')

    # Create values in exec_properties
    self._input_config = proto_utils.proto_to_json(
        example_gen_pb2.Input(splits=[
            example_gen_pb2.Input.Split(name='tfrecord', pattern='tfrecord/*'),
        ]))
    self._output_config = proto_utils.proto_to_json(
        example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ])))
Example #30
    def testDo(self):
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        train_examples = standard_artifacts.Examples(split='train')
        train_examples.uri = os.path.join(output_data_dir, 'train')
        eval_examples = standard_artifacts.Examples(split='eval')
        eval_examples.uri = os.path.join(output_data_dir, 'eval')
        output_dict = {'examples': [train_examples, eval_examples]}

        # Create exec properties.
        exec_properties = {
            'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(name='avro',
                                                pattern='avro/*.avro'),
                ])),
            'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ])))
        }

        # Run executor.
        avro_example_gen = avro_executor.Executor()
        avro_example_gen.Do(self._input_dict, output_dict, exec_properties)

        # Check Avro example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())