Example #1
    def __init__(self,
                 dataset_dir: types.Channel,
                 checkpoint_dir: types.Channel,
                 encoding_dir: types.Channel,
                 model_name: Text,
                 train_config: Dict,
                 encoding: Text = 'utf-8',
                 end_token: Text = ""):
        trained_checkpoint_dir = external_input("TrainGPT2")
        sample_dir = external_input("TrainGPT2")
        tensorboard_dir = external_input("TrainGPT2")
        hyperparameter_dir = external_input("TrainGPT2")
        metric_dir = external_input("TrainGPT2")

        spec = TrainGPT2Spec(dataset_dir=dataset_dir,
                             checkpoint_dir=checkpoint_dir,
                             encoding_dir=encoding_dir,
                             model_name=model_name,
                             train_config=train_config,
                             encoding=encoding,
                             trained_checkpoint_dir=trained_checkpoint_dir,
                             sample_dir=sample_dir,
                             hyperparameter_dir=hyperparameter_dir,
                             metric_dir=metric_dir,
                             tensorboard_dir=tensorboard_dir,
                             end_token=end_token)

        super(TrainGPT2, self).__init__(spec=spec)
Example #2
    def __init__(self, url):
        if isinstance(url, str):
            url = [url]
        rss_feed = external_input("rss_feed")
        spec = NewsCrawlerSpec(url=url, rss_feed=rss_feed)

        super(NewsCrawler, self).__init__(spec=spec)
Example #3
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the Iris flowers pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, trainer,
          model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/142684737): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
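
As a point of reference, Example #5 below runs a TFX pipeline with the Beam-based runner; a function like the `_create_pipeline` above would typically be invoked the same way. A minimal sketch, assuming the `BeamDagRunner` import path from that era of TFX; every argument value here is a placeholder, not taken from the original project:

from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

# Placeholder values; point these at real locations for an actual run.
BeamDagRunner().run(
    _create_pipeline(
        pipeline_name='iris',
        pipeline_root='/tmp/tfx/pipelines/iris',
        data_root='/tmp/iris/data',
        module_file='/tmp/iris/iris_utils.py',
        serving_model_dir='/tmp/iris/serving_model',
        metadata_path='/tmp/tfx/metadata/iris/metadata.db',
        direct_num_workers=1))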
Example #4
    def __init__(self,
                 checkpoint_dir: Text):
        model_dir = external_input("CopyCheckpoint")
        spec = CopyCheckpointSpec(checkpoint_dir=checkpoint_dir,
                                  model_dir=model_dir)

        super(CopyCheckpoint, self).__init__(spec=spec)
Example #5
    def generate_models(self, args):
        # Modified version of Chicago Taxi Example pipeline
        # tfx/examples/chicago_taxi_pipeline/taxi_pipeline_beam.py

        root = tempfile.mkdtemp()
        pipeline_root = os.path.join(root, "pipeline")
        metadata_path = os.path.join(root, "metadata/metadata.db")
        module_file = os.path.join(
            os.path.dirname(__file__),
            "../../../examples/chicago_taxi_pipeline/taxi_utils.py")

        examples = external_input(os.path.dirname(self.dataset_path()))
        example_gen = components.ImportExampleGen(input=examples)
        statistics_gen = components.StatisticsGen(
            examples=example_gen.outputs["examples"])
        schema_gen = components.SchemaGen(
            statistics=statistics_gen.outputs["statistics"],
            infer_feature_shape=False)
        transform = components.Transform(
            examples=example_gen.outputs["examples"],
            schema=schema_gen.outputs["schema"],
            module_file=module_file)
        trainer = components.Trainer(
            module_file=module_file,
            transformed_examples=transform.outputs["transformed_examples"],
            schema=schema_gen.outputs["schema"],
            transform_graph=transform.outputs["transform_graph"],
            train_args=trainer_pb2.TrainArgs(num_steps=100),
            eval_args=trainer_pb2.EvalArgs(num_steps=50))
        p = pipeline.Pipeline(pipeline_name="chicago_taxi_beam",
                              pipeline_root=pipeline_root,
                              components=[
                                  example_gen, statistics_gen, schema_gen,
                                  transform, trainer
                              ],
                              enable_cache=True,
                              metadata_connection_config=metadata.
                              sqlite_metadata_connection_config(metadata_path))
        BeamDagRunner().run(p)

        def join_unique_subdir(path):
            dirs = os.listdir(path)
            if len(dirs) != 1:
                raise ValueError(
                    "expecting there to be only one subdirectory in %s, but "
                    "subdirectories were: %s" % (path, dirs))
            return os.path.join(path, dirs[0])

        trainer_output_dir = join_unique_subdir(
            os.path.join(pipeline_root, "Trainer/output"))
        eval_model_dir = join_unique_subdir(
            os.path.join(trainer_output_dir, "eval_model_dir"))
        serving_model_dir = join_unique_subdir(
            os.path.join(trainer_output_dir,
                         "serving_model_dir/export/chicago-taxi"))

        shutil.rmtree(self.trained_saved_model_path(), ignore_errors=True)
        shutil.rmtree(self.tfma_saved_model_path(), ignore_errors=True)
        shutil.copytree(serving_model_dir, self.trained_saved_model_path())
        shutil.copytree(eval_model_dir, self.tfma_saved_model_path())
Example #6
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs['output'])

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs['output'],
                                      schema=infer_schema.outputs['output'])

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen, infer_schema, validate_stats],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
Example #7
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the imdb sentiment analysis pipline with TFX."""
    examples = external_input(data_root)
    # Brings data in to the pipline
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            transform,
        ],
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
Example #8
 def testTaxiPipelineNewStyleCompatibility(self):
     examples = external_input('/tmp/fake/path')
     example_gen = CsvExampleGen(input=examples)
     self.assertIs(example_gen.inputs['input'],
                   example_gen.inputs['input_base'])
     statistics_gen = StatisticsGen(
         examples=example_gen.outputs['examples'])
     self.assertIs(statistics_gen.inputs['examples'],
                   statistics_gen.inputs['input_data'])
     infer_schema = SchemaGen(
         statistics=statistics_gen.outputs['statistics'])
     self.assertIs(infer_schema.inputs['statistics'],
                   infer_schema.inputs['stats'])
     self.assertIs(infer_schema.outputs['schema'],
                   infer_schema.outputs['output'])
     validate_examples = ExampleValidator(
         statistics=statistics_gen.outputs['statistics'],
         schema=infer_schema.outputs['schema'])
     self.assertIs(validate_examples.inputs['statistics'],
                   validate_examples.inputs['stats'])
     self.assertIs(validate_examples.outputs['anomalies'],
                   validate_examples.outputs['output'])
     transform = Transform(examples=example_gen.outputs['examples'],
                           schema=infer_schema.outputs['schema'],
                           module_file='/tmp/fake/module/file')
     self.assertIs(transform.inputs['examples'],
                   transform.inputs['input_data'])
     self.assertIs(transform.outputs['transform_graph'],
                   transform.outputs['transform_output'])
     trainer = Trainer(
         module_file='/tmp/fake/module/file',
         transformed_examples=transform.outputs['transformed_examples'],
         schema=infer_schema.outputs['schema'],
         transform_graph=transform.outputs['transform_graph'],
         train_args=trainer_pb2.TrainArgs(num_steps=10000),
         eval_args=trainer_pb2.EvalArgs(num_steps=5000))
     self.assertIs(trainer.inputs['transform_graph'],
                   trainer.inputs['transform_output'])
     self.assertIs(trainer.outputs['model'], trainer.outputs['output'])
     evaluator = Evaluator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'],
         feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
             evaluator_pb2.SingleSlicingSpec(
                 column_for_slicing=['trip_start_hour'])
         ]))
     self.assertIs(evaluator.inputs['model'],
                   evaluator.inputs['model_exports'])
     model_validator = ModelValidator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'])
     pusher = Pusher(model=trainer.outputs['output'],
                     model_blessing=model_validator.outputs['blessing'],
                     push_destination=pusher_pb2.PushDestination(
                         filesystem=pusher_pb2.PushDestination.Filesystem(
                             base_directory='/fake/serving/dir')))
     self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
     self.assertIs(pusher.outputs['pushed_model'],
                   pusher.outputs['model_push'])
Example #9
    def testRun(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        example_gen = FileBasedExampleGen(
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                avro_executor.Executor),
            input=external_input(self.avro_dir_path),
            input_config=self.input_config,
            output_config=self.output_config,
            instance_name='AvroExampleGen')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        pipeline_root = os.path.join(output_data_dir, 'Test')
        tf.io.gfile.makedirs(pipeline_root)
        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()
        metadata_connection = metadata.Metadata(connection_config)

        launcher = in_process_component_launcher.InProcessComponentLauncher.create(
            component=example_gen,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection=metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type, '.'.join(
                [FileBasedExampleGen.__module__,
                 FileBasedExampleGen.__name__]))

        launcher.launch()
        mock_publisher.return_value.publish_execution.assert_called_once()

        # Get output paths.
        component_id = example_gen.id
        output_path = os.path.join(pipeline_root, component_id, 'examples/1')
        examples = standard_artifacts.Examples()
        examples.uri = output_path
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        # Check Avro example gen outputs.
        train_output_file = os.path.join(examples.uri, 'train',
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(examples.uri, 'eval',
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.io.gfile.exists(train_output_file))
        self.assertTrue(tf.io.gfile.exists(eval_output_file))
        self.assertGreater(
            tf.io.gfile.GFile(train_output_file).size(),
            tf.io.gfile.GFile(eval_output_file).size())
Example #10
    def testIsComponent(self):
        resolver = ResolverNode(instance_name="test_resolver_name",
                                resolver_class=latest_blessed_model_resolver.
                                LatestBlessedModelResolver)
        self.assertFalse(compiler_utils.is_component(resolver))

        example_gen = CsvExampleGen(input=external_input("data_path"))
        self.assertTrue(compiler_utils.is_component(example_gen))
Example #11
    def testIsImporter(self):
        importer = ImporterNode(instance_name="import_schema",
                                source_uri="uri/to/schema",
                                artifact_type=standard_artifacts.Schema)
        self.assertTrue(compiler_utils.is_importer(importer))

        example_gen = CsvExampleGen(input=external_input("data_path"))
        self.assertFalse(compiler_utils.is_importer(example_gen))
Example #12
    def test_run(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        example_gen = FileBasedExampleGen(
            executor_class=avro_executor.Executor,
            input_base=external_input(self.avro_dir_path),
            input_config=self.input_config,
            output_config=self.output_config,
            name='AvroExampleGenComponent')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        pipeline_root = os.path.join(output_data_dir, 'Test')
        tf.gfile.MakeDirs(pipeline_root)
        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        launcher = component_launcher.ComponentLauncher(
            component=example_gen,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type, '.'.join(
                [FileBasedExampleGen.__module__,
                 FileBasedExampleGen.__name__]))

        launcher.launch()
        mock_publisher.return_value.publish_execution.assert_called_once()

        # Get output paths.
        component_id = '.'.join([example_gen.component_name, example_gen.name])
        output_path = os.path.join(pipeline_root, component_id, 'examples/1')
        train_examples = types.TfxArtifact(type_name='ExamplesPath',
                                           split='train')
        train_examples.uri = os.path.join(output_path, 'train')
        eval_examples = types.TfxArtifact(type_name='ExamplesPath',
                                          split='eval')
        eval_examples.uri = os.path.join(output_path, 'eval')

        # Check Avro example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())
Example #13
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

    Args:
      pipeline_root: The root of the pipeline output.
      csv_input_location: The location of the input data directory.
      transform_module: The location of the transform module file.
      trainer_module: The location of the trainer module file.

    Returns:
      A list of TFX components that constitutes an end-to-end test pipeline.
    """
    examples = dsl_utils.external_input(csv_input_location)

    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(input_data=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=transform_module)
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module)
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
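
The component list returned by `create_e2e_components` still has to be wrapped in a `pipeline.Pipeline` and handed to a runner, as the surrounding examples do. A minimal sketch, assuming the Beam runner and the sqlite metadata config used elsewhere in these examples; the pipeline name and all paths are placeholders:

import os

from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

# Placeholder locations for a local test run.
pipeline_root = '/tmp/tfx/chicago_taxi_e2e'
components = create_e2e_components(
    pipeline_root=pipeline_root,
    csv_input_location='/tmp/taxi/data',
    transform_module='/tmp/taxi/taxi_utils.py',
    trainer_module='/tmp/taxi/taxi_utils.py')

BeamDagRunner().run(
    pipeline.Pipeline(
        pipeline_name='chicago_taxi_e2e_test',
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            os.path.join(pipeline_root, 'metadata.db'))))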
Example #14
    def __init__(self, checkpoint_dir: types.Channel,
                 model_path: types.Channel, train_config: Dict):
        export_dir = external_input("ExportToTFServing")

        spec = ExportToTFServingSpec(model_path=model_path,
                                     checkpoint_dir=checkpoint_dir,
                                     export_dir=export_dir,
                                     train_config=train_config)

        super(ExportToTFServing, self).__init__(spec=spec)
Example #15
    def __init__(self,
                 url: Text,
                 rss_feed: types.Channel = None):
        if not rss_feed:
            rss_feed = external_input("rss_feed")

        spec = NewsCrawlerSpec(url=url,
                              rss_feed=rss_feed)

        super(NewsCrawler, self).__init__(spec=spec)
Example #16
    def __init__(self,
                 end_token: Text,
                 text_dir: Optional[Text] = None,
                 encoding: Text = 'utf-8'):
        merged_text_dir = external_input("CreateDataset")

        spec = CreateMergedTextSpec(text_dir=text_dir,
                                    end_token=end_token,
                                    encoding=encoding,
                                    merged_text_dir=merged_text_dir)

        super(CreateMergedText, self).__init__(spec=spec)
Example #17
 def __init__(self,
              merged_text_dir: types.Channel,
              encoding_dir: types.Channel,
              end_token: Text,
              encoding: Text = 'utf-8'):
     dataset_dir = external_input("CreateDataset")
     spec = CreateEncodedDatasetSpec(merged_text_dir=merged_text_dir,
                                     encoding_dir=encoding_dir,
                                     encoding=encoding,
                                     end_token=end_token,
                                     dataset_dir=dataset_dir)
     super(CreateEncodedDataset, self).__init__(spec=spec)
Example #18
 def __init__(self,
              text_path: Text,
              model_path: types.Channel,
              encoding: Text = 'utf-8',
              combine: int = 50000):
     dataset_path = external_input("CreateDataset")
     spec = CreateDatasetSpec(text_path=text_path,
                              model_path=model_path,
                              encoding=encoding,
                              combine=combine,
                              dataset_path=dataset_path)
     super(CreateDataset, self).__init__(spec=spec)
Example #19
def create_pipeline():
    download_data = download_data_component()
    example = CsvExampleGen(external_input(download_data.outputs['data_uri']))
    xgb = xgb_component(
        data=example.outputs['examples']
    )

    component_instances = [
        download_data,
        # example,
        # xgb
    ]
    return component_instances
Example #20
    def testIsResolver(self):
        resv = resolver.Resolver(instance_name="test_resolver_name",
                                 strategy_class=latest_blessed_model_resolver.
                                 LatestBlessedModelResolver)
        self.assertTrue(compiler_utils.is_resolver(resv))
        resv = legacy_resolver_node.ResolverNode(
            instance_name="test_resolver_name",
            resolver_class=latest_blessed_model_resolver.
            LatestBlessedModelResolver)
        self.assertTrue(compiler_utils.is_resolver(resv))

        example_gen = CsvExampleGen(input=external_input("data_path"))
        self.assertFalse(compiler_utils.is_resolver(example_gen))
Example #21
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text) -> pipeline.Pipeline:
    examples = external_input(data_root)
    input_split = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
    ])
    example_gen = ImportExampleGen(input_base=examples, input_config=input_split)
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
    validate_stats = ExampleValidator(
        stats=statistics_gen.outputs.output,
        schema=infer_schema.outputs.output)
    transform = Transform(
        input_data=example_gen.outputs.examples,
        schema=infer_schema.outputs.output,
        module_file=module_file)
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec()
        ]))
    model_validator = ModelValidator(
        examples=example_gen.outputs.examples, model=trainer.outputs.output)
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats, transform,
            trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'tfx_image': 'tensorflow/tfx:0.14.0rc1'
        },
        log_root='/var/tmp/tfx/logs',
    )
Example #22
 def __init__(self,
              merged_text_dir: types.Channel,
              model_dir: types.Channel,
              end_token: Text,
              encoding: Text = 'utf-8',
              text_token_size: int = 50000):
     encoding_dir = external_input("CreateDataset")
     spec = CreateEncodingSpec(model_dir=model_dir,
                               encoding=encoding,
                               end_token=end_token,
                               merged_text_dir=merged_text_dir,
                               text_token_size=text_token_size,
                               encoding_dir=encoding_dir)
     super(CreateEncoding, self).__init__(spec=spec)
Example #23
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    examples = external_input(data_root)
    input_split = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
    ])
    example_gen = ImportExampleGen(input=examples, input_config=input_split)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)

    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])
    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'],
                               eval_config=eval_config)

    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
Example #24
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen],
        enable_cache=True,
        additional_pipeline_args={},
    )
Example #25
    def testHasTaskDependency(self):
        example_gen = CsvExampleGen(input=external_input("data_path"))
        statistics_gen = StatisticsGen(
            examples=example_gen.outputs["examples"])
        p1 = pipeline.Pipeline(pipeline_name="fake_name",
                               pipeline_root="fake_root",
                               components=[example_gen, statistics_gen])
        self.assertFalse(compiler_utils.has_task_dependency(p1))

        a = EmptyComponent(name="a").with_id("a")
        statistics_gen.add_downstream_node(a)
        p2 = pipeline.Pipeline(pipeline_name="fake_name",
                               pipeline_root="fake_root",
                               components=[example_gen, statistics_gen, a])
        self.assertTrue(compiler_utils.has_task_dependency(p2))
Example #26
 def __init__(self,
              colnames: Text,
              ip: Text = "mongo",
              port: Text = "27017",
              username: Text = os.environ['MONGO_ROOT_USER'],
              password: Text = os.environ['MONGO_ROOT_PASSWORD'],
              dbname: Text = os.environ['MONGO_DATABASE_NAME']):
     output_dir = external_input("MongoExport")
     spec = MongoExportSpec(ip=ip,
                            port=port,
                            username=username,
                            password=password,
                            dbname=dbname,
                            colname=colnames,
                            output_dir=output_dir)
     super(MongoExport, self).__init__(spec=spec)
Example #27
  def _make_example_gen(self) -> base_component.BaseComponent:
    """Returns a TFX ExampleGen which produces the desired split."""

    splits = []
    for name, value in self._dataset_builder.info.splits.items():
      # Assume there is only one file per split.
      # Filename will be like `'fashion_mnist-test.tfrecord-00000-of-00001'`.
      assert len(value.filenames) == 1
      pattern = value.filenames[0]
      splits.append(example_gen_pb2.Input.Split(name=name, pattern=pattern))

    logging.info('Splits: %s', splits)
    input_config = example_gen_pb2.Input(splits=splits)
    return tfx.ImportExampleGen(
        input=external_input(self._dataset_builder.data_dir),
        input_config=input_config)
Example #28
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))
Example #29
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    beam_pipeline_args: Text) -> pipeline.Pipeline:
    """Custom component demo pipeline."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    hello = component.HelloComponent(
        input_data=example_gen.outputs['examples'], name=u'HelloWorld')

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=hello.outputs['output_data'])

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=[example_gen, hello, statistics_gen],
                             enable_cache=True,
                             beam_pipeline_args=beam_pipeline_args)
Example #30
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the Iris flowers pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Hyperparameter tuning based on the tuner_fn in module_file.
    tuner = Tuner(examples=example_gen.outputs['examples'],
                  schema=infer_schema.outputs['schema'],
                  module_file=module_file)

    # TODO(jyzhao): support trainer and following components.

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            infer_schema,
            validate_stats,
            tuner,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )