Example #1
0
class MetaFeatureGen(base_component.BaseComponent):
    """Custom component that generates meta-features for a dataset."""

    SPEC_CLASS = MetaFeatureGenSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(
        executor.MetaFeatureGenExecutor)

    def __init__(self,
                 statistics: types.Channel = None,
                 transformed_examples: Optional[types.Channel] = None,
                 custom_config: Optional[Dict[str, Any]] = None,
                 instance_name: Optional[str] = None):
        """Build a MetaFeatureGen component.

        Args:
          statistics: Output channel from StatisticsGen.
          transformed_examples: Optional channel from the TFX Transform
            component.
          custom_config: Optional dict which contains additional parameters.
          instance_name: Optional unique instance name. Necessary if multiple
            MetaFeatureGen components are declared in the same pipeline.
        """
        # The output channel is always created here; the constructor offers
        # no parameter for supplying one.
        output_channel = types.Channel(
            type=artifacts.MetaFeatures,
            artifacts=[artifacts.MetaFeatures()],
        )
        component_spec = MetaFeatureGenSpec(
            metafeatures=output_channel,
            transformed_examples=transformed_examples,
            statistics=statistics,
            custom_config=custom_config,
        )
        super().__init__(spec=component_spec, instance_name=instance_name)
Example #2
0
    def __init__(self,
                 model: Optional[types.Channel] = None,
                 model_blessing: Optional[types.Channel] = None,
                 infra_blessing: Optional[types.Channel] = None,
                 custom_config: Optional[Dict[str, Any]] = None,
                 pushed_model: Optional[types.Channel] = None):
        """Build a Pusher component wired to this module's executor.

        Args:
          model: An optional Channel of type `standard_artifacts.Model`,
            usually produced by a Trainer component.
          model_blessing: An optional Channel of type
            `standard_artifacts.ModelBlessing`, usually produced from an
            Evaluator component.
          infra_blessing: An optional Channel of type
            `standard_artifacts.InfraBlessing`, usually produced from an
            InfraValidator component.
          custom_config: A dict which contains the deployment job parameters
            to be passed to cloud-based training platforms. The [Kubeflow
            example](
            https://github.com/tensorflow/tfx/blob/6ff57e36a7b65818d4598d41e584a42584d361e6/tfx/examples/chicago_taxi_pipeline/taxi_pipeline_kubeflow_gcp.py#L278-L285)
            contains an example how this can be used by custom executors.
          pushed_model: Optional output `standard_artifacts.PushedModel`
            channel with result of push.
        """
        # The parent constructor always receives this module's executor and
        # an explicit `push_destination=None`.
        module_executor_spec = executor_spec.ExecutorClassSpec(
            executor.Executor)
        super(Pusher, self).__init__(
            model=model,
            model_blessing=model_blessing,
            infra_blessing=infra_blessing,
            push_destination=None,
            custom_config=custom_config,
            custom_executor_spec=module_executor_spec,
            pushed_model=pushed_model)
Example #3
0
class PartitionGen(base_component.BaseComponent):
    """The PartitionGen component.

    Splits data into independent partitions that can be manipulated
    independently of one another.
    """
    SPEC_CLASS = PartitionSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 statistics: types.Channel,
                 schema: types.Channel,
                 examples: types.Channel,
                 partitions: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build a PartitionGen component.

        Args:
          statistics: Channel forwarded to the spec as `statistics`.
          schema: Channel forwarded to the spec as `schema`.
          examples: Channel forwarded to the spec as `examples`.
          partitions: Optional output channel. When omitted (or falsy), a new
            channel holding a single `Partitions` artifact is created.
          instance_name: Optional instance name forwarded to the base
            component.
        """
        output_partitions = partitions or types.Channel(
            type=Partitions, artifacts=[Partitions()])

        partition_spec = PartitionSpec(
            statistics=statistics,
            schema=schema,
            examples=examples,
            partitions=output_partitions,
        )
        super().__init__(spec=partition_spec, instance_name=instance_name)
Example #4
0
class SchemaGen(base_component.BaseComponent):
  """A TFX SchemaGen component to generate a schema from the training data.

  The SchemaGen component uses [TensorFlow Data
  Validation](https://www.tensorflow.org/tfx/data_validation) to
  generate a schema from input statistics.  The following TFX libraries use the
  schema:
    - TensorFlow Data Validation
    - TensorFlow Transform
    - TensorFlow Model Analysis

  In a typical TFX pipeline, the SchemaGen component generates a schema which
  is consumed by the other pipeline components.

  Please see https://www.tensorflow.org/tfx/data_validation for more details.

  ## Example
  ```
    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])
  ```
  """
  # TODO(b/123941608): Update pydoc about how to use a user provided schema

  SPEC_CLASS = SchemaGenSpec
  EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

  def __init__(self,
               statistics: Optional[types.Channel] = None,
               infer_feature_shape: Optional[bool] = False,
               output: Optional[types.Channel] = None,
               stats: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Constructs a SchemaGen component.

    Either `statistics` or `stats` must be present in the input arguments.

    Args:
      statistics: A Channel of `ExampleStatistics` type (required if spec is not
        passed). This should contain at least a `train` split. Other splits are
        currently ignored. _required_
      infer_feature_shape: Boolean value indicating whether or not to infer the
        shape of features. If the feature shape is not inferred, downstream
        Tensorflow Transform component using the schema will parse input
        as tf.SparseTensor.
      output: Output `Schema` channel for schema result. A new channel with a
        single `Schema` artifact is created when this is not supplied.
      stats: Backwards compatibility alias for the 'statistics' argument; only
        used when `statistics` is not supplied.
      instance_name: Optional name assigned to this specific instance of
        SchemaGen.  Required only if multiple SchemaGen components are declared
        in the same pipeline.
    """
    # Fall back to the legacy alias only when `statistics` was not provided.
    if not statistics:
      statistics = stats
    if not output:
      output = types.Channel(
          type=standard_artifacts.Schema,
          artifacts=[standard_artifacts.Schema()])

    schema_gen_spec = SchemaGenSpec(
        stats=statistics,
        infer_feature_shape=infer_feature_shape,
        output=output)
    super().__init__(spec=schema_gen_spec, instance_name=instance_name)
Example #5
0
    def testComponentCustomExecutor(self):
        """A custom executor spec overrides the default and is type-checked."""

        class EmptyComponentSpec(types.ComponentSpec):
            PARAMETERS = {}
            INPUTS = {}
            OUTPUTS = {}

        class MyComponent(base_component.BaseComponent):

            SPEC_CLASS = EmptyComponentSpec
            EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(
                base_executor.BaseExecutor)

        class MyCustomExecutor(base_executor.BaseExecutor):
            pass

        # A valid ExecutorSpec passed as `custom_executor_spec` replaces the
        # class-level EXECUTOR_SPEC on the instance.
        custom_executor_component = MyComponent(
            spec=EmptyComponentSpec(),
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                MyCustomExecutor))
        self.assertEqual(
            custom_executor_component.executor_spec.executor_class,
            MyCustomExecutor)

        # `assertRaisesRegexp` is a deprecated alias that was removed in
        # Python 3.12; `assertRaisesRegex` is the supported spelling.
        with self.assertRaisesRegex(TypeError,
                                    "should be an instance of ExecutorSpec"):
            MyComponent(spec=EmptyComponentSpec(), custom_executor_spec=object)
Example #6
0
class TrainGPT2(base_component.BaseComponent):
    """Component wrapper that builds a `TrainGPT2Spec` and binds `Executor`."""

    SPEC_CLASS = TrainGPT2Spec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 dataset_dir: types.Channel,
                 checkpoint_dir: types.Channel,
                 encoding_dir: types.Channel,
                 model_name: Text,
                 train_config: Dict,
                 encoding: Text = 'utf-8',
                 end_token: Text = ""):
        """Build a TrainGPT2 component.

        Args:
          dataset_dir: Channel forwarded to the spec as `dataset_dir`.
          checkpoint_dir: Channel forwarded to the spec as `checkpoint_dir`.
          encoding_dir: Channel forwarded to the spec as `encoding_dir`.
          model_name: Forwarded to the spec as `model_name`.
          train_config: Dict forwarded to the spec as `train_config`.
          encoding: Forwarded to the spec as `encoding`.
          end_token: Forwarded to the spec as `end_token`.
        """
        # Each of these spec entries gets its own channel created with
        # external_input("TrainGPT2"); creation order matches the names below.
        generated_channel_names = (
            "trained_checkpoint_dir",
            "sample_dir",
            "tensorboard_dir",
            "hyperparameter_dir",
            "metric_dir",
        )
        generated_channels = {
            name: external_input("TrainGPT2")
            for name in generated_channel_names
        }

        train_spec = TrainGPT2Spec(
            dataset_dir=dataset_dir,
            checkpoint_dir=checkpoint_dir,
            encoding_dir=encoding_dir,
            model_name=model_name,
            train_config=train_config,
            encoding=encoding,
            end_token=end_token,
            **generated_channels,
        )

        super().__init__(spec=train_spec)
Example #7
0
class ExampleValidator(base_component.BaseComponent):
    """An ExampleValidator component for examples.

    TFX has its own ExampleValidator component, and this one uses the same
    executor. The reason for this one to exist is that the TFX component does
    not allow specifying the splits to use; it just assumes `train` and
    `eval`. This component will be unnecessary once TFX Transform allows
    setting the input and output splits as other components do.
    """

    SPEC_CLASS = ExampleValidatorSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 statistics: Optional[types.Channel] = None,
                 schema: Optional[types.Channel] = None,
                 output: Optional[types.Channel] = None,
                 stats: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build an ExampleValidator component.

        Args:
          statistics: Channel forwarded to the spec as `statistics`.
          schema: Channel forwarded to the spec as `schema`.
          output: Optional channel for the anomalies output. When omitted (or
            falsy), a new channel with a single `ExampleAnomalies` artifact
            is created.
          stats: Deprecated alias for `statistics`. When supplied it takes
            precedence over `statistics` and a deprecation warning is logged.
          instance_name: Optional instance name forwarded to the base
            component.
        """
        if stats:
            # The message names this component; it previously referred to
            # StatisticsGen, which was misleading for users of this class.
            logging.warning(
                'The "stats" argument to the ExampleValidator component has '
                'been renamed to "statistics" and is deprecated. Please update'
                ' your usage as support for this argument will be removed'
                ' soon.')
            statistics = stats
        anomalies = output or types.Channel(
            type=standard_artifacts.ExampleAnomalies,
            artifacts=[standard_artifacts.ExampleAnomalies()])
        spec = ExampleValidatorSpec(statistics=statistics,
                                    schema=schema,
                                    anomalies=anomalies)
        super(ExampleValidator, self).__init__(spec=spec,
                                               instance_name=instance_name)
class IndexEvaluator(base_component.BaseComponent):
    """Component that builds an `IndexEvaluatorSpec` for the ScaNN executor."""

    SPEC_CLASS = IndexEvaluatorSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(
        ScaNNIndexEvaluatorExecutor)

    def __init__(self,
                 examples: types.Channel,
                 schema: types.Channel,
                 model: types.Channel,
                 min_recall: float,
                 max_latency: float,
                 evaluation: Optional[types.Channel] = None,
                 blessing: Optional[types.Channel] = None,
                 instance_name: Optional[str] = None):
        """Build an IndexEvaluator component.

        Args:
          examples: Channel forwarded to the spec as `examples`.
          schema: Channel forwarded to the spec as `schema`.
          model: Channel forwarded to the spec as `model`.
          min_recall: Forwarded to the spec as `min_recall`; presumably the
            recall threshold the index must reach -- confirm with the
            executor.
          max_latency: Forwarded to the spec as `max_latency`; presumably the
            latency ceiling for the index -- confirm with the executor.
          evaluation: Optional output channel. When omitted (or falsy), a new
            channel with a single `ModelEvaluation` artifact is created.
          blessing: Optional output channel. When omitted (or falsy), a new
            channel with a single `ModelBlessing` artifact is created.
          instance_name: Optional instance name forwarded to the base
            component.
        """
        # The input annotations above name the `types.Channel` class; they
        # previously read `types.channel`, which is not that class.
        blessing = blessing or types.Channel(
            type=standard_artifacts.ModelBlessing,
            artifacts=[standard_artifacts.ModelBlessing()])

        evaluation = evaluation or types.Channel(
            type=standard_artifacts.ModelEvaluation,
            artifacts=[standard_artifacts.ModelEvaluation()])

        spec = IndexEvaluatorSpec(examples=examples,
                                  schema=schema,
                                  model=model,
                                  evaluation=evaluation,
                                  blessing=blessing,
                                  min_recall=min_recall,
                                  max_latency=max_latency)

        super().__init__(spec=spec, instance_name=instance_name)
Example #9
0
class CloudAIBulkInferrerComponent(base_component.BaseComponent):
  """A Cloud AI component to do batch inference on a remote hosted model.

  BulkInferrer component will push a model to Google Cloud AI Platform,
  consume examples data, send requests to the remote hosted model,
  and produce the inference results to an external location
  as PredictionLog proto. After inference, it will delete the model from
  Google Cloud AI Platform.

  TODO(b/155325467): Creates a end-to-end test for this component.
  """

  SPEC_CLASS = CloudAIBulkInferrerComponentSpec
  EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel = None,
               model: Optional[types.Channel] = None,
               model_blessing: Optional[types.Channel] = None,
               data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                         Dict[Text, Any]]] = None,
               custom_config: Dict[Text, Any] = None,
               inference_result: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Construct a BulkInferrer component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a Trainer component.
      model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
        usually produced by a ModelValidator component.
      data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
        selection. If any field is provided as a RuntimeParameter, data_spec
        should be constructed as a dict with the same field names as DataSpec
        proto message. A default `DataSpec()` is used when not supplied.
      custom_config: A dict which contains the deployment job parameters to be
        passed to Google Cloud AI Platform.
        custom_config.ai_platform_serving_args need to contain the serving job
        parameters. For the full set of parameters, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models
        The dict is serialized with `json_utils.dumps` before being stored in
        the spec.
      inference_result: Channel of type `standard_artifacts.InferenceResult`
        to store the inference results. Created when not supplied.
      instance_name: Optional name assigned to this specific instance of
        BulkInferrer. Required only if multiple BulkInferrer components are
        declared in the same pipeline.
    """
    if not inference_result:
      inference_result = types.Channel(
          type=standard_artifacts.InferenceResult,
          artifacts=[standard_artifacts.InferenceResult()])
    if not data_spec:
      data_spec = bulk_inferrer_pb2.DataSpec()
    serialized_custom_config = json_utils.dumps(custom_config)

    inferrer_spec = CloudAIBulkInferrerComponentSpec(
        examples=examples,
        model=model,
        model_blessing=model_blessing,
        data_spec=data_spec,
        custom_config=serialized_custom_config,
        inference_result=inference_result)
    super().__init__(spec=inferrer_spec, instance_name=instance_name)
Example #10
0
class _FakeComponent(base_component.BaseComponent):
    """Minimal component that forwards a ready-made spec to the base class."""

    SPEC_CLASS = types.ComponentSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(base_executor.BaseExecutor)

    def __init__(self, spec: types.ComponentSpec):
        """Initialize the component with the given `spec` and nothing else."""
        super().__init__(spec=spec)
 def testAIPlatformTrainerPipeline(self):
     """Trainer-only test pipeline on AI Platform Training."""
     run_suffix = self._random_id()
     pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(run_suffix)

     # Build the Trainer inputs one by one, in the order the Trainer
     # receives them.
     trainer_executor_spec = executor_spec.ExecutorClassSpec(
         ai_platform_trainer_executor.Executor)
     transformed_examples = self._input_artifacts(
         pipeline_name, self._test_transformed_examples)
     schema = self._input_artifacts(pipeline_name, self._test_schema)
     transform_output = self._input_artifacts(
         pipeline_name, self._test_transform_graph)
     train_args = trainer_pb2.TrainArgs(num_steps=10000)
     eval_args = trainer_pb2.EvalArgs(num_steps=5000)
     training_args = {
         'project': self._gcp_project_id,
         'region': self._gcp_region,
         'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'),
         'masterConfig': {
             'imageUri': self._container_image,
         },
     }

     trainer = Trainer(
         custom_executor_spec=trainer_executor_spec,
         module_file=self._taxi_module_file,
         transformed_examples=transformed_examples,
         schema=schema,
         transform_output=transform_output,
         train_args=train_args,
         eval_args=eval_args,
         custom_config={'ai_platform_training_args': training_args})

     pipeline = self._create_pipeline(pipeline_name, [trainer])
     self._compile_and_run_pipeline(pipeline)
 def testAIPlatformTrainerPipeline(self):
     """Trainer-only test pipeline on AI Platform Training."""
     run_suffix = self._random_id()
     pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(run_suffix)

     # The Trainer consumes the `result` output of each importer node.
     trainer_executor_spec = executor_spec.ExecutorClassSpec(
         ai_platform_trainer_executor.Executor)
     transformed_examples = (
         self.transformed_examples_importer.outputs['result'])
     schema = self.schema_importer.outputs['result']
     transform_graph = self.transform_graph_importer.outputs['result']
     train_args = trainer_pb2.TrainArgs(num_steps=10)
     eval_args = trainer_pb2.EvalArgs(num_steps=5)
     training_args = {
         'project': self._gcp_project_id,
         'region': self._gcp_region,
         'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'),
         'masterConfig': {
             'imageUri': self._container_image,
         },
     }

     trainer = Trainer(
         custom_executor_spec=trainer_executor_spec,
         module_file=self._taxi_module_file,
         transformed_examples=transformed_examples,
         schema=schema,
         transform_graph=transform_graph,
         train_args=train_args,
         eval_args=eval_args,
         custom_config={
             ai_platform_trainer_executor.TRAINING_ARGS: training_args,
         })

     pipeline_components = [
         self.schema_importer,
         self.transformed_examples_importer,
         self.transform_graph_importer,
         trainer,
     ]
     pipeline = self._create_pipeline(pipeline_name, pipeline_components)
     self._compile_and_run_pipeline(pipeline)
Example #13
0
class UpdateMongoNews(base_component.BaseComponent):
    """Component that builds an `UpdateMongoNewsSpec` from connection settings.

    Connection settings that are not supplied fall back to fixed defaults
    (`ip`, `port`) or to environment variables (`username`, `password`,
    `dbname`).
    """

    SPEC_CLASS = UpdateMongoNewsSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 ip: Text = None,
                 port: Text = None,
                 username: Text = None,
                 password: Text = None,
                 dbname: Text = None,
                 updated_collections: List = None,
                 update_collections: List = None):
        """Build an UpdateMongoNews component.

        Args:
          ip: Host forwarded to the spec; defaults to "mongo" when falsy.
          port: Port forwarded to the spec; defaults to "27017" when falsy.
          username: User name; read from `MONGO_ROOT_USER` when falsy.
          password: Password; read from `MONGO_ROOT_PASSWORD` when falsy.
          dbname: Database name; read from `MONGO_DATABASE_NAME` when falsy.
          updated_collections: List forwarded to the spec as
            `updated_collections`; a new empty list when omitted.
          update_collections: List forwarded to the spec as
            `update_collections`; a new empty list when omitted.

        Raises:
          KeyError: If a credential is not supplied and the corresponding
            environment variable is not set.
        """
        # Use a fresh list per instance. A `[]` default in the signature is
        # created once and would be shared by every component built without
        # these arguments.
        if updated_collections is None:
            updated_collections = []
        if update_collections is None:
            update_collections = []

        if not ip:
            ip = "mongo"
        if not port:
            port = "27017"
        # The environment is only consulted for values the caller omitted.
        if not username:
            username = os.environ['MONGO_ROOT_USER']
        if not password:
            password = os.environ['MONGO_ROOT_PASSWORD']
        if not dbname:
            dbname = os.environ['MONGO_DATABASE_NAME']

        spec = UpdateMongoNewsSpec(ip=ip,
                                   port=port,
                                   username=username,
                                   password=password,
                                   dbname=dbname,
                                   update_collections=update_collections,
                                   updated_collections=updated_collections,
                                   backup_dir="")

        super(UpdateMongoNews, self).__init__(spec=spec)
 def testAIPlatformGenericTrainerPipeline(self):
   """Trainer-only pipeline on AI Platform Training with GenericTrainer."""
   run_suffix = self._random_id()
   pipeline_name = 'kubeflow-aip-generic-trainer-test-{}'.format(run_suffix)

   # The Trainer consumes the `result` output of each importer node.
   generic_executor_spec = executor_spec.ExecutorClassSpec(
       ai_platform_trainer_executor.GenericExecutor)
   transformed_examples = self.transformed_examples_importer.outputs['result']
   schema = self.schema_importer.outputs['result']
   transform_graph = self.transform_graph_importer.outputs['result']
   train_args = trainer_pb2.TrainArgs(num_steps=10)
   eval_args = trainer_pb2.EvalArgs(num_steps=5)
   caip_training_args = self.getCaipTrainingArgs(pipeline_name)

   trainer = Trainer(
       custom_executor_spec=generic_executor_spec,
       module_file=self._trainer_module,
       transformed_examples=transformed_examples,
       schema=schema,
       transform_graph=transform_graph,
       train_args=train_args,
       eval_args=eval_args,
       custom_config={
           ai_platform_trainer_executor.TRAINING_ARGS_KEY: caip_training_args,
       })

   pipeline_components = [
       self.schema_importer,
       self.transformed_examples_importer,
       self.transform_graph_importer,
       trainer,
   ]
   pipeline = self._create_pipeline(pipeline_name, pipeline_components)
   self._compile_and_run_pipeline(pipeline)
   self.assertNumberOfTrainerOutputIsOne(pipeline_name)
Example #15
0
class MongoImport(base_component.BaseComponent):
    """Component that builds a `MongoImportSpec` from connection settings."""

    SPEC_CLASS = MongoImportSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 rss_feed: types.Channel,
                 colname: Text,
                 ip: Text = None,
                 port: Text = None,
                 username: Text = None,
                 password: Text = None,
                 dbname: Text = None):
        """Build a MongoImport component.

        Args:
          rss_feed: Channel forwarded to the spec as `rss_feed`.
          colname: Forwarded to the spec as `colname`.
          ip: Host; "127.0.0.1" is used when falsy.
          port: Port; "27017" is used when falsy.
          username: User name; read from `MONGO_ROOT_USER` when falsy.
          password: Password; read from `MONGO_ROOT_PASSWORD` when falsy.
          dbname: Database name; read from `MONGO_DATABASE_NAME` when falsy.
        """
        # `or` short-circuits, so the environment is only consulted for
        # values the caller left out.
        host = ip or "127.0.0.1"
        port_number = port or "27017"
        user = username or os.environ['MONGO_ROOT_USER']
        secret = password or os.environ['MONGO_ROOT_PASSWORD']
        database = dbname or os.environ['MONGO_DATABASE_NAME']

        import_spec = MongoImportSpec(
            ip=host,
            port=port_number,
            username=user,
            password=secret,
            dbname=database,
            rss_feed=rss_feed,
            colname=colname,
        )

        super().__init__(spec=import_spec)
Example #16
0
class TestPredComponent(base_component.BaseComponent):
    """Custom TFX component consisting of only a constructor.

    The constructor assembles a `TestPredComponentSpec`; execution is handled
    by `executor.Executor`.
    """

    SPEC_CLASS = TestPredComponentSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

    def __init__(self,
                 examples: types.Channel = None,
                 model: types.Channel = None,
                 output_data: types.Channel = None):
        """Construct a TestPredComponent.

        Args:
          examples: Channel forwarded to the spec as `examples`.
          model: Channel forwarded to the spec as `model`.
          output_data: Output channel. When omitted (or falsy), a channel
            wrapping a new `standard_artifacts.Examples` artifact is created.
        """
        output_channel = output_data or channel_utils.as_channel(
            [standard_artifacts.Examples()])

        component_spec = TestPredComponentSpec(
            examples=examples,
            model=model,
            output_data=output_channel,
        )
        super().__init__(spec=component_spec)
Example #17
0
 def testCanLaunch(self):
     """The Docker launcher accepts container specs and rejects class specs."""
     launcher_cls = docker_component_launcher.DockerComponentLauncher

     container_spec = executor_spec.ExecutorContainerSpec(image='test')
     self.assertTrue(launcher_cls.can_launch(container_spec))

     class_spec = executor_spec.ExecutorClassSpec(base_executor.BaseExecutor)
     self.assertFalse(launcher_cls.can_launch(class_spec))
Example #18
0
class HelloComponent(base_component.BaseComponent):
    """Custom TFX Hello World Component.

    This custom component class consists of only a constructor.
    """

    SPEC_CLASS = HelloComponentSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

    def __init__(self,
                 input_data: types.Channel = None,
                 output_data: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct a HelloComponent.

        Args:
          input_data: A Channel of type `standard_artifacts.InferenceResult`.
          output_data: A Channel of type `standard_artifacts.ExternalArtifact`.
            When omitted (or falsy), a channel wrapping a new
            `ExternalArtifact` is created.
          instance_name: Optional unique name. Necessary if multiple Hello
            components are declared in the same pipeline.
        """
        output_channel = output_data or channel_utils.as_channel(
            [standard_artifacts.ExternalArtifact()])

        hello_spec = HelloComponentSpec(
            input_data=input_data,
            output_data=output_channel,
        )
        super().__init__(spec=hello_spec, instance_name=instance_name)
Example #19
0
class OldNewsImport(base_component.BaseComponent):
    """Component that builds an `OldNewsImportSpec` from connection settings."""

    SPEC_CLASS = OldNewsImportSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(Executor)

    def __init__(self,
                 backup_dir: Text,
                 ip: Text = None,
                 port: Text = None,
                 username: Text = None,
                 password: Text = None,
                 dbname: Text = None):
        """Build an OldNewsImport component.

        Args:
          backup_dir: Forwarded to the spec as `backup_dir`.
          ip: Host; "mongo" is used when falsy.
          port: Port; "27017" is used when falsy.
          username: User name; read from `MONGO_ROOT_USER` when falsy.
          password: Password; read from `MONGO_ROOT_PASSWORD` when falsy.
          dbname: Database name; read from `MONGO_DATABASE_NAME` when falsy.
        """
        # `or` short-circuits, so the environment is only consulted for
        # values the caller left out.
        host = ip or "mongo"
        port_number = port or "27017"
        user = username or os.environ['MONGO_ROOT_USER']
        secret = password or os.environ['MONGO_ROOT_PASSWORD']
        database = dbname or os.environ['MONGO_DATABASE_NAME']

        import_spec = OldNewsImportSpec(
            ip=host,
            port=port_number,
            username=user,
            password=secret,
            dbname=database,
            backup_dir=backup_dir,
        )

        super().__init__(spec=import_spec)
Example #20
0
    def __init__(
        self,
        instance_name: Optional[Text] = None,
        executor_spec: Optional[executor_spec_module.ExecutorSpec] = None,
        driver_class: Optional[Type[base_driver.BaseDriver]] = None,
    ):
        """Initialize a node.

        Args:
          instance_name: Optional unique identifying name for this instance of
            node in the pipeline. Required if two instances of the same node
            are used in the pipeline.
          executor_spec: Optional instance of executor_spec.ExecutorSpec which
            describes how to execute this node. Defaults to a class spec
            wrapping `base_executor.EmptyExecutor`, which indicates a no-op.
          driver_class: Optional subclass of base_driver.BaseDriver as a custom
            driver for this node. Defaults to base_driver.BaseDriver. Nodes
            usually use the default driver class, but may override it.
        """
        self._instance_name = instance_name
        # `is not None` (rather than truthiness) decides whether a default
        # is substituted.
        self.executor_spec = (
            executor_spec if executor_spec is not None else
            executor_spec_module.ExecutorClassSpec(base_executor.EmptyExecutor))
        self.driver_class = (
            driver_class if driver_class is not None else
            base_driver.BaseDriver)
        self._upstream_nodes = set()
        self._downstream_nodes = set()
Example #21
0
class _QueryBasedExampleGen(base_component.BaseComponent):
  """A TFX component to ingest examples from a query based system.

  The _QueryBasedExampleGen component can be extended to ingest examples from
  query based systems such as Presto or Bigquery. The component will also
  convert the input data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  and generate train and eval example splits for downstream components.

  ## Example
  ```
  _query = "SELECT * FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`"
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=_query)
  ```
  """

  SPEC_CLASS = QueryBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(base_executor.BaseExecutor)

  def __init__(self,
               input_config: example_gen_pb2.Input,
               output_config: Optional[example_gen_pb2.Output] = None,
               custom_config: Optional[example_gen_pb2.CustomConfig] = None,
               example_artifacts: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of 2:1.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen.
      example_artifacts: Channel of 'ExamplesPath' for output train and
        eval examples.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    # Configure outputs.
    if not output_config:
      output_config = utils.make_default_output_config(input_config)
    if not example_artifacts:
      # One Examples artifact per output split name.
      split_names = utils.generate_output_split_names(
          input_config, output_config)
      example_artifacts = channel_utils.as_channel([
          standard_artifacts.Examples(split=split_name)
          for split_name in split_names
      ])

    example_gen_spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super().__init__(spec=example_gen_spec, instance_name=instance_name)
Example #22
0
    def testRun(self, mock_publisher):
        """Launch an Avro-backed FileBasedExampleGen and check its outputs."""
        mock_publisher.return_value.publish_execution.return_value = {}

        avro_executor_spec = executor_spec.ExecutorClassSpec(
            avro_executor.Executor)
        example_gen = FileBasedExampleGen(
            custom_executor_spec=avro_executor_spec,
            input=external_input(self.avro_dir_path),
            input_config=self.input_config,
            output_config=self.output_config,
            instance_name='AvroExampleGen')

        # Pipeline root lives under the test's output directory.
        base_output_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                         self.get_temp_dir())
        output_data_dir = os.path.join(base_output_dir, self._testMethodName)
        pipeline_root = os.path.join(output_data_dir, 'Test')
        tf.io.gfile.makedirs(pipeline_root)
        pipeline_info = data_types.PipelineInfo(
            pipeline_name='Test',
            pipeline_root=pipeline_root,
            run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        # Metadata connection configured with the SQLite option set.
        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()
        metadata_connection = metadata.Metadata(connection_config)

        launcher_cls = in_process_component_launcher.InProcessComponentLauncher
        launcher = launcher_cls.create(
            component=example_gen,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection=metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        expected_component_type = '.'.join(
            [FileBasedExampleGen.__module__, FileBasedExampleGen.__name__])
        self.assertEqual(launcher._component_info.component_type,
                         expected_component_type)

        launcher.launch()
        mock_publisher.return_value.publish_execution.assert_called_once()

        # Get output paths.
        component_id = example_gen.id
        output_path = os.path.join(pipeline_root, component_id, 'examples/1')
        examples = standard_artifacts.Examples()
        examples.uri = output_path
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        # Check Avro example gen outputs.
        shard_file_name = 'data_tfrecord-00000-of-00001.gz'
        train_output_file = os.path.join(examples.uri, 'train',
                                         shard_file_name)
        eval_output_file = os.path.join(examples.uri, 'eval', shard_file_name)
        self.assertTrue(tf.io.gfile.exists(train_output_file))
        self.assertTrue(tf.io.gfile.exists(eval_output_file))
        train_size = tf.io.gfile.GFile(train_output_file).size()
        eval_size = tf.io.gfile.GFile(eval_output_file).size()
        self.assertGreater(train_size, eval_size)
Example #23
0
class BulkInferrer(base_component.BaseComponent):
    """A TFX component to do batch inference on a model with unlabelled examples.

    BulkInferrer consumes examples data and a model, and produces the
    inference results to an external location as PredictionLog proto.

    BulkInferrer will infer on validated model.

    ## Example
    ```
      # Uses BulkInferrer to run inference on examples.
      bulk_inferrer = BulkInferrer(
          examples=example_gen.outputs['examples'],
          model_export=trainer.outputs['output'])
    ```
    """

    SPEC_CLASS = BulkInferrerSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

    def __init__(self,
                 examples: Optional[types.Channel] = None,
                 model_export: Optional[types.Channel] = None,
                 model_blessing: Optional[types.Channel] = None,
                 data_spec: Optional[bulk_inferrer_pb2.DataSpec] = None,
                 model_spec: Optional[bulk_inferrer_pb2.ModelSpec] = None,
                 output: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct a BulkInferrer component.

        Args:
          examples: A Channel of 'ExamplesPath' type, usually produced by
            ExampleGen component. _required_, despite the `None` default in
            the signature.
          model_export: A Channel of 'ModelExportPath' type, usually produced
            by Trainer component.
          model_blessing: A Channel of 'ModelBlessingPath' type, usually
            produced by Model Validator component.
          data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
            selection. A default-constructed `DataSpec` is used when omitted.
          model_spec: bulk_inferrer_pb2.ModelSpec instance that describes
            model specification. A default-constructed `ModelSpec` is used
            when omitted.
          output: Channel of `InferenceResult` to store the inference results.
            A new channel holding one `InferenceResult` artifact is created
            when omitted.
          instance_name: Optional name assigned to this specific instance of
            BulkInferrer. Required only if multiple BulkInferrer components
            are declared in the same pipeline.
        """
        # Fall back to a fresh output channel when the caller supplies none.
        if not output:
            output = types.Channel(
                type=standard_artifacts.InferenceResult,
                artifacts=[standard_artifacts.InferenceResult()])

        # The spec always receives proto instances, never None.
        if not data_spec:
            data_spec = bulk_inferrer_pb2.DataSpec()
        if not model_spec:
            model_spec = bulk_inferrer_pb2.ModelSpec()

        spec = BulkInferrerSpec(
            examples=examples,
            model_export=model_export,
            model_blessing=model_blessing,
            data_spec=data_spec,
            model_spec=model_spec,
            output=output)
        super().__init__(spec=spec, instance_name=instance_name)
Example #24
0
 def testConstructNoDestinationCustomExecutor(self):
   # A Pusher built with a custom executor spec and no push destination must
   # still expose its 'model_push' output with the 'ModelPushPath' type name.
   custom_spec = executor_spec.ExecutorClassSpec(
       self._MyCustomPusherExecutor)
   pusher = component.Pusher(model_export=self.model_export,
                             model_blessing=self.model_blessing,
                             custom_executor_spec=custom_spec)
   push_channel = pusher.outputs['model_push']
   self.assertEqual('ModelPushPath', push_channel.type_name)
Example #25
0
 def testConstructCustomExecutor(self):
     # Supplying a custom executor spec must leave the driver class and the
     # type of the 'examples' output unchanged.
     custom_spec = executor_spec.ExecutorClassSpec(TestExampleGenExecutor)
     example_gen = component.FileBasedExampleGen(
         input_base='path', custom_executor_spec=custom_spec)

     self.assertEqual(driver.Driver, example_gen.driver_class)

     examples_channel = example_gen.outputs['examples']
     self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                      examples_channel.type_name)
Example #26
0
class EmptyComponent(base_component.BaseComponent):
    """Component built from an argument-less `EmptyComponentSpec`.

    Declares `base_executor.BaseExecutor` as its executor class.
    """

    SPEC_CLASS = EmptyComponentSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(base_executor.BaseExecutor)

    def __init__(self, name):
        """Create the component.

        Args:
          name: Forwarded to the base class as `instance_name`.
        """
        empty_spec = EmptyComponentSpec()
        super(EmptyComponent, self).__init__(
            spec=empty_spec,
            instance_name=name,
        )
Example #27
0
 def testEnableCache(self):
     # `enable_cache` is None when not passed and True when passed as True.
     input_base = standard_artifacts.ExternalArtifact()
     custom_config = example_gen_pb2.CustomConfig(
         custom_config=any_pb2.Any())

     def build_example_gen(**extra_kwargs):
         # Both components share every argument except `enable_cache`.
         return component.FileBasedExampleGen(
             input=channel_utils.as_channel([input_base]),
             custom_config=custom_config,
             custom_executor_spec=executor_spec.ExecutorClassSpec(
                 TestExampleGenExecutor),
             **extra_kwargs)

     example_gen_1 = build_example_gen()
     self.assertEqual(None, example_gen_1.enable_cache)

     example_gen_2 = build_example_gen(enable_cache=True)
     self.assertEqual(True, example_gen_2.enable_cache)
Example #28
0
class Tuner(tuner_component.Tuner):
    """TFX component for model hyperparameter tuning on AI Platform Training.

    Subclasses `tuner_component.Tuner` and only replaces the class-level
    `EXECUTOR_SPEC`; nothing else is overridden here.
    """

    # TODO(b/160260359): Decide if custom_executor_spec should be added to
    #                    TunerSpec, or deprecate other use of custom_executor_spec
    #                    and the interface to swap Executor for a component
    #                    entirely, to standardize around custom components.
    # NOTE(review): `executor` is presumably this package's AI Platform
    # executor module rather than the base Tuner's -- confirm against imports.
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)
Example #29
0
class Tuner(base_component.BaseComponent):
    """TFX component that tunes model hyperparameters."""

    SPEC_CLASS = TunerSpec
    EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor)

    def __init__(self,
                 examples: types.Channel = None,
                 schema: types.Channel = None,
                 module_file: Optional[Text] = None,
                 tuner_fn: Optional[Text] = None,
                 model: Optional[types.Channel] = None,
                 best_hyperparameters: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build a Tuner component.

        Args:
          examples: Channel of type `standard_artifacts.Examples` providing
            the examples used for tuning (required). Transformed examples
            are not yet supported.
          schema: Channel of type `standard_artifacts.Schema` describing the
            training and eval data.
          module_file: Path to a python module file holding the UDF
            KerasTuner definition. The module must define a top-level
            function named `tuner_fn` that takes working dir path, train
            data path, eval data path and
            tensorflow_metadata.proto.v0.schema_pb2.Schema, and returns a
            namedtuple TunerFnResult containing:
            - 'tuner': A KerasTuner that will be used for tuning.
            - 'train_dataset': A tf.data.Dataset of training data.
            - 'eval_dataset': A tf.data.Dataset of eval data.
            Exactly one of 'module_file' or 'tuner_fn' must be supplied.
          tuner_fn: Python path to the UDF model definition function, with
            the signature described under 'module_file'. Exactly one of
            'module_file' or 'tuner_fn' must be supplied.
          model: Optional Channel of type `standard_artifacts.Model` for the
            result of the best model. Created when omitted.
          best_hyperparameters: Optional Channel of type
            `standard_artifacts.HyperParameters` for the result of the best
            hparams. Created when omitted.
          instance_name: Optional unique instance name. Necessary if multiple
            Tuner components are declared in the same pipeline.

        Raises:
          ValueError: If both or neither of 'module_file' and 'tuner_fn' are
            supplied.
        """
        # Exactly one source of the tuner definition may be truthy.
        sources_given = [bool(module_file), bool(tuner_fn)]
        if sources_given.count(True) != 1:
            raise ValueError(
                "Exactly one of 'module_file' or 'tuner_fn' must be supplied")

        # Create default output channels for any the caller did not pass.
        if not model:
            model = types.Channel(
                type=standard_artifacts.Model,
                artifacts=[standard_artifacts.Model()])
        if not best_hyperparameters:
            best_hyperparameters = types.Channel(
                type=standard_artifacts.HyperParameters,
                artifacts=[standard_artifacts.HyperParameters()])

        # The spec names the model output `model_export_path`.
        tuner_spec = TunerSpec(
            examples=examples,
            schema=schema,
            module_file=module_file,
            tuner_fn=tuner_fn,
            model_export_path=model,
            best_hyperparameters=best_hyperparameters)
        super(Tuner, self).__init__(
            spec=tuner_spec, instance_name=instance_name)
Example #30
0
 def testConstructNoDestinationCustomExecutor(self):
     # A Pusher built with a custom executor spec and no push destination
     # must still expose a 'pushed_model' output of the PushedModel type.
     custom_spec = executor_spec.ExecutorClassSpec(
         self._MyCustomPusherExecutor)
     pusher = component.Pusher(model=self.model,
                               model_blessing=self.model_blessing,
                               custom_executor_spec=custom_spec)
     pushed_channel = pusher.outputs['pushed_model']
     self.assertEqual(standard_artifacts.PushedModel.TYPE_NAME,
                      pushed_channel.type_name)