Example #1
    def generate_models(self, args):
        # Modified version of Chicago Taxi Example pipeline
        # tfx/examples/chicago_taxi_pipeline/taxi_pipeline_beam.py

        root = tempfile.mkdtemp()
        pipeline_root = os.path.join(root, "pipeline")
        metadata_path = os.path.join(root, "metadata/metadata.db")
        module_file = os.path.join(
            os.path.dirname(__file__),
            "../../../examples/chicago_taxi_pipeline/taxi_utils.py")

        examples = external_input(os.path.dirname(self.dataset_path()))
        example_gen = components.ImportExampleGen(input=examples)
        statistics_gen = components.StatisticsGen(
            examples=example_gen.outputs["examples"])
        schema_gen = components.SchemaGen(
            statistics=statistics_gen.outputs["statistics"],
            infer_feature_shape=False)
        transform = components.Transform(
            examples=example_gen.outputs["examples"],
            schema=schema_gen.outputs["schema"],
            module_file=module_file)
        trainer = components.Trainer(
            module_file=module_file,
            transformed_examples=transform.outputs["transformed_examples"],
            schema=schema_gen.outputs["schema"],
            transform_graph=transform.outputs["transform_graph"],
            train_args=trainer_pb2.TrainArgs(num_steps=100),
            eval_args=trainer_pb2.EvalArgs(num_steps=50))
        p = pipeline.Pipeline(
            pipeline_name="chicago_taxi_beam",
            pipeline_root=pipeline_root,
            components=[
                example_gen, statistics_gen, schema_gen, transform, trainer
            ],
            enable_cache=True,
            metadata_connection_config=metadata.sqlite_metadata_connection_config(
                metadata_path))
        BeamDagRunner().run(p)

        def join_unique_subdir(path):
            dirs = os.listdir(path)
            if len(dirs) != 1:
                raise ValueError(
                    "expecting there to be only one subdirectory in %s, but "
                    "subdirectories were: %s" % (path, dirs))
            return os.path.join(path, dirs[0])

        trainer_output_dir = join_unique_subdir(
            os.path.join(pipeline_root, "Trainer/output"))
        eval_model_dir = join_unique_subdir(
            os.path.join(trainer_output_dir, "eval_model_dir"))
        serving_model_dir = join_unique_subdir(
            os.path.join(trainer_output_dir,
                         "serving_model_dir/export/chicago-taxi"))

        shutil.rmtree(self.trained_saved_model_path(), ignore_errors=True)
        shutil.rmtree(self.tfma_saved_model_path(), ignore_errors=True)
        shutil.copytree(serving_model_dir, self.trained_saved_model_path())
        shutil.copytree(eval_model_dir, self.tfma_saved_model_path())
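Not part of the original example: a minimal sketch, assuming TensorFlow 2.x and a placeholder path standing in for whatever trained_saved_model_path() returns, of verifying that the copied serving SavedModel loads and exposes a signature.

import tensorflow as tf  # assumed TF 2.x

# Hypothetical path: wherever trained_saved_model_path() copied the serving model.
serving_model_path = "/tmp/trained_saved_model"
loaded = tf.saved_model.load(serving_model_path)
print(list(loaded.signatures.keys()))  # typically includes 'serving_default'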
Example #2
def create_pipeline_components(
    pipeline_root: Text,
    transform_module: Text,
    trainer_module: Text,
    bigquery_query: Text = '',
    csv_input_location: Text = '',
) -> List[base_node.BaseNode]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """

    if bool(bigquery_query) == bool(csv_input_location):
        raise ValueError(
            'Exactly one example gen is expected. '
            'Please provide either bigquery_query or csv_input_location.')

    if bigquery_query:
        example_gen = big_query_example_gen_component.BigQueryExampleGen(
            query=bigquery_query)
    else:
        example_gen = components.CsvExampleGen(input_base=csv_input_location)

    statistics_gen = components.StatisticsGen(
        examples=example_gen.outputs['examples'])
    schema_gen = components.SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    example_validator = components.ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = components.Transform(examples=example_gen.outputs['examples'],
                                     schema=schema_gen.outputs['schema'],
                                     module_file=transform_module)
    latest_model_resolver = resolver.Resolver(
        strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
        model=channel.Channel(type=standard_artifacts.Model)).with_id(
            'Resolver.latest_model_resolver')
    trainer = components.Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Get the latest blessed model for model validation.
    model_resolver = resolver.Resolver(
        strategy_class=(
            latest_blessed_model_resolver.LatestBlessedModelResolver),
        model=channel.Channel(type=standard_artifacts.Model),
        model_blessing=channel.Channel(
            type=standard_artifacts.ModelBlessing)).with_id(
                'Resolver.latest_blessed_model_resolver')
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'binary_accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = components.Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    pusher = components.Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        latest_model_resolver, trainer, model_resolver, evaluator, pusher
    ]
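Not part of the original example: a minimal sketch, reusing the pipeline, metadata, and BeamDagRunner imports shown in Example #1, of how the returned component list could be assembled into a runnable Beam pipeline. The paths and pipeline name are hypothetical placeholders.

# Hypothetical paths and names; pipeline, metadata, and BeamDagRunner are
# assumed to be imported as in Example #1.
taxi_components = create_pipeline_components(
    pipeline_root='/tmp/taxi_pipeline',
    transform_module='taxi_utils.py',
    trainer_module='taxi_utils.py',
    csv_input_location='/tmp/taxi_data')

p = pipeline.Pipeline(
    pipeline_name='chicago_taxi_test',
    pipeline_root='/tmp/taxi_pipeline',
    components=taxi_components,
    enable_cache=True,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        '/tmp/taxi_pipeline/metadata.db'))

BeamDagRunner().run(p)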
Example #3
    def __init__(self,
                 problem_statement: ps_pb2.ProblemStatement,
                 transformed_examples: types.Channel,
                 transform_graph: types.Channel,
                 schema: types.Channel,
                 train_steps: int,
                 eval_steps: int,
                 use_keras: bool = True,
                 enable_tuning: bool = False,
                 max_sequence_length: Optional[int] = None,
                 instance_name: Optional[str] = None):
        """Constructs an AutoTrainer subpipeline.

    Args:
      problem_statement: ProblemStatement proto identifying the task.
      transformed_examples: A Channel of 'ExamplesPath' type produced from an
        upstream Transform component. The source of examples that are used in
        training and evaluation (required).
      transform_graph: An optional Channel of 'TransformPath' type, serving as
        the input transform graph if present.
      schema:  An optional Channel of 'SchemaPath' type, serving as the schema
        of training and eval data.
      train_steps: Number of steps (batches) to train for.
      eval_steps: Number of steps (batches) to evaluate.
      use_keras: When `True`, uses Keras Models, otherwise uses Estimators.
      enable_tuning: When `True`, performs hyperparameter tuning using the
        built-in `tfx.Tuner` using a tuned search-space.
      max_sequence_length: For sequential prediction tasks. When > 0, the
        trainer will produce a model that emits sequential predictions of this
        length.
      instance_name: Optional unique instance name. Necessary iff multiple Tuner
        components are declared in the same pipeline.

    Raises:
      ValueError: When a required param is not supplied.
    """

        self._instance_name = instance_name
        self._tuner = None
        if enable_tuning:
            # Search over search space of model hyperparameters.
            self._tuner = tfx.Tuner(
                tuner_fn='nitroml.automl.autotrainer.lib.auto_trainer.tuner_fn',
                examples=transformed_examples,
                transform_graph=transform_graph,
                train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
                eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
                custom_config={
                    # Pass the problem statement proto as a text proto. Required
                    # since custom_config must be JSON-serializable.
                    'problem_statement':
                    text_format.MessageToString(message=problem_statement,
                                                as_utf8=True),
                },
                instance_name=self.id)

        self._trainer = tfx.Trainer(
            run_fn='nitroml.automl.autotrainer.lib.auto_trainer.run_fn'
            if use_keras else
            'nitroml.automl.autotrainer.lib.auto_estimator_trainer.run_fn',
            custom_executor_spec=(executor_spec.ExecutorClassSpec(
                trainer_executor.GenericExecutor)),
            transformed_examples=transformed_examples,
            transform_graph=transform_graph,
            schema=schema,
            train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
            eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
            hyperparameters=self._tuner.outputs.best_hyperparameters
            if self._tuner else None,
            custom_config={
                # Pass the problem statement proto as a text proto. Required
                # since custom_config must be JSON-serializable.
                'problem_statement':
                text_format.MessageToString(message=problem_statement,
                                            as_utf8=True),
                'sequence_length':
                max_sequence_length,
            },
            instance_name=self.id)
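Not part of the original snippet: a hypothetical sketch of constructing this subpipeline, assuming the constructor belongs to a class named AutoTrainer and that Transform/SchemaGen outputs and a problem_statement proto are available as in the other examples.

# Hypothetical usage; `transform`, `schema_gen`, and `problem_statement` are
# assumed to exist as in the other examples, and `AutoTrainer` is the class
# this constructor is presumed to belong to.
auto_trainer = AutoTrainer(
    problem_statement=problem_statement,
    transformed_examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    train_steps=1000,
    eval_steps=100,
    use_keras=True,
    enable_tuning=True)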
Example #4
  def benchmark(self,
                mock_data: bool = False,
                data_dir: str = None,
                use_keras: bool = True,
                enable_tuning: bool = True):

    for i, task in enumerate(
        nitroml.suites.OpenMLCC18(data_dir, mock_data=mock_data)):

      if not mock_data and i not in range(20, 40):
        # Use only 20 of the datasets for now.
        # TODO(nikhilmehta): Create subbenchmarks for all 72 tasks.
        # Kubeflow throws a "max workflow size" error when the pipeline
        # contains too many components.
        # Track issue: https://github.com/kubeflow/pipelines/issues/4170
        continue

      with self.sub_benchmark(task.name):

        autodata = nitroml.autodata.AutoData(
            task.problem_statement,
            examples=task.train_and_eval_examples,
            preprocessor=nitroml.autodata.BasicPreprocessor())

        pipeline = task.components + autodata.components

        if enable_tuning:
          # Search over search space of model hyperparameters.
          tuner = tfx.Tuner(
              tuner_fn='examples.auto_trainer.tuner_fn',
              examples=autodata.transformed_examples,
              transform_graph=autodata.transform_graph,
              train_args=trainer_pb2.TrainArgs(num_steps=10),
              eval_args=trainer_pb2.EvalArgs(num_steps=5),
              custom_config={
                  # Pass the problem statement proto as a text proto. Required
                  # since custom_config must be JSON-serializable.
                  'problem_statement':
                      text_format.MessageToString(
                          message=task.problem_statement, as_utf8=True),
              })
          pipeline.append(tuner)

        # Define a Trainer to train our model on the given task.
        trainer = tfx.Trainer(
            run_fn='examples.auto_trainer.run_fn'
            if use_keras else 'examples.auto_estimator_trainer.run_fn',
            custom_executor_spec=(executor_spec.ExecutorClassSpec(
                trainer_executor.GenericExecutor)),
            transformed_examples=autodata.transformed_examples,
            transform_graph=autodata.transform_graph,
            schema=autodata.schema,
            train_args=trainer_pb2.TrainArgs(num_steps=10),
            eval_args=trainer_pb2.EvalArgs(num_steps=10),
            hyperparameters=(tuner.outputs.best_hyperparameters
                             if enable_tuning else None),
            custom_config={
                # Pass the problem statement proto as a text proto. Required
                # since custom_config must be JSON-serializable.
                'problem_statement':
                    text_format.MessageToString(
                        message=task.problem_statement, as_utf8=True),
            })

        pipeline.append(trainer)

        # Finally, call evaluate() on the workflow DAG outputs. This will
        # automatically append Evaluators to compute metrics from the given
        # SavedModel and 'eval' TF Examples.
        self.evaluate(
            pipeline,
            examples=task.train_and_eval_examples,
            model=trainer.outputs.model)
Example #5
    def benchmark(self,
                  algorithm: str = None,
                  mock_data: bool = False,
                  data_dir: str = None):
        # TODO(nikhilmehta): Extend this to multiple test datasets using subbenchmarks.

        train_task_names = frozenset([
            'OpenML.connect4', 'OpenML.creditapproval', 'OpenML.creditg',
            'OpenML.cylinderbands', 'OpenML.diabetes'
        ])
        test_task_names = frozenset(['OpenML.dressessales'])
        train_steps = 1000

        if mock_data:
            train_task_names = {'OpenML.mockdata_1'}
            test_task_names = {'OpenML.mockdata_2'}
            train_steps = 10

        train_tasks = []
        test_tasks = []
        for task in nitroml.suites.OpenMLCC18(data_dir, mock_data=mock_data):
            if task.name in train_task_names:
                train_tasks.append(task)
            if task.name in test_task_names:
                test_tasks.append(task)

        pipeline = []
        meta_train_data = {}
        train_autodata_list = []
        for task in train_tasks:
            # Create the autodata instance for this task, which creates the
            # Transform, StatisticsGen, and SchemaGen components.
            autodata = nitroml.autodata.AutoData(
                task.problem_statement,
                examples=task.train_and_eval_examples,
                preprocessor=nitroml.autodata.BasicPreprocessor(),
                instance_name=f'train.{task.name}')

            # Add a tuner component for each training dataset to find the
            # optimal hyperparameters.
            tuner = tuner_component.AugmentedTuner(
                tuner_fn='examples.auto_trainer.tuner_fn',
                examples=autodata.transformed_examples,
                transform_graph=autodata.transform_graph,
                train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
                eval_args=trainer_pb2.EvalArgs(num_steps=1),
                custom_config={
                    # Pass the problem statement proto as a text proto. Required
                    # since custom_config must be JSON-serializable.
                    'problem_statement':
                    text_format.MessageToString(message=task.problem_statement,
                                                as_utf8=True),
                },
                instance_name=f'train.{task.name}')
            pipeline += task.components + autodata.components + [tuner]

            train_autodata_list.append(autodata)
            meta_train_data[
                f'hparams_train_{len(train_autodata_list)}'] = tuner.outputs.best_hyperparameters

        # Construct a MetaLearningWrapper that creates the metalearning subpipeline.
        metalearner_helper = metalearning_wrapper.MetaLearningWrapper(
            train_autodata_list=train_autodata_list,
            meta_train_data=meta_train_data,
            algorithm=algorithm)
        pipeline += metalearner_helper.pipeline
        self.create_subpipeline_shared_with_subbenchmarks(pipeline)

        for task in test_tasks:
            with self.sub_benchmark(task.name):
                task_pipeline = []
                # Create the autodata instance for the test task.
                autodata = nitroml.autodata.AutoData(
                    task.problem_statement,
                    examples=task.train_and_eval_examples,
                    preprocessor=nitroml.autodata.BasicPreprocessor(),
                    instance_name=f'test.{task.name}')

                test_meta_components, best_hparams = metalearner_helper.create_test_components(
                    autodata, tuner_steps=train_steps)

                # Create a trainer component that utilizes the recommended HParams
                # from the metalearning subpipeline.
                trainer = tfx.Trainer(
                    run_fn='examples.auto_trainer.run_fn',
                    custom_executor_spec=(executor_spec.ExecutorClassSpec(
                        trainer_executor.GenericExecutor)),
                    transformed_examples=autodata.transformed_examples,
                    transform_graph=autodata.transform_graph,
                    schema=autodata.schema,
                    train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
                    eval_args=trainer_pb2.EvalArgs(num_steps=1),
                    hyperparameters=best_hparams,
                    custom_config={
                        # Pass the problem statement proto as a text proto. Required
                        # since custom_config must be JSON-serializable.
                        'problem_statement':
                        text_format.MessageToString(
                            message=task.problem_statement, as_utf8=True),
                    },
                    instance_name=f'test.{task.name}')

                task_pipeline = task.components + autodata.components + test_meta_components + [
                    trainer
                ]

                # Finally, call evaluate() on the workflow DAG outputs. This will
                # automatically append Evaluators to compute metrics from the given
                # SavedModel and 'eval' TF Examples.
                self.evaluate(task_pipeline,
                              examples=task.train_and_eval_examples,
                              model=trainer.outputs.model)
Example #6
    def benchmark(self,
                  data_dir: str = None,
                  use_keras: bool = True,
                  enable_tuning: bool = True):
        # Use TFDSTask to define the task for the titanic dataset.
        task = nitroml.tasks.TFDSTask(
            tfds.builder('titanic', data_dir=data_dir))

        autodata = nitroml.autodata.AutoData(
            task.problem_statement,
            examples=task.train_and_eval_examples,
            preprocessor=nitroml.autodata.BasicPreprocessor())

        pipeline = task.components + autodata.components

        if enable_tuning:
            # Search over search space of model hyperparameters.
            tuner = tfx.Tuner(
                tuner_fn='examples.auto_trainer.tuner_fn',
                examples=autodata.transformed_examples,
                transform_graph=autodata.transform_graph,
                train_args=trainer_pb2.TrainArgs(num_steps=100),
                eval_args=trainer_pb2.EvalArgs(num_steps=50),
                custom_config={
                    # Pass the problem statement proto as a text proto. Required
                    # since custom_config must be JSON-serializable.
                    'problem_statement':
                    text_format.MessageToString(message=task.problem_statement,
                                                as_utf8=True),
                })
            pipeline.append(tuner)

        # Define a Trainer to train our model on the given task.
        trainer = tfx.Trainer(
            run_fn='examples.auto_trainer.run_fn'
            if use_keras else 'examples.auto_estimator_trainer.run_fn',
            custom_executor_spec=(executor_spec.ExecutorClassSpec(
                trainer_executor.GenericExecutor)),
            transformed_examples=autodata.transformed_examples,
            transform_graph=autodata.transform_graph,
            schema=autodata.schema,
            train_args=trainer_pb2.TrainArgs(num_steps=1000),
            eval_args=trainer_pb2.EvalArgs(num_steps=500),
            hyperparameters=(tuner.outputs.best_hyperparameters
                             if enable_tuning else None),
            custom_config={
                # Pass the problem statement proto as a text proto. Required
                # since custom_config must be JSON-serializable.
                'problem_statement':
                text_format.MessageToString(message=task.problem_statement,
                                            as_utf8=True),
            })

        pipeline.append(trainer)

        # Finally, call evaluate() on the workflow DAG outputs. This will
        # automatically append Evaluators to compute metrics from the given
        # SavedModel and 'eval' TF Examples.
        self.evaluate(pipeline,
                      examples=task.train_and_eval_examples,
                      model=trainer.outputs.model)