def generate_models(self, args):
  # Modified version of Chicago Taxi Example pipeline
  # tfx/examples/chicago_taxi_pipeline/taxi_pipeline_beam.py
  root = tempfile.mkdtemp()
  pipeline_root = os.path.join(root, "pipeline")
  metadata_path = os.path.join(root, "metadata/metadata.db")
  module_file = os.path.join(
      os.path.dirname(__file__),
      "../../../examples/chicago_taxi_pipeline/taxi_utils.py")

  examples = external_input(os.path.dirname(self.dataset_path()))
  example_gen = components.ImportExampleGen(input=examples)
  statistics_gen = components.StatisticsGen(
      examples=example_gen.outputs["examples"])
  schema_gen = components.SchemaGen(
      statistics=statistics_gen.outputs["statistics"],
      infer_feature_shape=False)
  transform = components.Transform(
      examples=example_gen.outputs["examples"],
      schema=schema_gen.outputs["schema"],
      module_file=module_file)
  trainer = components.Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs["transformed_examples"],
      schema=schema_gen.outputs["schema"],
      transform_graph=transform.outputs["transform_graph"],
      train_args=trainer_pb2.TrainArgs(num_steps=100),
      eval_args=trainer_pb2.EvalArgs(num_steps=50))

  p = pipeline.Pipeline(
      pipeline_name="chicago_taxi_beam",
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, transform, trainer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))
  BeamDagRunner().run(p)

  def join_unique_subdir(path):
    dirs = os.listdir(path)
    if len(dirs) != 1:
      raise ValueError(
          "expecting there to be only one subdirectory in %s, but "
          "subdirectories were: %s" % (path, dirs))
    return os.path.join(path, dirs[0])

  trainer_output_dir = join_unique_subdir(
      os.path.join(pipeline_root, "Trainer/output"))
  eval_model_dir = join_unique_subdir(
      os.path.join(trainer_output_dir, "eval_model_dir"))
  serving_model_dir = join_unique_subdir(
      os.path.join(trainer_output_dir,
                   "serving_model_dir/export/chicago-taxi"))

  shutil.rmtree(self.trained_saved_model_path(), ignore_errors=True)
  shutil.rmtree(self.tfma_saved_model_path(), ignore_errors=True)
  shutil.copytree(serving_model_dir, self.trained_saved_model_path())
  shutil.copytree(eval_model_dir, self.tfma_saved_model_path())
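
# A minimal usage sketch, not from the original file: `ChicagoTaxiFixture` is
# a hypothetical name for the class that owns generate_models() above; only
# `generate_models`, `trained_saved_model_path` and `tfma_saved_model_path`
# come from the code itself. Loading the copied export back with
# `tf.saved_model.load` is a cheap sanity check that a valid SavedModel was
# produced and copied into place.
import tensorflow as tf

fixture = ChicagoTaxiFixture()  # Hypothetical owner of generate_models().
fixture.generate_models(args=None)  # `args` value is a placeholder.
# Verify the serving model round-trips through SavedModel loading.
loaded = tf.saved_model.load(fixture.trained_saved_model_path())
print(list(loaded.signatures.keys()))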
def create_pipeline_components(
    pipeline_root: Text,
    transform_module: Text,
    trainer_module: Text,
    bigquery_query: Text = '',
    csv_input_location: Text = '',
) -> List[base_node.BaseNode]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  if bool(bigquery_query) == bool(csv_input_location):
    raise ValueError(
        'Exactly one example gen is expected. '
        'Please provide either bigquery_query or csv_input_location.')

  if bigquery_query:
    example_gen = big_query_example_gen_component.BigQueryExampleGen(
        query=bigquery_query)
  else:
    example_gen = components.CsvExampleGen(input_base=csv_input_location)

  statistics_gen = components.StatisticsGen(
      examples=example_gen.outputs['examples'])
  schema_gen = components.SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  example_validator = components.ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])
  transform = components.Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=transform_module)
  latest_model_resolver = resolver.Resolver(
      strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
      model=channel.Channel(type=standard_artifacts.Model)).with_id(
          'Resolver.latest_model_resolver')
  trainer = components.Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      base_model=latest_model_resolver.outputs['model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  # Get the latest blessed model for model validation.
  model_resolver = resolver.Resolver(
      strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=channel.Channel(type=standard_artifacts.Model),
      model_blessing=channel.Channel(
          type=standard_artifacts.ModelBlessing)).with_id(
              'Resolver.latest_blessed_model_resolver')

  # Set the TFMA config for Model Evaluation and Validation.
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[
          tfma.MetricsSpec(
              metrics=[tfma.MetricConfig(class_name='ExampleCount')],
              thresholds={
                  'binary_accuracy':
                      tfma.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.5}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ])
  evaluator = components.Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  pusher = components.Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      latest_model_resolver, trainer, model_resolver, evaluator, pusher
  ]
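
# A hedged usage sketch: wiring the component list above into a runnable
# pipeline, mirroring the BeamDagRunner pattern used in generate_models().
# The temporary root, module-file paths and CSV location are placeholders,
# not values from the original source.
import os
import tempfile

from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

root = tempfile.mkdtemp()
pipeline_root = os.path.join(root, 'pipeline')
metadata_path = os.path.join(root, 'metadata/metadata.db')

test_components = create_pipeline_components(
    pipeline_root=pipeline_root,
    transform_module='taxi_utils.py',   # Placeholder module paths.
    trainer_module='taxi_utils.py',
    csv_input_location='/path/to/csv')  # Placeholder data location.

BeamDagRunner().run(
    pipeline.Pipeline(
        pipeline_name='chicago_taxi_test',
        pipeline_root=pipeline_root,
        components=test_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path)))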
def __init__(self,
             problem_statement: ps_pb2.ProblemStatement,
             transformed_examples: types.Channel,
             transform_graph: types.Channel,
             schema: types.Channel,
             train_steps: int,
             eval_steps: int,
             use_keras: bool = True,
             enable_tuning: bool = False,
             max_sequence_length: Optional[int] = None,
             instance_name: Optional[str] = None):
  """Constructs an AutoTrainer subpipeline.

  Args:
    problem_statement: ProblemStatement proto identifying the task.
    transformed_examples: A Channel of 'ExamplesPath' type produced from an
      upstream Transform component. The source of examples that are used in
      training and evaluation (required).
    transform_graph: An optional Channel of 'TransformPath' type, serving as
      the input transform graph if present.
    schema: An optional Channel of 'SchemaPath' type, serving as the schema
      of training and eval data.
    train_steps: Number of steps (batches) to train for.
    eval_steps: Number of steps (batches) to evaluate.
    use_keras: When `True`, uses Keras Models, otherwise uses Estimators.
    enable_tuning: When `True`, performs hyperparameter tuning with the
      built-in `tfx.Tuner` over a tuned search-space.
    max_sequence_length: For sequential prediction tasks. When > 0, the
      trainer will produce a model that makes sequential predictions of this
      desired length.
    instance_name: Optional unique instance name. Necessary iff multiple
      Tuner components are declared in the same pipeline.

  Raises:
    ValueError: When a required param is not supplied.
  """
  self._instance_name = instance_name
  self._tuner = None
  if enable_tuning:
    # Search over search space of model hyperparameters.
    self._tuner = tfx.Tuner(
        tuner_fn='nitroml.automl.autotrainer.lib.auto_trainer.tuner_fn',
        examples=transformed_examples,
        transform_graph=transform_graph,
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
        custom_config={
            # Pass the problem statement proto as a text proto. Required
            # since custom_config must be JSON-serializable.
            'problem_statement':
                text_format.MessageToString(
                    message=problem_statement, as_utf8=True),
        },
        instance_name=self.id)

  self._trainer = tfx.Trainer(
      run_fn='nitroml.automl.autotrainer.lib.auto_trainer.run_fn'
      if use_keras else
      'nitroml.automl.autotrainer.lib.auto_estimator_trainer.run_fn',
      custom_executor_spec=executor_spec.ExecutorClassSpec(
          trainer_executor.GenericExecutor),
      transformed_examples=transformed_examples,
      transform_graph=transform_graph,
      schema=schema,
      train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
      eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
      hyperparameters=self._tuner.outputs.best_hyperparameters
      if self._tuner else None,
      custom_config={
          # Pass the problem statement proto as a text proto. Required
          # since custom_config must be JSON-serializable.
          'problem_statement':
              text_format.MessageToString(
                  message=problem_statement, as_utf8=True),
          'sequence_length': max_sequence_length,
      },
      instance_name=self.id)
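
# A hedged construction sketch for the AutoTrainer subpipeline above. The
# `task` and `autodata` variables are illustrative, following the benchmark
# code below; the step counts are placeholders. How the resulting _tuner and
# _trainer are appended to a pipeline depends on the surrounding NitroML API
# and is not shown here.
auto_trainer = AutoTrainer(
    problem_statement=task.problem_statement,
    transformed_examples=autodata.transformed_examples,
    transform_graph=autodata.transform_graph,
    schema=autodata.schema,
    train_steps=1000,
    eval_steps=500,
    use_keras=True,
    enable_tuning=True)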
def benchmark(self,
              mock_data: bool = False,
              data_dir: str = None,
              use_keras: bool = True,
              enable_tuning: bool = True):
  for i, task in enumerate(
      nitroml.suites.OpenMLCC18(data_dir, mock_data=mock_data)):
    if not mock_data and i not in range(20, 40):
      # Use only 20 of the datasets for now.
      # TODO(nikhilmehta): Create subbenchmarks for all 72 tasks.
      # Kubeflow throws a "max workflow size" error when the pipeline
      # contains too many components.
      # Track issue: https://github.com/kubeflow/pipelines/issues/4170
      continue

    with self.sub_benchmark(task.name):
      autodata = nitroml.autodata.AutoData(
          task.problem_statement,
          examples=task.train_and_eval_examples,
          preprocessor=nitroml.autodata.BasicPreprocessor())

      pipeline = task.components + autodata.components

      if enable_tuning:
        # Search over search space of model hyperparameters.
        tuner = tfx.Tuner(
            tuner_fn='examples.auto_trainer.tuner_fn',
            examples=autodata.transformed_examples,
            transform_graph=autodata.transform_graph,
            train_args=trainer_pb2.TrainArgs(num_steps=10),
            eval_args=trainer_pb2.EvalArgs(num_steps=5),
            custom_config={
                # Pass the problem statement proto as a text proto. Required
                # since custom_config must be JSON-serializable.
                'problem_statement':
                    text_format.MessageToString(
                        message=task.problem_statement, as_utf8=True),
            })
        pipeline.append(tuner)

      # Define a Trainer to train our model on the given task.
      trainer = tfx.Trainer(
          run_fn='examples.auto_trainer.run_fn'
          if use_keras else 'examples.auto_estimator_trainer.run_fn',
          custom_executor_spec=executor_spec.ExecutorClassSpec(
              trainer_executor.GenericExecutor),
          transformed_examples=autodata.transformed_examples,
          transform_graph=autodata.transform_graph,
          schema=autodata.schema,
          train_args=trainer_pb2.TrainArgs(num_steps=10),
          eval_args=trainer_pb2.EvalArgs(num_steps=10),
          hyperparameters=(tuner.outputs.best_hyperparameters
                           if enable_tuning else None),
          custom_config={
              # Pass the problem statement proto as a text proto. Required
              # since custom_config must be JSON-serializable.
              'problem_statement':
                  text_format.MessageToString(
                      message=task.problem_statement, as_utf8=True),
          })
      pipeline.append(trainer)

      # Finally, call evaluate() on the workflow DAG outputs. This will
      # automatically append Evaluators to compute metrics from the given
      # SavedModel and 'eval' TF Examples.
      self.evaluate(
          pipeline,
          examples=task.train_and_eval_examples,
          model=trainer.outputs.model)
def benchmark(self,
              algorithm: str = None,
              mock_data: bool = False,
              data_dir: str = None):
  # TODO(nikhilmehta): Extend this to multiple test datasets using
  # subbenchmarks.

  train_task_names = frozenset([
      'OpenML.connect4', 'OpenML.creditapproval', 'OpenML.creditg',
      'OpenML.cylinderbands', 'OpenML.diabetes'
  ])
  test_task_names = frozenset(['OpenML.dressessales'])
  train_steps = 1000

  if mock_data:
    train_task_names = {'OpenML.mockdata_1'}
    test_task_names = {'OpenML.mockdata_2'}
    train_steps = 10

  train_tasks = []
  test_tasks = []
  for task in nitroml.suites.OpenMLCC18(data_dir, mock_data=mock_data):
    if task.name in train_task_names:
      train_tasks.append(task)
    if task.name in test_task_names:
      test_tasks.append(task)

  pipeline = []
  meta_train_data = {}
  train_autodata_list = []
  for task in train_tasks:
    # Create the autodata instance for this task, which creates the
    # Transform, StatisticsGen and SchemaGen components.
    autodata = nitroml.autodata.AutoData(
        task.problem_statement,
        examples=task.train_and_eval_examples,
        preprocessor=nitroml.autodata.BasicPreprocessor(),
        instance_name=f'train.{task.name}')

    # Add a tuner component for each training dataset that finds the
    # optimum hyperparameters.
    tuner = tuner_component.AugmentedTuner(
        tuner_fn='examples.auto_trainer.tuner_fn',
        examples=autodata.transformed_examples,
        transform_graph=autodata.transform_graph,
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=1),
        custom_config={
            # Pass the problem statement proto as a text proto. Required
            # since custom_config must be JSON-serializable.
            'problem_statement':
                text_format.MessageToString(
                    message=task.problem_statement, as_utf8=True),
        },
        instance_name=f'train.{task.name}')
    pipeline += task.components + autodata.components + [tuner]

    train_autodata_list.append(autodata)
    meta_train_data[f'hparams_train_{len(train_autodata_list)}'] = (
        tuner.outputs.best_hyperparameters)

  # Construct a MetaLearningWrapper that creates the metalearning
  # subpipeline.
  metalearner_helper = metalearning_wrapper.MetaLearningWrapper(
      train_autodata_list=train_autodata_list,
      meta_train_data=meta_train_data,
      algorithm=algorithm)
  pipeline += metalearner_helper.pipeline
  self.create_subpipeline_shared_with_subbenchmarks(pipeline)

  for task in test_tasks:
    with self.sub_benchmark(task.name):
      # Create the autodata instance for the test task.
      autodata = nitroml.autodata.AutoData(
          task.problem_statement,
          examples=task.train_and_eval_examples,
          preprocessor=nitroml.autodata.BasicPreprocessor(),
          instance_name=f'test.{task.name}')

      test_meta_components, best_hparams = (
          metalearner_helper.create_test_components(
              autodata, tuner_steps=train_steps))

      # Create a trainer component that utilizes the recommended
      # hyperparameters from the metalearning subpipeline.
      trainer = tfx.Trainer(
          run_fn='examples.auto_trainer.run_fn',
          custom_executor_spec=executor_spec.ExecutorClassSpec(
              trainer_executor.GenericExecutor),
          transformed_examples=autodata.transformed_examples,
          transform_graph=autodata.transform_graph,
          schema=autodata.schema,
          train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
          eval_args=trainer_pb2.EvalArgs(num_steps=1),
          hyperparameters=best_hparams,
          custom_config={
              # Pass the problem statement proto as a text proto. Required
              # since custom_config must be JSON-serializable.
              'problem_statement':
                  text_format.MessageToString(
                      message=task.problem_statement, as_utf8=True),
          },
          instance_name=f'test.{task.name}')
      task_pipeline = (task.components + autodata.components +
                       test_meta_components + [trainer])

      # Finally, call evaluate() on the workflow DAG outputs. This will
      # automatically append Evaluators to compute metrics from the given
      # SavedModel and 'eval' TF Examples.
      self.evaluate(
          task_pipeline,
          examples=task.train_and_eval_examples,
          model=trainer.outputs.model)
def benchmark(self,
              data_dir: str = None,
              use_keras: bool = True,
              enable_tuning: bool = True):
  # Use TFDSTask to define the task for the titanic dataset.
  task = nitroml.tasks.TFDSTask(tfds.builder('titanic', data_dir=data_dir))

  autodata = nitroml.autodata.AutoData(
      task.problem_statement,
      examples=task.train_and_eval_examples,
      preprocessor=nitroml.autodata.BasicPreprocessor())

  pipeline = task.components + autodata.components

  if enable_tuning:
    # Search over search space of model hyperparameters.
    tuner = tfx.Tuner(
        tuner_fn='examples.auto_trainer.tuner_fn',
        examples=autodata.transformed_examples,
        transform_graph=autodata.transform_graph,
        train_args=trainer_pb2.TrainArgs(num_steps=100),
        eval_args=trainer_pb2.EvalArgs(num_steps=50),
        custom_config={
            # Pass the problem statement proto as a text proto. Required
            # since custom_config must be JSON-serializable.
            'problem_statement':
                text_format.MessageToString(
                    message=task.problem_statement, as_utf8=True),
        })
    pipeline.append(tuner)

  # Define a Trainer to train our model on the given task.
  trainer = tfx.Trainer(
      run_fn='examples.auto_trainer.run_fn'
      if use_keras else 'examples.auto_estimator_trainer.run_fn',
      custom_executor_spec=executor_spec.ExecutorClassSpec(
          trainer_executor.GenericExecutor),
      transformed_examples=autodata.transformed_examples,
      transform_graph=autodata.transform_graph,
      schema=autodata.schema,
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      eval_args=trainer_pb2.EvalArgs(num_steps=500),
      hyperparameters=(tuner.outputs.best_hyperparameters
                       if enable_tuning else None),
      custom_config={
          # Pass the problem statement proto as a text proto. Required
          # since custom_config must be JSON-serializable.
          'problem_statement':
              text_format.MessageToString(
                  message=task.problem_statement, as_utf8=True),
      })
  pipeline.append(trainer)

  # Finally, call evaluate() on the workflow DAG outputs. This will
  # automatically append Evaluators to compute metrics from the given
  # SavedModel and 'eval' TF Examples.
  self.evaluate(
      pipeline,
      examples=task.train_and_eval_examples,
      model=trainer.outputs.model)
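
# A hedged runner sketch for the benchmark definitions above. NitroML
# benchmarks are typically defined as methods on a nitroml.Benchmark subclass
# and executed as a main program; the `nitroml.main()` entry point follows
# the pattern in the NitroML examples and is an assumption here, not shown in
# the code above.
if __name__ == '__main__':
  # Discovers Benchmark subclasses in this module and runs their
  # benchmark() methods.
  nitroml.main()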