def testBuildFileBasedExampleGen(self):
    """Checks StepBuilder output for a file-based (CSV) example gen node.

    Builds a CsvExampleGen step and compares the produced component def,
    task spec, and deployment config against golden protos in test data.
    """
    pipeline_args = ['runner=DataflowRunner']
    csv_gen = components.CsvExampleGen(input_base='path/to/data/root')
    deployment_config = pipeline_pb2.PipelineDeploymentConfig()
    component_defs = {}
    builder = step_builder.StepBuilder(
        node=csv_gen,
        image='gcr.io/tensorflow/tfx:latest',
        image_cmds=_TEST_CMDS,
        beam_pipeline_args=pipeline_args,
        deployment_config=deployment_config,
        component_defs=component_defs)
    step_spec = self._sole(builder.build())
    component_def = self._sole(component_defs)

    # Each golden file pairs with the empty proto type it parses into and
    # the actual value it must match.
    expectations = (
        ('expected_csv_example_gen_component.pbtxt',
         pipeline_pb2.ComponentSpec(), component_def),
        ('expected_csv_example_gen_task.pbtxt',
         pipeline_pb2.PipelineTaskSpec(), step_spec),
        ('expected_csv_example_gen_executor.pbtxt',
         pipeline_pb2.PipelineDeploymentConfig(), deployment_config),
    )
    for golden_file, empty_proto, actual in expectations:
        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(golden_file, empty_proto),
            actual)
def create_e2e_components(
    csv_input_location: str,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

    Because we don't need to run whole pipeline, we will make a very short
    toy pipeline: ExampleGen -> StatisticsGen -> SchemaGen.

    Args:
      csv_input_location: The location of the input data directory.

    Returns:
      A list of TFX components that constitutes an end-to-end test pipeline.
    """
    csv_example_gen = components.CsvExampleGen(input_base=csv_input_location)
    stats_gen = components.StatisticsGen(
        examples=csv_example_gen.outputs['examples'])
    schema_inferrer = components.SchemaGen(
        statistics=stats_gen.outputs['statistics'],
        infer_feature_shape=False)
    return [csv_example_gen, stats_gen, schema_inferrer]
def __init__(self,
             name: str,
             root_dir: str,
             dataset_name: str,
             task_type: str,
             label_key: str,
             num_classes: int = 0,
             description: str = ''):
    """Initializes the benchmark task and its CSV-based example gen.

    Args:
      name: Name of this task.
      root_dir: Root directory under which the dataset's data lives; the
        CsvExampleGen reads from `<root_dir>/<dataset_name>/data`.
      dataset_name: Name of the dataset directory for this task.
      task_type: Type of the task; validated by `self._verify_task`.
      label_key: Key of the label feature.
      num_classes: Number of classes (0 by default, e.g. for regression).
      description: Optional human-readable description of the task.

    Raises:
      ValueError: If `task_type` is not a supported task type.
    """
    # Validate before assigning any state so an invalid task type leaves
    # no partially-initialized object behind.
    if not self._verify_task(task_type):
        raise ValueError('Invalid task type')
    self._name = name
    self._dataset_name = dataset_name
    self._type = task_type
    self._num_classes = num_classes
    self._description = description
    self._label_key = label_key
    # TODO(nikhilmehta, weill): Subbenchmarking also appends task.name
    # to the component_id. Fix this when variable scoping is introduced.
    # NOTE: `f'{dataset_name}'` was a redundant f-string around a plain
    # variable; replaced with the variable itself (identical path).
    self._example_gen = tfx.CsvExampleGen(
        input_base=os.path.join(root_dir, dataset_name, 'data'),
        instance_name=self.name)
def create_pipeline_components(
    pipeline_root: Text,
    transform_module: Text,
    trainer_module: Text,
    bigquery_query: Text = '',
    csv_input_location: Text = '',
) -> List[base_node.BaseNode]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

    Args:
      pipeline_root: The root of the pipeline output.
      transform_module: The location of the transform module file.
      trainer_module: The location of the trainer module file.
      bigquery_query: The query to get input data from BigQuery. If not
        empty, BigQueryExampleGen will be used.
      csv_input_location: The location of the input data directory.

    Returns:
      A list of TFX components that constitutes an end-to-end test pipeline.

    Raises:
      ValueError: If not exactly one of `bigquery_query` and
        `csv_input_location` is provided.
    """
    if bool(bigquery_query) == bool(csv_input_location):
        # BUG FIX: the two message parts were previously separated by a
        # comma, passing TWO arguments to ValueError so the message rendered
        # as a tuple. Implicit string concatenation yields one message.
        raise ValueError(
            'Exactly one example gen is expected. '
            'Please provide either bigquery_query or csv_input_location.')

    if bigquery_query:
        example_gen = big_query_example_gen_component.BigQueryExampleGen(
            query=bigquery_query)
    else:
        example_gen = components.CsvExampleGen(input_base=csv_input_location)

    statistics_gen = components.StatisticsGen(
        examples=example_gen.outputs['examples'])
    schema_gen = components.SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    example_validator = components.ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = components.Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=transform_module)
    # Resolve the most recent previously-trained model to warm-start from.
    latest_model_resolver = resolver.Resolver(
        strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
        model=channel.Channel(type=standard_artifacts.Model)).with_id(
            'Resolver.latest_model_resolver')
    trainer = components.Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Get the latest blessed model for model validation.
    model_resolver = resolver.Resolver(
        strategy_class=latest_blessed_model_resolver
        .LatestBlessedModelResolver,
        model=channel.Channel(type=standard_artifacts.Model),
        model_blessing=channel.Channel(
            type=standard_artifacts.ModelBlessing)).with_id(
                'Resolver.latest_blessed_model_resolver')
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'binary_accuracy':
                        tfma.MetricThreshold(
                            value_threshold=tfma.GenericValueThreshold(
                                lower_bound={'value': 0.5}),
                            change_threshold=tfma.GenericChangeThreshold(
                                direction=tfma.MetricDirection
                                .HIGHER_IS_BETTER,
                                absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = components.Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)
    pusher = components.Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))
    return [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        latest_model_resolver, trainer, model_resolver, evaluator, pusher
    ]