def __init__(self,
             dataset_dir: types.Channel,
             checkpoint_dir: types.Channel,
             encoding_dir: types.Channel,
             model_name: Text,
             train_config: Dict,
             encoding: Text = 'utf-8',
             end_token: Text = ""):
    trained_checkpoint_dir = external_input("TrainGPT2")
    sample_dir = external_input("TrainGPT2")
    tensorboard_dir = external_input("TrainGPT2")
    hyperparameter_dir = external_input("TrainGPT2")
    metric_dir = external_input("TrainGPT2")
    spec = TrainGPT2Spec(dataset_dir=dataset_dir,
                         checkpoint_dir=checkpoint_dir,
                         encoding_dir=encoding_dir,
                         model_name=model_name,
                         train_config=train_config,
                         encoding=encoding,
                         trained_checkpoint_dir=trained_checkpoint_dir,
                         sample_dir=sample_dir,
                         hyperparameter_dir=hyperparameter_dir,
                         metric_dir=metric_dir,
                         tensorboard_dir=tensorboard_dir,
                         end_token=end_token)
    super(TrainGPT2, self).__init__(spec=spec)
def __init__(self, url):
    if isinstance(url, str):
        url = [url]
    rss_feed = external_input("rss_feed")
    spec = NewsCrawlerSpec(url=url, rss_feed=rss_feed)
    super(NewsCrawler, self).__init__(spec=spec)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the Iris flowers pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(module_file=module_file,
                      examples=example_gen.outputs['examples'],
                      schema=infer_schema.outputs['schema'],
                      train_args=trainer_pb2.TrainArgs(num_steps=10000),
                      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'])

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats, trainer,
            model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
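# A minimal sketch (not part of the source) of how a pipeline factory like the
# one above is typically handed to an orchestrator. The BeamDagRunner import is
# the standard TFX local runner; every argument passed to _create_pipeline here
# is a hypothetical placeholder path, not a value taken from the source.
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

if __name__ == '__main__':
    BeamDagRunner().run(
        _create_pipeline(pipeline_name='iris',
                         pipeline_root='/tmp/iris/pipeline_root',
                         data_root='/tmp/iris/data',
                         module_file='/tmp/iris/iris_utils.py',
                         serving_model_dir='/tmp/iris/serving_model',
                         metadata_path='/tmp/iris/metadata.db',
                         direct_num_workers=0))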
def __init__(self, checkpoint_dir: Text):
    model_dir = external_input("CopyCheckpoint")
    spec = CopyCheckpointSpec(checkpoint_dir=checkpoint_dir,
                              model_dir=model_dir)
    super(CopyCheckpoint, self).__init__(spec=spec)
def generate_models(self, args):
    # Modified version of the Chicago Taxi Example pipeline
    # tfx/examples/chicago_taxi_pipeline/taxi_pipeline_beam.py
    root = tempfile.mkdtemp()
    pipeline_root = os.path.join(root, "pipeline")
    metadata_path = os.path.join(root, "metadata/metadata.db")
    module_file = os.path.join(
        os.path.dirname(__file__),
        "../../../examples/chicago_taxi_pipeline/taxi_utils.py")

    examples = external_input(os.path.dirname(self.dataset_path()))
    example_gen = components.ImportExampleGen(input=examples)
    statistics_gen = components.StatisticsGen(
        examples=example_gen.outputs["examples"])
    schema_gen = components.SchemaGen(
        statistics=statistics_gen.outputs["statistics"],
        infer_feature_shape=False)
    transform = components.Transform(
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        module_file=module_file)
    trainer = components.Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs["transformed_examples"],
        schema=schema_gen.outputs["schema"],
        transform_graph=transform.outputs["transform_graph"],
        train_args=trainer_pb2.TrainArgs(num_steps=100),
        eval_args=trainer_pb2.EvalArgs(num_steps=50))

    p = pipeline.Pipeline(
        pipeline_name="chicago_taxi_beam",
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, transform, trainer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))
    BeamDagRunner().run(p)

    def join_unique_subdir(path):
        dirs = os.listdir(path)
        if len(dirs) != 1:
            raise ValueError(
                "expecting there to be only one subdirectory in %s, but "
                "subdirectories were: %s" % (path, dirs))
        return os.path.join(path, dirs[0])

    trainer_output_dir = join_unique_subdir(
        os.path.join(pipeline_root, "Trainer/output"))
    eval_model_dir = join_unique_subdir(
        os.path.join(trainer_output_dir, "eval_model_dir"))
    serving_model_dir = join_unique_subdir(
        os.path.join(trainer_output_dir,
                     "serving_model_dir/export/chicago-taxi"))

    shutil.rmtree(self.trained_saved_model_path(), ignore_errors=True)
    shutil.rmtree(self.tfma_saved_model_path(), ignore_errors=True)
    shutil.copytree(serving_model_dir, self.trained_saved_model_path())
    shutil.copytree(eval_model_dir, self.tfma_saved_model_path())
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs['output'])

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs['output'],
                                      schema=infer_schema.outputs['output'])

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen, infer_schema, validate_stats],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the IMDB sentiment analysis pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            transform,
        ],
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def testTaxiPipelineNewStyleCompatibility(self):
    examples = external_input('/tmp/fake/path')
    example_gen = CsvExampleGen(input=examples)
    self.assertIs(example_gen.inputs['input'],
                  example_gen.inputs['input_base'])
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    self.assertIs(statistics_gen.inputs['examples'],
                  statistics_gen.inputs['input_data'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    self.assertIs(infer_schema.inputs['statistics'],
                  infer_schema.inputs['stats'])
    self.assertIs(infer_schema.outputs['schema'],
                  infer_schema.outputs['output'])
    validate_examples = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    self.assertIs(validate_examples.inputs['statistics'],
                  validate_examples.inputs['stats'])
    self.assertIs(validate_examples.outputs['anomalies'],
                  validate_examples.outputs['output'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file='/tmp/fake/module/file')
    self.assertIs(transform.inputs['examples'],
                  transform.inputs['input_data'])
    self.assertIs(transform.outputs['transform_graph'],
                  transform.outputs['transform_output'])
    trainer = Trainer(
        module_file='/tmp/fake/module/file',
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    self.assertIs(trainer.inputs['transform_graph'],
                  trainer.inputs['transform_output'])
    self.assertIs(trainer.outputs['model'], trainer.outputs['output'])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    self.assertIs(evaluator.inputs['model'],
                  evaluator.inputs['model_exports'])
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(model=trainer.outputs['output'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory='/fake/serving/dir')))
    self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
    self.assertIs(pusher.outputs['pushed_model'],
                  pusher.outputs['model_push'])
def testRun(self, mock_publisher):
    mock_publisher.return_value.publish_execution.return_value = {}

    example_gen = FileBasedExampleGen(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            avro_executor.Executor),
        input=external_input(self.avro_dir_path),
        input_config=self.input_config,
        output_config=self.output_config,
        instance_name='AvroExampleGen')

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    pipeline_root = os.path.join(output_data_dir, 'Test')
    tf.io.gfile.makedirs(pipeline_root)
    pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                            pipeline_root=pipeline_root,
                                            run_id='123')

    driver_args = data_types.DriverArgs(enable_cache=True)

    connection_config = metadata_store_pb2.ConnectionConfig()
    connection_config.sqlite.SetInParent()
    metadata_connection = metadata.Metadata(connection_config)

    launcher = in_process_component_launcher.InProcessComponentLauncher.create(
        component=example_gen,
        pipeline_info=pipeline_info,
        driver_args=driver_args,
        metadata_connection=metadata_connection,
        beam_pipeline_args=[],
        additional_pipeline_args={})
    self.assertEqual(
        launcher._component_info.component_type,
        '.'.join([FileBasedExampleGen.__module__,
                  FileBasedExampleGen.__name__]))

    launcher.launch()
    mock_publisher.return_value.publish_execution.assert_called_once()

    # Get output paths.
    component_id = example_gen.id
    output_path = os.path.join(pipeline_root, component_id, 'examples/1')
    examples = standard_artifacts.Examples()
    examples.uri = output_path
    examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

    # Check Avro example gen outputs.
    train_output_file = os.path.join(examples.uri, 'train',
                                     'data_tfrecord-00000-of-00001.gz')
    eval_output_file = os.path.join(examples.uri, 'eval',
                                    'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(tf.io.gfile.exists(train_output_file))
    self.assertTrue(tf.io.gfile.exists(eval_output_file))
    self.assertGreater(
        tf.io.gfile.GFile(train_output_file).size(),
        tf.io.gfile.GFile(eval_output_file).size())
def testIsComponent(self):
    resolver = ResolverNode(
        instance_name="test_resolver_name",
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver)
    self.assertFalse(compiler_utils.is_component(resolver))

    example_gen = CsvExampleGen(input=external_input("data_path"))
    self.assertTrue(compiler_utils.is_component(example_gen))
def testIsImporter(self):
    importer = ImporterNode(instance_name="import_schema",
                            source_uri="uri/to/schema",
                            artifact_type=standard_artifacts.Schema)
    self.assertTrue(compiler_utils.is_importer(importer))

    example_gen = CsvExampleGen(input=external_input("data_path"))
    self.assertFalse(compiler_utils.is_importer(example_gen))
def test_run(self, mock_publisher):
    mock_publisher.return_value.publish_execution.return_value = {}

    example_gen = FileBasedExampleGen(
        executor_class=avro_executor.Executor,
        input_base=external_input(self.avro_dir_path),
        input_config=self.input_config,
        output_config=self.output_config,
        name='AvroExampleGenComponent')

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    pipeline_root = os.path.join(output_data_dir, 'Test')
    tf.gfile.MakeDirs(pipeline_root)
    pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                            pipeline_root=pipeline_root,
                                            run_id='123')

    driver_args = data_types.DriverArgs(enable_cache=True)

    connection_config = metadata_store_pb2.ConnectionConfig()
    connection_config.sqlite.SetInParent()

    launcher = component_launcher.ComponentLauncher(
        component=example_gen,
        pipeline_info=pipeline_info,
        driver_args=driver_args,
        metadata_connection_config=connection_config,
        additional_pipeline_args={})
    self.assertEqual(
        launcher._component_info.component_type,
        '.'.join([FileBasedExampleGen.__module__,
                  FileBasedExampleGen.__name__]))

    launcher.launch()
    mock_publisher.return_value.publish_execution.assert_called_once()

    # Get output paths.
    component_id = '.'.join([example_gen.component_name, example_gen.name])
    output_path = os.path.join(pipeline_root, component_id, 'examples/1')
    train_examples = types.TfxArtifact(type_name='ExamplesPath', split='train')
    train_examples.uri = os.path.join(output_path, 'train')
    eval_examples = types.TfxArtifact(type_name='ExamplesPath', split='eval')
    eval_examples.uri = os.path.join(output_path, 'eval')

    # Check Avro example gen outputs.
    train_output_file = os.path.join(train_examples.uri,
                                     'data_tfrecord-00000-of-00001.gz')
    eval_output_file = os.path.join(eval_examples.uri,
                                    'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(tf.gfile.Exists(train_output_file))
    self.assertTrue(tf.gfile.Exists(eval_output_file))
    self.assertGreater(
        tf.gfile.GFile(train_output_file).size(),
        tf.gfile.GFile(eval_output_file).size())
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

    Args:
      pipeline_root: The root of the pipeline output.
      csv_input_location: The location of the input data directory.
      transform_module: The location of the transform module file.
      trainer_module: The location of the trainer module file.

    Returns:
      A list of TFX components that constitutes an end-to-end test pipeline.
    """
    examples = dsl_utils.external_input(csv_input_location)

    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(input_data=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=transform_module)
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module)
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
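# A minimal sketch (not part of the source) of how the component list returned
# by create_e2e_components might be wrapped into a runnable pipeline. The
# pipeline name, local paths, and choice of BeamDagRunner are hypothetical
# placeholders for illustration only.
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

_test_root = '/tmp/chicago_taxi_e2e_test'
test_pipeline = pipeline.Pipeline(
    pipeline_name='chicago_taxi_e2e_test',
    pipeline_root=_test_root,
    components=create_e2e_components(
        pipeline_root=_test_root,
        csv_input_location='/tmp/taxi_data',
        transform_module='/tmp/taxi_utils.py',
        trainer_module='/tmp/taxi_utils.py'),
    enable_cache=True)
BeamDagRunner().run(test_pipeline)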
def __init__(self, checkpoint_dir: types.Channel, model_path: types.Channel,
             train_config: Dict):
    export_dir = external_input("ExportToTFServing")
    spec = ExportToTFServingSpec(model_path=model_path,
                                 checkpoint_dir=checkpoint_dir,
                                 export_dir=export_dir,
                                 train_config=train_config)
    super(ExportToTFServing, self).__init__(spec=spec)
def __init__(self, url: Text, rss_feed: types.Channel = None):
    if not rss_feed:
        rss_feed = external_input("rss_feed")
    spec = NewsCrawlerSpec(url=url, rss_feed=rss_feed)
    super(NewsCrawler, self).__init__(spec=spec)
def __init__(self,
             end_token: Text,
             text_dir: Optional[Text] = None,
             encoding: Text = 'utf-8'):
    merged_text_dir = external_input("CreateDataset")
    spec = CreateMergedTextSpec(text_dir=text_dir,
                                end_token=end_token,
                                encoding=encoding,
                                merged_text_dir=merged_text_dir)
    super(CreateMergedText, self).__init__(spec=spec)
def __init__(self,
             merged_text_dir: types.Channel,
             encoding_dir: types.Channel,
             end_token: Text,
             encoding: Text = 'utf-8'):
    dataset_dir = external_input("CreateDataset")
    spec = CreateEncodedDatasetSpec(merged_text_dir=merged_text_dir,
                                    encoding_dir=encoding_dir,
                                    encoding=encoding,
                                    end_token=end_token,
                                    dataset_dir=dataset_dir)
    super(CreateEncodedDataset, self).__init__(spec=spec)
def __init__(self,
             text_path: Text,
             model_path: types.Channel,
             encoding: Text = 'utf-8',
             combine: int = 50000):
    dataset_path = external_input("CreateDataset")
    spec = CreateDatasetSpec(text_path=text_path,
                             model_path=model_path,
                             encoding=encoding,
                             combine=combine,
                             dataset_path=dataset_path)
    super(CreateDataset, self).__init__(spec=spec)
def create_pipeline():
    download_data = download_data_component()
    example = CsvExampleGen(external_input(download_data.outputs['data_uri']))
    xgb = xgb_component(data=example.outputs['examples'])

    component_instances = [
        download_data,
        # example,
        # xgb
    ]
    return component_instances
def testIsResolver(self):
    resv = resolver.Resolver(
        instance_name="test_resolver_name",
        strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver)
    self.assertTrue(compiler_utils.is_resolver(resv))

    resv = legacy_resolver_node.ResolverNode(
        instance_name="test_resolver_name",
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver)
    self.assertTrue(compiler_utils.is_resolver(resv))

    example_gen = CsvExampleGen(input=external_input("data_path"))
    self.assertFalse(compiler_utils.is_resolver(example_gen))
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     serving_model_dir: Text) -> pipeline.Pipeline:
    examples = external_input(data_root)
    input_split = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
    ])
    example_gen = ImportExampleGen(input_base=examples,
                                   input_config=input_split)
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=module_file)
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[evaluator_pb2.SingleSlicingSpec()]))
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)
    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'tfx_image': 'tensorflow/tfx:0.14.0rc1'
        },
        log_root='/var/tmp/tfx/logs',
    )
def __init__(self,
             merged_text_dir: types.Channel,
             model_dir: types.Channel,
             end_token: Text,
             encoding: Text = 'utf-8',
             text_token_size: int = 50000):
    encoding_dir = external_input("CreateDataset")
    spec = CreateEncodingSpec(model_dir=model_dir,
                              encoding=encoding,
                              end_token=end_token,
                              merged_text_dir=merged_text_dir,
                              text_token_size=text_token_size,
                              encoding_dir=encoding_dir)
    super(CreateEncoding, self).__init__(spec=spec)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    examples = external_input(data_root)
    input_split = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
    ])
    example_gen = ImportExampleGen(input=examples, input_config=input_split)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])
    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'],
                               eval_config=eval_config)
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen],
        enable_cache=True,
        additional_pipeline_args={},
    )
def testHasTaskDependency(self):
    example_gen = CsvExampleGen(input=external_input("data_path"))
    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
    p1 = pipeline.Pipeline(pipeline_name="fake_name",
                           pipeline_root="fake_root",
                           components=[example_gen, statistics_gen])
    self.assertFalse(compiler_utils.has_task_dependency(p1))

    a = EmptyComponent(name="a").with_id("a")
    statistics_gen.add_downstream_node(a)
    p2 = pipeline.Pipeline(pipeline_name="fake_name",
                           pipeline_root="fake_root",
                           components=[example_gen, statistics_gen, a])
    self.assertTrue(compiler_utils.has_task_dependency(p2))
def __init__(self,
             colnames: Text,
             ip: Text = "mongo",
             port: Text = "27017",
             username: Text = os.environ['MONGO_ROOT_USER'],
             password: Text = os.environ['MONGO_ROOT_PASSWORD'],
             dbname: Text = os.environ['MONGO_DATABASE_NAME']):
    output_dir = external_input("MongoExport")
    spec = MongoExportSpec(ip=ip,
                           port=port,
                           username=username,
                           password=password,
                           dbname=dbname,
                           colname=colnames,
                           output_dir=output_dir)
    super(MongoExport, self).__init__(spec=spec)
def _make_example_gen(self) -> base_component.BaseComponent:
    """Returns a TFX ExampleGen which produces the desired split."""
    splits = []
    for name, value in self._dataset_builder.info.splits.items():
        # Assume there is only one file per split.
        # Filename will be like `'fashion_mnist-test.tfrecord-00000-of-00001'`.
        assert len(value.filenames) == 1
        pattern = value.filenames[0]
        splits.append(example_gen_pb2.Input.Split(name=name, pattern=pattern))
    logging.info('Splits: %s', splits)
    input_config = example_gen_pb2.Input(splits=splits)
    return tfx.ImportExampleGen(
        input=external_input(self._dataset_builder.data_dir),
        input_config=input_config)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    beam_pipeline_args: Text) -> pipeline.Pipeline:
    """Custom component demo pipeline."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    hello = component.HelloComponent(
        input_data=example_gen.outputs['examples'], name=u'HelloWorld')

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=hello.outputs['output_data'])

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=[example_gen, hello, statistics_gen],
                             enable_cache=True,
                             beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the Iris flowers pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Hyperparameter tuning based on the tuner_fn in module_file.
    tuner = Tuner(examples=example_gen.outputs['examples'],
                  schema=infer_schema.outputs['schema'],
                  module_file=module_file)

    # TODO(jyzhao): support trainer and following components.
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            infer_schema,
            validate_stats,
            tuner,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )