def pre_execution(
    self,
    input_dict: Dict[Text, types.Channel],
    output_dict: Dict[Text, types.Channel],
    exec_properties: Dict[Text, Any],
    driver_args: data_types.DriverArgs,
    pipeline_info: data_types.PipelineInfo,
    component_info: data_types.ComponentInfo,
) -> data_types.ExecutionDecision:
  input_artifacts = channel_utils.unwrap_channel_dict(input_dict)
  output_artifacts = channel_utils.unwrap_channel_dict(output_dict)
  # Generate missing output artifact URIs.
  for name, artifacts in output_artifacts.items():
    for idx, artifact in enumerate(artifacts):
      if not artifact.uri:
        suffix = str(idx + 1) if idx > 0 else ''
        artifact.uri = os.path.join(
            pipeline_info.pipeline_root,
            'artifacts',
            name + suffix,
            'data',
        )
        fileio.makedirs(os.path.dirname(artifact.uri))
  # Hard-coded execution_id (123) and use_cached_results (False) for this
  # stub driver.
  return data_types.ExecutionDecision(input_artifacts, output_artifacts,
                                      exec_properties, 123, False)
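# A minimal sketch of the URI scheme the driver above generates (pipeline
# root and output-key names are hypothetical): the first artifact under an
# output key gets no suffix, subsequent ones get '2', '3', and so on.
def _example_output_uris(pipeline_root, name, num_artifacts):
  uris = []
  for idx in range(num_artifacts):
    suffix = str(idx + 1) if idx > 0 else ''
    uris.append(
        os.path.join(pipeline_root, 'artifacts', name + suffix, 'data'))
  return uris

# _example_output_uris('/root', 'stats', 2) ->
#     ['/root/artifacts/stats/data', '/root/artifacts/stats2/data']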
def testRunStatisticsGen(self):
  # Prepare the paths.
  test_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'components', 'testdata')
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  statistics_split_names_path = os.path.join(output_data_dir,
                                             'statistics.properties',
                                             'split_names')
  fileio.makedirs(output_data_dir)

  # Run StatisticsGen.
  run_component.run_component(
      full_component_class_name='tfx.components.StatisticsGen',
      examples_uri=os.path.join(test_data_dir, 'csv_example_gen'),
      examples_split_names=artifact_utils.encode_split_names(
          ['train', 'eval']),
      statistics_path=output_data_dir,
      statistics_split_names_path=statistics_split_names_path,
  )

  # Check the statistics_gen outputs.
  self.assertTrue(
      fileio.exists(
          os.path.join(output_data_dir, 'Split-train', 'FeatureStats.pb')))
  self.assertTrue(
      fileio.exists(
          os.path.join(output_data_dir, 'Split-eval', 'FeatureStats.pb')))
  self.assertTrue(os.path.exists(statistics_split_names_path))
  self.assertEqual(
      pathlib.Path(statistics_split_names_path).read_text(),
      '["train", "eval"]')
def setUp(self):
  super().setUp()
  self._test_dir = tempfile.mkdtemp()

  self._executor_invocation = pipeline_pb2.ExecutorInput()
  self._executor_invocation.outputs.output_file = _TEST_OUTPUT_METADATA_JSON
  self._executor_invocation.inputs.parameters[
      'input_base_uri'].string_value = _TEST_INPUT_DIR
  self._executor_invocation.inputs.parameters[
      'input_config'].string_value = json_format.MessageToJson(
          example_gen_pb2.Input(splits=[
              example_gen_pb2.Input.Split(
                  name='s1', pattern='span{SPAN}/split1/*'),
              example_gen_pb2.Input.Split(
                  name='s2', pattern='span{SPAN}/split2/*')
          ]))
  self._executor_invocation.outputs.artifacts['examples'].artifacts.append(
      pipeline_pb2.RuntimeArtifact(
          type=pipeline_pb2.ArtifactTypeSchema(
              instance_schema=compiler_utils.get_artifact_schema(
                  standard_artifacts.Examples()))))

  self._executor_invocation_from_file = fileio.open(
      os.path.join(
          os.path.dirname(__file__), 'testdata',
          'executor_invocation.json'), 'r').read()
  self._expected_result_from_file = fileio.open(
      os.path.join(
          os.path.dirname(__file__), 'testdata',
          'expected_output_metadata.json'), 'r').read()

  self._olddir = os.getcwd()
  os.chdir(self._test_dir)
  fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
  fileio.makedirs(os.path.dirname(_TEST_INPUT_DIR))
def testDumpUiMetadata(self):
  trainer = Trainer(
      examples=Channel(type=standard_artifacts.Examples),
      module_file='module_file',
      train_args=trainer_pb2.TrainArgs(splits=['train'], num_steps=100),
      eval_args=trainer_pb2.EvalArgs(splits=['eval'], num_steps=50))

  model_run = standard_artifacts.ModelRun()
  model_run.uri = 'model_run_uri'
  exec_info = data_types.ExecutionInfo(
      input_dict={},
      output_dict={'model_run': [model_run]},
      exec_properties={},
      execution_id='id')

  ui_metadata_path = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName, 'json')
  fileio.makedirs(os.path.dirname(ui_metadata_path))

  container_entrypoint._dump_ui_metadata(trainer, exec_info,
                                         ui_metadata_path)
  with open(ui_metadata_path) as f:
    ui_metadata = json.load(f)

  self.assertEqual('tensorboard', ui_metadata['outputs'][-1]['type'])
  self.assertEqual('model_run_uri', ui_metadata['outputs'][-1]['source'])
def _copy_and_replace_placeholder_dir(
    src: Text, dst: Text, ignore_paths: Set[Text],
    replace_dict: Dict[Pattern[Text], Text]) -> None:
  """Copies a directory to the destination path, replacing placeholders."""
  if not os.path.isdir(dst):
    if os.path.exists(dst):
      raise RuntimeError(
          'Cannot copy template directory {}. A file already exists at the '
          'destination.'.format(src))
    fileio.makedirs(dst)
  for f in os.listdir(src):
    src_path = os.path.join(src, f)
    dst_path = os.path.join(dst, f)
    if src_path in ignore_paths:
      continue

    if os.path.isdir(src_path):
      if f.startswith('_'):  # Excludes __pycache__ and other private folders.
        continue
      _copy_and_replace_placeholder_dir(src_path, dst_path, ignore_paths,
                                        replace_dict)
    else:  # A file.
      if f.endswith('.pyc'):  # Excludes .pyc files.
        continue
      _copy_and_replace_placeholder_file(src_path, dst_path, replace_dict)
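# A sketch of the argument shapes _copy_and_replace_placeholder_dir expects.
# The patterns and replacements below are hypothetical stand-ins, not the
# real _IMPORT_FROM_PACKAGE / _PLACEHOLDER_PIPELINE_NAME values.
import re

_example_replace_dict = {
    re.compile(r'from tfx\.experimental\.templates\.taxi'): 'from models',
    re.compile(r'\{\{PIPELINE_NAME\}\}'): 'my_pipeline',
}
_example_ignore_paths = {'/templates/taxi/e2e_tests'}
# _copy_and_replace_placeholder_dir('/templates/taxi', '/workspace',
#                                   _example_ignore_paths,
#                                   _example_replace_dict)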
def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
  """Creates/updates pipeline folder in the handler directory.

  Args:
    pipeline_args: Pipeline details obtained from DSL.
  """
  # Path to pipeline folder in Airflow.
  handler_pipeline_path = os.path.join(self._handler_home_dir,
                                       pipeline_args[labels.PIPELINE_NAME],
                                       '')

  # If updating pipeline, first delete pipeline directory.
  if fileio.exists(handler_pipeline_path):
    io_utils.delete_dir(handler_pipeline_path)

  # Dump pipeline_args to handler pipeline folder as json.
  fileio.makedirs(handler_pipeline_path)
  with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
            'w') as f:
    json.dump(pipeline_args, f)

  # Copy dsl to pipeline folder.
  pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
  io_utils.copy_file(
      pipeline_dsl_path,
      os.path.join(handler_pipeline_path,
                   os.path.basename(pipeline_dsl_path)))
def copy_template(flags_dict: Dict[str, Any]) -> None:
  """Copies template flags_dict["model"] to flags_dict["dest_dir"].

  Copies all *.py and README files in the specified template, and replaces
  the content of the files.

  Args:
    flags_dict: Should have pipeline_name, model and dest_dir.
  """
  model = flags_dict[labels.MODEL]
  pipeline_name = _sanitize_pipeline_name(flags_dict[labels.PIPELINE_NAME])
  template_dir = os.path.join(_templates_src_dir(), model)
  if not os.path.isdir(template_dir):
    raise ValueError('Model {} does not exist.'.format(model))
  destination_dir = flags_dict[labels.DESTINATION_PATH]

  ignore_paths = {
      os.path.join(template_dir, x)
      for x in _IGNORE_FILE_PATHS.get(model, [])
  }
  replace_dict = {
      _IMPORT_FROM_PACKAGE: _IMPORT_FROM_LOCAL_DIR,
      _PLACEHOLDER_PIPELINE_NAME: pipeline_name,
  }
  _copy_and_replace_placeholder_dir(template_dir, destination_dir,
                                    ignore_paths, replace_dict)
  for additional_file in _ADDITIONAL_FILE_PATHS.get(model, []):
    dst_path = os.path.join(destination_dir, additional_file.dst)
    fileio.makedirs(os.path.dirname(dst_path))

    if additional_file.src.startswith(('http://', 'https://')):
      urllib.request.urlretrieve(additional_file.src, dst_path)
    else:
      src_path = os.path.join(_tfx_src_dir(), additional_file.src)
      fileio.copy(src_path, dst_path)
def _prepare_output_paths(artifact: types.Artifact):
  """Creates output directories for an output artifact."""
  if fileio.exists(artifact.uri):
    msg = 'Output artifact uri %s already exists' % artifact.uri
    absl.logging.warning(msg)
    # TODO(b/158689199): We currently simply return as a short-term workaround
    # to unblock execution retries. A comprehensive solution to guarantee
    # idempotent executions is needed.
    return

  # TODO(b/147242148): Introduce principled artifact structure (directory
  # or file) definition.
  if isinstance(artifact, types.ValueArtifact):
    artifact_dir = os.path.dirname(artifact.uri)
  else:
    artifact_dir = artifact.uri

  # TODO(zhitaoli): Consider refactoring this out into something
  # which can handle permission bits.
  absl.logging.debug('Creating output artifact uri %s as directory',
                     artifact_dir)
  fileio.makedirs(artifact_dir)
  # TODO(b/147242148): Avoid special-casing the "split_names" property.
  if artifact.type.PROPERTIES and 'split_names' in artifact.type.PROPERTIES:
    split_names = artifact_utils.decode_split_names(artifact.split_names)
    for split in split_names:
      split_dir = os.path.join(artifact.uri, split)
      absl.logging.debug('Creating output split %s as directory', split_dir)
      fileio.makedirs(split_dir)
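# A usage sketch for _prepare_output_paths; the URI is hypothetical, and the
# snippet assumes module-level imports of os, tempfile and standard_artifacts.
# For an artifact type that declares a 'split_names' property, one
# subdirectory per split is created under the artifact URI; for a
# ValueArtifact only the parent directory would be created.
def _prepare_output_paths_example():
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(tempfile.mkdtemp(), 'examples', '1')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
  _prepare_output_paths(examples)
  # Creates .../examples/1/, .../examples/1/train/ and .../examples/1/eval/.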
def testDumpUiMetadata(self):
  trainer = pipeline_pb2.PipelineNode()
  trainer.node_info.type.name = 'tfx.components.trainer.component.Trainer'
  model_run_out_spec = pipeline_pb2.OutputSpec(
      artifact_spec=pipeline_pb2.OutputSpec.ArtifactSpec(
          type=metadata_store_pb2.ArtifactType(
              name=standard_artifacts.ModelRun.TYPE_NAME)))
  trainer.outputs.outputs['model_run'].CopyFrom(model_run_out_spec)

  model_run = standard_artifacts.ModelRun()
  model_run.uri = 'model_run_uri'
  exec_info = data_types.ExecutionInfo(
      input_dict={},
      output_dict={'model_run': [model_run]},
      exec_properties={},
      execution_id='id')

  ui_metadata_path = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName, 'json')
  fileio.makedirs(os.path.dirname(ui_metadata_path))

  container_entrypoint._dump_ui_metadata(trainer, exec_info,
                                         ui_metadata_path)
  with open(ui_metadata_path) as f:
    ui_metadata = json.load(f)

  self.assertEqual('tensorboard', ui_metadata['outputs'][-1]['type'])
  self.assertEqual('model_run_uri', ui_metadata['outputs'][-1]['source'])
def testPipelineSchemaSuccessfulRun(self):
  # First create a pipeline.
  flags_dict = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_DSL_PATH: self.pipeline_path
  }
  handler = beam_handler.BeamHandler(flags_dict)
  handler.create_pipeline()

  flags_dict = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_NAME: self.pipeline_name,
  }
  handler = beam_handler.BeamHandler(flags_dict)

  # Create fake schema in pipeline root.
  component_output_dir = os.path.join(self.pipeline_root, 'SchemaGen')
  schema_path = base_driver._generate_output_uri(  # pylint: disable=protected-access
      component_output_dir, 'schema', 3)
  fileio.makedirs(schema_path)
  with open(os.path.join(schema_path, 'schema.pbtxt'), 'w') as f:
    f.write('SCHEMA')

  with self.captureWritesToStream(sys.stdout) as captured:
    handler.get_schema()
    curr_dir_path = os.path.abspath('schema.pbtxt')
    self.assertIn('Path to schema: {}'.format(curr_dir_path),
                  captured.contents())
    self.assertIn(
        '*********SCHEMA FOR {}**********'.format(
            self.pipeline_name.upper()), captured.contents())
    self.assertTrue(fileio.exists(curr_dir_path))
def setUp(self):
  super(CliKubeflowEndToEndTest, self).setUp()

  # List of packages installed.
  self._pip_list = pip_utils.get_package_names()

  # Check if Kubeflow is installed before running E2E tests.
  if labels.KUBEFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Kubeflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII
  # as encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
      'testdata')
  self._testdata_dir_updated = self.tmp_dir
  fileio.makedirs(self._testdata_dir_updated)

  self._pipeline_name = ('cli-kubeflow-e2e-test-' +
                         test_utils.generate_random_id())
  absl.logging.info('Pipeline name is %s' % self._pipeline_name)
  self._pipeline_name_v2 = self._pipeline_name + '_v2'

  orig_pipeline_path = os.path.join(self._testdata_dir,
                                    'test_pipeline_kubeflow_1.py')
  self._pipeline_path = os.path.join(self._testdata_dir_updated,
                                     'test_pipeline_kubeflow_1.py')
  self._pipeline_path_v2 = os.path.join(self._testdata_dir_updated,
                                        'test_pipeline_kubeflow_2.py')

  test_utils.copy_and_change_pipeline_name(
      orig_pipeline_path, self._pipeline_path,
      'chicago_taxi_pipeline_kubeflow', self._pipeline_name)
  self.assertTrue(fileio.exists(self._pipeline_path))
  test_utils.copy_and_change_pipeline_name(
      orig_pipeline_path, self._pipeline_path_v2,
      'chicago_taxi_pipeline_kubeflow', self._pipeline_name_v2)
  self.assertTrue(fileio.exists(self._pipeline_path_v2))

  # Endpoint URL.
  self._endpoint = self._get_endpoint(
      subprocess.check_output(
          'kubectl describe configmap inverse-proxy-config -n kubeflow'
          .split()))
  absl.logging.info('ENDPOINT: ' + self._endpoint)

  self._pipeline_package_path = '{}.tar.gz'.format(self._pipeline_name)

  try:
    # Create a kfp client for cleanup after running commands.
    self._client = kfp.Client(host=self._endpoint)
  except kfp_server_api.rest.ApiException as err:
    absl.logging.info(err)
def testRunSchemaGen(self):
  # Prepare the paths.
  test_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'components', 'testdata')
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  fileio.makedirs(output_data_dir)

  # Run SchemaGen.
  run_component.run_component(
      full_component_class_name='tfx.components.SchemaGen',
      # Testing that we can specify input artifact paths.
      statistics_path=os.path.join(test_data_dir, 'statistics_gen'),
      # Testing that we can specify artifact properties.
      statistics_split_names=artifact_utils.encode_split_names(
          ['train', 'eval']),
      # Testing that we can pass arguments for non-string properties.
      infer_feature_shape='1',
      # Testing that we can specify output artifact paths.
      schema_path=output_data_dir,
  )

  # Check the schema_gen outputs.
  self.assertTrue(
      fileio.exists(os.path.join(output_data_dir, 'schema.pbtxt')))
def setUp(self):
  super(UtilsTest, self).setUp()
  # Create input splits.
  test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self._input_base_path = os.path.join(test_dir, 'input_base')
  fileio.makedirs(self._input_base_path)
def copy_file(src: Text, dst: Text, overwrite: bool = False):
  """Copies a single file from source to destination."""
  if overwrite and fileio.exists(dst):
    fileio.remove(dst)
  dst_dir = os.path.dirname(dst)
  fileio.makedirs(dst_dir)
  fileio.copy(src, dst, overwrite=overwrite)
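# A small usage sketch for copy_file; the paths are hypothetical and the
# snippet assumes module-level imports of os and tempfile. The destination
# directory is created if missing, and overwrite=True replaces any file
# already present at the destination.
def _copy_file_example():
  src_dir = tempfile.mkdtemp()
  src = os.path.join(src_dir, 'schema.pbtxt')
  with open(src, 'w') as f:
    f.write('SCHEMA')
  dst = os.path.join(src_dir, 'copies', 'schema.pbtxt')
  copy_file(src, dst, overwrite=True)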
def run(self,
        pipeline: tfx_pipeline.Pipeline,
        parameter_values: Optional[Dict[Text, Any]] = None,
        write_out: Optional[bool] = True) -> Dict[Text, Any]:
  """Compiles a pipeline DSL object into a pipeline file.

  Args:
    pipeline: TFX pipeline object.
    parameter_values: mapping from runtime parameter names to their values.
    write_out: set to True to actually write out the file to the place
      designated by output_dir and output_filename. Otherwise only return
      the JSON-serialized pipeline job spec.

  Returns:
    Returns the JSON pipeline job spec.

  Raises:
    RuntimeError: if trying to write out to a place occupied by an existing
      file.
  """
  # TODO(b/166343606): Support user-provided labels.
  # TODO(b/169095387): Deprecate .run() method in favor of the unified API
  # client.
  display_name = (
      self._config.display_name or pipeline.pipeline_info.pipeline_name)
  pipeline_spec = pipeline_builder.PipelineBuilder(
      tfx_pipeline=pipeline,
      default_image=self._config.default_image,
      default_commands=self._config.default_commands).build()
  pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
  pipeline_spec.schema_version = _SCHEMA_VERSION
  runtime_config = pipeline_builder.RuntimeConfigBuilder(
      pipeline_info=pipeline.pipeline_info,
      parameter_values=parameter_values).build()
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
    result = pipeline_spec_pb2.PipelineJob(
        display_name=display_name or pipeline.pipeline_info.pipeline_name,
        labels=telemetry_utils.get_labels_dict(),
        runtime_config=runtime_config)
  result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
  pipeline_json_dict = json_format.MessageToDict(result)
  if write_out:
    if fileio.exists(self._output_dir) and not fileio.isdir(self._output_dir):
      raise RuntimeError('Output path %s points to an existing file.' %
                         self._output_dir)
    if not fileio.exists(self._output_dir):
      fileio.makedirs(self._output_dir)
    with fileio.open(
        os.path.join(self._output_dir, self._output_filename), 'wb') as f:
      f.write(json.dumps(pipeline_json_dict, sort_keys=True))

  return pipeline_json_dict
def _get_tmp_dir(self) -> Text:
  """Gets the temporary directory path."""
  if not self._context:
    raise RuntimeError('No context for the executor')
  tmp_path = self._context.get_tmp_path()
  if not fileio.exists(tmp_path):
    absl.logging.info('Creating temp directory at %s', tmp_path)
    fileio.makedirs(tmp_path)
  return tmp_path
def _save_pipeline(self,
                   pipeline_args: Dict[Text, Any],
                   update: bool = False) -> None:
  """Creates/updates pipeline folder in the handler directory."""
  pipeline_name = pipeline_args[labels.PIPELINE_NAME]

  # Path to pipeline folder.
  handler_pipeline_path = os.path.join(self._handler_home_dir, pipeline_name)
  pipeline_package_path = self.flags_dict[labels.PIPELINE_PACKAGE_PATH]

  if update:
    pipeline_id = self._get_pipeline_id(pipeline_name)
    # A timestamp will be appended for the uniqueness of `version_name`.
    version_name = '{}_{}'.format(pipeline_name,
                                  time.strftime('%Y%m%d%H%M%S'))
    upload_response = self._client.pipeline_uploads.upload_pipeline_version(
        uploadfile=pipeline_package_path,
        name=version_name,
        pipelineid=pipeline_id)
    pipeline_version_id = upload_response.id
    experiment_id = self._get_experiment_id(pipeline_name)
  else:  # Creating a new pipeline.
    upload_response = self._client.upload_pipeline(
        pipeline_package_path=pipeline_package_path,
        pipeline_name=pipeline_name)
    pipeline_id = upload_response.id
    pipeline_version_id = upload_response.default_version.id
    # Create experiment with pipeline name as experiment name.
    experiment_name = pipeline_name
    experiment_id = self._client.create_experiment(experiment_name).id

  # Display the link to the pipeline detail page in KFP UI.
  click.echo(upload_response)
  click.echo('Please access the pipeline detail page at '
             '{prefix}/#/pipelines/details/{pipeline_id}'.format(
                 prefix=self._client._get_url_prefix(),  # pylint: disable=protected-access
                 pipeline_id=pipeline_id))

  # Add pipeline details to pipeline_args.
  pipeline_args[labels.PIPELINE_NAME] = pipeline_name
  pipeline_args[labels.PIPELINE_ID] = pipeline_id
  pipeline_args[labels.PIPELINE_VERSION_ID] = pipeline_version_id
  pipeline_args[labels.PIPELINE_PACKAGE_PATH] = pipeline_package_path
  pipeline_args[labels.EXPERIMENT_ID] = experiment_id

  # Path to pipeline_args.json.
  pipeline_args_path = os.path.join(handler_pipeline_path,
                                    'pipeline_args.json')
  # Copy pipeline_args to pipeline folder.
  fileio.makedirs(handler_pipeline_path)
  with open(pipeline_args_path, 'w') as f:
    json.dump(pipeline_args, f)
def side_effect(cmd, stdout, stderr):
  self.assertLen(cmd, 3)
  self.assertEqual(sys.executable, cmd[0])
  self.assertEqual('sdist', cmd[2])
  self.assertEqual(stdout, stderr)
  setup_file = cmd[1]
  dist_dir = os.path.join(os.path.dirname(setup_file), 'dist')
  fileio.makedirs(dist_dir)
  dest_file = os.path.join(dist_dir, expected_package)
  fileio.copy(test_file, dest_file)
def write_tfrecord_file(file_name: Text, proto: Message) -> None:
  """Writes a serialized tfrecord to file."""
  try:
    import tensorflow as tf  # pylint: disable=g-import-not-at-top
  except ModuleNotFoundError as e:
    raise Exception(
        'TensorFlow must be installed to use this functionality.') from e
  fileio.makedirs(os.path.dirname(file_name))
  with tf.io.TFRecordWriter(file_name) as writer:
    writer.write(proto.SerializeToString())
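# A usage sketch for write_tfrecord_file, assuming TensorFlow is installed
# and os/tempfile are imported at module level. It writes one
# tf.train.Example proto to a TFRecord file under a hypothetical temp path;
# the parent directory is created by the function itself.
def _write_tfrecord_example():
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  example = tf.train.Example(
      features=tf.train.Features(
          feature={
              'label':
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=[1])),
          }))
  file_name = os.path.join(tempfile.mkdtemp(), 'data', 'examples.tfrecord')
  write_tfrecord_file(file_name, example)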
def setUp(self):
  super(BaseDriverTest, self).setUp()
  self._mock_metadata = tf.compat.v1.test.mock.Mock()
  self._input_dict = {
      'input_data':
          types.Channel(
              type=_InputArtifact,
              artifacts=[_InputArtifact()],
              producer_component_id='c',
              output_key='k'),
      'input_string':
          types.Channel(
              type=standard_artifacts.String,
              artifacts=[
                  standard_artifacts.String(),
                  standard_artifacts.String()
              ],
              producer_component_id='c2',
              output_key='k2'),
  }
  input_dir = os.path.join(
      os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
      self._testMethodName, 'input_dir')
  # Valid input artifacts must have a uri pointing to an existing directory.
  for key, input_channel in self._input_dict.items():
    for index, artifact in enumerate(input_channel.get()):
      artifact.id = index + 1
      uri = os.path.join(input_dir, key, str(artifact.id))
      artifact.uri = uri
      fileio.makedirs(uri)
  self._output_dict = {
      'output_data':
          types.Channel(type=_OutputArtifact, artifacts=[_OutputArtifact()]),
      'output_multi_data':
          types.Channel(
              type=_OutputArtifact, matching_channel_name='input_string')
  }
  self._input_artifacts = channel_utils.unwrap_channel_dict(self._input_dict)
  self._output_artifacts = channel_utils.unwrap_channel_dict(
      self._output_dict)
  self._exec_properties = {
      'key': 'value',
  }
  self._execution_id = 100
  self._execution = metadata_store_pb2.Execution()
  self._execution.id = self._execution_id
  self._context_id = 123
  self._driver_args = data_types.DriverArgs(enable_cache=True)
  self._pipeline_info = data_types.PipelineInfo(
      pipeline_name='my_pipeline_name',
      pipeline_root=os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
      run_id='my_run_id')
  self._component_info = data_types.ComponentInfo(
      component_type='a.b.c',
      component_id='my_component_id',
      pipeline_info=self._pipeline_info)
def setUp(self):
  super().setUp()
  self._source_data_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
      'components', 'testdata')
  self._output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  fileio.makedirs(self._output_data_dir)

  self._model_export = standard_artifacts.Model()
  self._model_export.uri = os.path.join(self._source_data_dir,
                                        'trainer/current')
  self._model_blessing = standard_artifacts.ModelBlessing()
  self._input_dict = {
      standard_component_specs.MODEL_KEY: [self._model_export],
      standard_component_specs.MODEL_BLESSING_KEY: [self._model_blessing],
  }

  self._model_push = standard_artifacts.PushedModel()
  self._model_push.uri = os.path.join(self._output_data_dir, 'model_push')
  fileio.makedirs(self._model_push.uri)
  self._output_dict = {
      standard_component_specs.PUSHED_MODEL_KEY: [self._model_push],
  }

  # Dict format of exec_properties. custom_config needs to be serialized
  # before being passed into Do function.
  self._exec_properties = {
      'custom_config': {
          constants.SERVING_ARGS_KEY: {
              'model_name': 'model_name',
              'project_id': 'project_id'
          },
      },
      'push_destination': None,
  }

  self._container_image_uri_vertex = 'gcr.io/path/to/container'
  # Dict format of exec_properties for Vertex. custom_config needs to be
  # serialized before being passed into Do function.
  self._exec_properties_vertex = {
      'custom_config': {
          constants.SERVING_ARGS_KEY: {
              'endpoint_name': 'endpoint_name',
              'project_id': 'project_id',
          },
          constants.VERTEX_CONTAINER_IMAGE_URI_KEY:
              self._container_image_uri_vertex,
          constants.VERTEX_REGION_KEY: 'us-central1',
          constants.ENABLE_VERTEX_KEY: True,
      },
      'push_destination': None,
  }
  self._executor = executor.Executor()
def get_stateful_working_directory(self):
  """Generates the stateful working directory for the current execution."""
  # TODO(b/150979622): We should introduce an id that does not change across
  # retries of the same component run to provide better isolation between
  # "retry" and "new execution". When it is available, introduce it into the
  # stateful working directory.
  stateful_working_dir = os.path.join(self._node_dir,
                                      self._pipeline_run_id,
                                      _STATEFUL_WORKING_DIR)
  fileio.makedirs(stateful_working_dir)
  return stateful_working_dir
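# A sketch of the path this method returns (the run id below is hypothetical,
# and _STATEFUL_WORKING_DIR is whatever constant the module defines):
#
#   os.path.join(node_dir, pipeline_run_id, _STATEFUL_WORKING_DIR)
#   e.g. '/pipeline_root/Trainer/run-20210101/stateful_working_dir'
#
# Repeated calls within the same pipeline run return the same directory;
# fileio.makedirs tolerates an already-existing path.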
def createFiles(self, dir_spec, base_dir=None):
  if base_dir is None:
    base_dir = self._base_dir
  for key, value in dir_spec.items():
    full_path = os.path.join(base_dir, key)
    if isinstance(value, str):
      io_utils.write_string_file(full_path, value)
    elif isinstance(value, dict):
      fileio.makedirs(full_path)
      self.createFiles(value, base_dir=full_path)
    else:
      raise TypeError(f'Invalid directory spec: {dir_spec}')
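# A usage sketch for createFiles (names are hypothetical): string values
# become file contents, nested dicts become subdirectories, and an empty
# dict creates an empty directory.
_example_dir_spec = {
    'README.md': 'hello',
    'data': {
        'train': {'examples.csv': 'a,b\n1,2'},
        'eval': {},
    },
}
# self.createFiles(_example_dir_spec) creates, under self._base_dir:
#   README.md, data/train/examples.csv and data/eval/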
def testGetRun(self):
  # Create a pipeline in beam home.
  handler_pipeline_path = os.path.join(
      os.environ['BEAM_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
  fileio.makedirs(handler_pipeline_path)

  # Now run the pipeline.
  flags_dict = {labels.ENGINE_FLAG: self.engine, labels.RUN_ID: self.run_id}
  handler = beam_handler.BeamHandler(flags_dict)
  with self.captureWritesToStream(sys.stdout) as captured:
    handler.get_run()
  self.assertIn('Not supported for beam orchestrator.', captured.contents())
def setUp(self):
  super().setUp()
  self.tmp_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  fileio.makedirs(self.tmp_dir)
  # TODO(b/176196624): Delete following block when we drop support for TF<2.4.
  # Manually set up exit_stack because absltest.TestCase.setUp() is not called
  # in TF<2.4.
  if self._exit_stack is None:
    self._exit_stack = contextlib.ExitStack()
    self.addCleanup(self._exit_stack.close)
def testTerminateRun(self):
  # Create a pipeline in local home.
  handler_pipeline_path = os.path.join(
      os.environ['LOCAL_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
  fileio.makedirs(handler_pipeline_path)

  # Now run the pipeline.
  flags_dict = {labels.ENGINE_FLAG: self.engine, labels.RUN_ID: self.run_id}
  handler = local_handler.LocalHandler(flags_dict)
  with self.captureWritesToStream(sys.stdout) as captured:
    handler.terminate_run()
  self.assertIn('Not supported for local orchestrator.', captured.contents())
def testGetSchema(self):
  flags_dict = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_DSL_PATH: self.pipeline_path,
      labels.ENDPOINT: self.endpoint,
      labels.IAP_CLIENT_ID: self.iap_client_id,
      labels.NAMESPACE: self.namespace,
      labels.PIPELINE_PACKAGE_PATH: self.pipeline_package_path
  }
  handler = kubeflow_handler.KubeflowHandler(flags_dict)
  handler.create_pipeline()

  flags_dict = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_NAME: self.pipeline_name,
  }

  # No pipeline root.
  handler = kubeflow_handler.KubeflowHandler(flags_dict)
  with self.assertRaises(SystemExit) as err:
    handler.get_schema()
  self.assertEqual(
      str(err.exception),
      'Create a run before inferring schema. If pipeline is already '
      'running, then wait for it to successfully finish.')

  # No SchemaGen output.
  fileio.makedirs(self.pipeline_root)
  with self.assertRaises(SystemExit) as err:
    handler.get_schema()
  self.assertEqual(
      str(err.exception),
      'Either SchemaGen component does not exist or pipeline is still '
      'running. If pipeline is running, then wait for it to successfully '
      'finish.')

  # Successful pipeline run.
  # Create fake schema in pipeline root.
  component_output_dir = os.path.join(self.pipeline_root, 'SchemaGen')
  schema_path = base_driver._generate_output_uri(  # pylint: disable=protected-access
      component_output_dir, 'schema', 3)
  fileio.makedirs(schema_path)
  with open(os.path.join(schema_path, 'schema.pbtxt'), 'w') as f:
    f.write('SCHEMA')

  with self.captureWritesToStream(sys.stdout) as captured:
    handler.get_schema()
    curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
    self.assertIn('Path to schema: {}'.format(curr_dir_path),
                  captured.contents())
    self.assertIn(
        '*********SCHEMA FOR {}**********'.format(
            self.pipeline_name.upper()), captured.contents())
    self.assertTrue(fileio.exists(curr_dir_path))
def setUp(self):
  # Prepare executor input.
  serialized_metadata = self._get_text_from_test_data(
      "executor_invocation.json")
  metadata_json = json.loads(serialized_metadata)
  # Mutate the outputFile field.
  metadata_json["outputs"]["outputFile"] = _TEST_OUTPUT_METADATA_JSON
  self._serialized_metadata = json.dumps(metadata_json)

  self._expected_output = json.loads(
      self._get_text_from_test_data("expected_output_metadata.json"))

  super(KubeflowV2RunExecutorTest, self).setUp()
  fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
def _export_fn(estimator, export_path, checkpoint_path, eval_result,
               is_the_final_export):
  del estimator, checkpoint_path, eval_result, is_the_final_export
  path = os.path.join(export_path, BASE_EXPORT_SUBDIR)
  fileio.makedirs(path)
  with fileio.open(os.path.join(path, ORIGINAL_SAVED_MODEL), 'w') as f:
    f.write(str(ORIGINAL_SAVED_MODEL))

  assets_path = os.path.join(path, tf.saved_model.ASSETS_DIRECTORY)
  fileio.makedirs(assets_path)
  with fileio.open(os.path.join(assets_path, ORIGINAL_VOCAB), 'w') as f:
    f.write(str(ORIGINAL_VOCAB))

  return path
def testCheckPipelineExistenceNotRequired(self):
  flags_dict = {
      labels.ENGINE_FLAG: 'beam',
      labels.PIPELINE_NAME: 'pipeline'
  }
  handler = FakeHandler(flags_dict)
  fileio.makedirs(
      os.path.join(os.environ['HOME'], 'tfx', 'beam', 'pipeline', ''))
  with self.assertRaises(SystemExit) as err:
    handler._check_pipeline_existence(
        flags_dict[labels.PIPELINE_NAME], required=False)
  # assertEqual instead of assertTrue: the original passed the expected
  # message as the `msg` argument, making the assertion vacuous.
  self.assertEqual(
      str(err.exception),
      'Pipeline "{}" already exists.'.format(
          flags_dict[labels.PIPELINE_NAME]))