def _PrepareModelPath(
    self, model_uri: Text,
    serving_spec: infra_validator_pb2.ServingSpec) -> Text:
  model_path = path_utils.serving_model_path(model_uri)
  serving_binary = serving_spec.WhichOneof('serving_binary')
  if serving_binary == 'tensorflow_serving':
    # TensorFlow Serving requires model to be stored in its own directory
    # structure flavor. If current model_path does not conform to the flavor,
    # we need to make a copy to the temporary path.
    try:
      # Check whether current model_path conforms to the tensorflow serving
      # model path flavor. (Parsed without exception)
      tf_serving_flavor.parse_model_path(
          model_path,
          expected_model_name=serving_spec.model_name)
    except ValueError:
      # Copy the model to comply with the tensorflow serving model path
      # flavor.
      temp_model_path = tf_serving_flavor.make_model_path(
          model_base_path=self._get_tmp_dir(),
          model_name=serving_spec.model_name,
          version=int(time.time()))
      io_utils.copy_dir(src=model_path, dst=temp_model_path)
      return temp_model_path

  return model_path
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'iris/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'iris/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict['examples'] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties['train_args'] = json_format.MessageToJson(
      trainer_pb2.TrainArgs(splits=['training'], num_steps=1000),
      preserving_proto_field_name=True)
  self._exec_properties['eval_args'] = json_format.MessageToJson(
      trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500),
      preserving_proto_field_name=True)
  self._exec_properties['module_file'] = os.path.join(
      self._testdata_dir, 'module_file', 'tuner_module.py')

  tuner = executor.Executor(self._context)
  tuner.Do(
      input_dict=self._input_dict,
      output_dict=self._output_dict,
      exec_properties=self._exec_properties)

  self._verify_output()
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                  self._airflow_home)

  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
  # Following environment variables make scheduler process dags faster.
  os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
  os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
  os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
  # Using more than one thread results in a warning for sqlite backend.
  # See https://github.com/tensorflow/tfx/issues/141
  os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.gfile.ListDirectory(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.gfile.IsDirectory(target_data_dir)
  content = tf.gfile.ListDirectory(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def _PrepareModelPath(
    self, model: types.Artifact,
    serving_spec: infra_validator_pb2.ServingSpec) -> str:
  model_path = path_utils.serving_model_path(
      model.uri, path_utils.is_old_model_artifact(model))
  serving_binary = serving_spec.WhichOneof('serving_binary')
  if serving_binary == _TENSORFLOW_SERVING:
    # TensorFlow Serving requires model to be stored in its own directory
    # structure flavor. If current model_path does not conform to the flavor,
    # we need to make a copy to the temporary path.
    try:
      # Check whether current model_path conforms to the tensorflow serving
      # model path flavor. (Parsed without exception)
      tf_serving_flavor.parse_model_path(
          model_path,
          expected_model_name=serving_spec.model_name)
    except ValueError:
      # Copy the model to comply with the tensorflow serving model path
      # flavor.
      temp_model_path = tf_serving_flavor.make_model_path(
          model_base_path=self._get_tmp_dir(),
          model_name=serving_spec.model_name,
          version=int(time.time()))
      io_utils.copy_dir(src=model_path, dst=temp_model_path)
      self._AddCleanup(io_utils.delete_dir, self._context.get_tmp_path())
      return temp_model_path

  return model_path
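# --- Illustrative note (not part of the TFX executors above) ---
# The "model path flavor" checked by _PrepareModelPath is the directory layout
# TensorFlow Serving expects on disk: <model_base_path>/<model_name>/<version>.
# The sketch below only illustrates that layout with the standard library; the
# real code relies on tf_serving_flavor.make_model_path/parse_model_path, whose
# behavior is assumed from the calls above.
import os
import time


def _example_tf_serving_model_path(tmp_dir: str, model_name: str) -> str:
  # e.g. '/tmp/infra_validation/my_model/1618033988'
  return os.path.join(tmp_dir, model_name, str(int(time.time())))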
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'penguin/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._testdata_dir, 'penguin/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict[standard_component_specs.EXAMPLES_KEY] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties[
      standard_component_specs.TRAIN_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.TrainArgs(splits=['training'], num_steps=1000))
  self._exec_properties[
      standard_component_specs.EVAL_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = os.path.join(
          self._testdata_dir, 'module_file', 'tuner_module.py')

  tuner = executor.Executor(self._context)
  tuner.Do(
      input_dict=self._input_dict,
      output_dict=self._output_dict,
      exec_properties=self._exec_properties)

  self._verify_output()
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Copies over recorded data to pipeline output uri.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
    output_dict: Output dict from output key to a list of Artifacts.
    exec_properties: A dict of execution properties.

  Returns:
    None

  Raises:
    FileNotFoundError: If the recorded test data dir doesn't exist any more.
  """
  for output_key, artifact_list in output_dict.items():
    for idx, artifact in enumerate(artifact_list):
      dest = artifact.uri
      src = os.path.join(self._test_data_dir, self._component_id, output_key,
                         str(idx))
      if not os.path.exists(src):
        raise FileNotFoundError("{} does not exist".format(src))
      io_utils.copy_dir(src, dest)
      logging.info("Finished copying from %s to %s", src, dest)
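# --- Illustrative note (not part of the executor above) ---
# The Do() above expects recorded test data laid out as
# <test_data_dir>/<component_id>/<output_key>/<index>, matching the `src`
# path it builds. The component id, output key and index below are
# hypothetical placeholders.
import os


def _example_recorded_src(test_data_dir: str) -> str:
  # e.g. '<test_data_dir>/CsvExampleGen/examples/0'
  return os.path.join(test_data_dir, 'CsvExampleGen', 'examples', '0')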
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict[constants.EXAMPLES_KEY] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties['train_args'] = json_format.MessageToJson(
      trainer_pb2.TrainArgs(splits=['training'], num_steps=1000),
      preserving_proto_field_name=True)
  self._exec_properties['eval_args'] = json_format.MessageToJson(
      trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500),
      preserving_proto_field_name=True)
  self._exec_properties['module_file'] = self._module_file

  self._do(self._trainer_executor)
  self._verify_model_exports()
  self._verify_model_run_exports()
def testDoWithCustomSplits(self):
  # Update input dict.
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/train'),
      os.path.join(self._output_data_dir, 'data/training'))
  io_utils.copy_dir(
      os.path.join(self._source_data_dir,
                   'transform/transformed_examples/data/eval'),
      os.path.join(self._output_data_dir, 'data/evaluating'))
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(self._output_data_dir, 'data')
  examples.split_names = artifact_utils.encode_split_names(
      ['training', 'evaluating'])
  self._input_dict[standard_component_specs.EXAMPLES_KEY] = [examples]

  # Update exec properties skeleton with custom splits.
  self._exec_properties[
      standard_component_specs.TRAIN_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.TrainArgs(splits=['training'], num_steps=1000))
  self._exec_properties[
      standard_component_specs.EVAL_ARGS_KEY] = proto_utils.proto_to_json(
          trainer_pb2.EvalArgs(splits=['evaluating'], num_steps=500))
  self._exec_properties[
      standard_component_specs.MODULE_FILE_KEY] = self._module_file

  self._do(self._trainer_executor)
  self._verify_model_exports()
  self._verify_model_run_exports()
def _CreateWarmupModel(self, blessing: types.Artifact, model_path: str,
                       warmup_requests: List[iv_types.Request]):
  output_model_path = path_utils.stamped_model_path(blessing.uri)
  io_utils.copy_dir(src=model_path, dst=output_model_path)
  io_utils.write_tfrecord_file(
      path_utils.warmup_file_path(output_model_path),
      *[_convert_to_prediction_log(r) for r in warmup_requests])
  blessing.set_int_custom_property(_MODEL_FLAG_KEY, 1)
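# --- Illustrative note (not part of the executor above) ---
# TensorFlow Serving reads warmup data from a TFRecord of PredictionLog protos
# stored under the SavedModel's assets.extra directory; path_utils
# .warmup_file_path presumably resolves to that location. The relative path
# below is the conventional TF Serving warmup file name, shown only for
# illustration.
_EXAMPLE_WARMUP_RELATIVE_PATH = 'assets.extra/tf_serving_warmup_requests'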
def _create_tflite_compatible_saved_model(src: Text, dst: Text):
  io_utils.copy_dir(src, dst)
  assets_path = os.path.join(dst, tf.saved_model.ASSETS_DIRECTORY)
  if fileio.exists(assets_path):
    fileio.rmtree(assets_path)
  assets_extra_path = os.path.join(dst, EXTRA_ASSETS_DIRECTORY)
  if fileio.exists(assets_extra_path):
    fileio.rmtree(assets_extra_path)
def record_pipeline(output_dir: Text,
                    metadata_db_uri: Optional[Text] = None,
                    host: Optional[Text] = None,
                    port: Optional[int] = None,
                    pipeline_name: Optional[Text] = None,
                    run_id: Optional[Text] = None) -> None:
  """Record pipeline run with run_id to output_dir.

  For the beam pipeline, metadata_db_uri is required. For KFP pipeline, host
  and port should be specified. If run_id is not specified, then pipeline_name
  ought to be specified in order to fetch the latest execution for the
  specified pipeline.

  Args:
    output_dir: Directory path where the pipeline outputs should be recorded.
    metadata_db_uri: Uri to metadata db.
    host: Hostname of the metadata grpc server.
    port: Port number of the metadata grpc server.
    pipeline_name: Pipeline name, which is required if run_id isn't specified.
    run_id: Pipeline execution run_id.

  Raises:
    ValueError: In cases of invalid arguments:
      - metadata_db_uri is None or host and/or port is None.
      - run_id is None and pipeline_name is None.
    FileNotFoundError: if the source artifact uri does not already exist.
  """
  if host is not None and port is not None:
    metadata_config = metadata_store_pb2.MetadataStoreClientConfig(
        host=host, port=port)
  elif metadata_db_uri is not None:
    metadata_config = metadata.sqlite_metadata_connection_config(
        metadata_db_uri)
  else:
    raise ValueError('For KFP, host and port are required. '
                     'For beam pipeline, metadata_db_uri is required.')

  with metadata.Metadata(metadata_config) as metadata_connection:
    if run_id is None:
      if pipeline_name is None:
        raise ValueError('If the run_id is not specified, '
                         'pipeline_name should be specified')
      # Fetch executions of the most recently updated execution context.
      executions = _get_latest_executions(metadata_connection, pipeline_name)
    else:
      execution_dict = _get_execution_dict(metadata_connection)
      if run_id in execution_dict:
        executions = execution_dict[run_id]
      else:
        raise ValueError(
            'run_id {} is not recorded in the MLMD metadata'.format(run_id))

    for src_uri, dest_uri in _get_paths(metadata_connection, executions,
                                        output_dir):
      io_utils.copy_dir(src_uri, dest_uri)
    logging.info('Pipeline Recorded at %s', output_dir)
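# --- Hedged usage sketch (not part of the TFX source above) ---
# Two ways record_pipeline can be invoked, following its docstring: a local or
# beam pipeline is addressed through its sqlite MLMD database, a KFP pipeline
# through the MLMD gRPC server. All paths, names and the run id below are
# placeholders.
if __name__ == '__main__':
  # Record the latest run of a named pipeline from a local metadata db.
  record_pipeline(
      output_dir='/tmp/recorded_testdata',
      metadata_db_uri='/tmp/tfx/metadata/my_pipeline/metadata.db',
      pipeline_name='my_pipeline')

  # Record a specific KFP run via the MLMD gRPC server.
  record_pipeline(
      output_dir='/tmp/recorded_testdata',
      host='localhost',
      port=8080,
      run_id='my-kfp-run-id')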
def testCopyDir(self):
  old_path = os.path.join(self._base_dir, 'old', 'path')
  new_path = os.path.join(self._base_dir, 'new', 'path')

  io_utils.write_string_file(old_path, 'testing')
  io_utils.copy_dir(os.path.dirname(old_path), os.path.dirname(new_path))

  self.assertTrue(file_io.file_exists(new_path))
  f = file_io.FileIO(new_path, mode='r')
  self.assertEqual('testing', f.read())
  self.assertEqual(7, f.tell())
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # List of packages installed.
  self._pip_list = str(subprocess.check_output(['pip', 'freeze', '--local']))

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName, 'airflow')
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  self._airflow_initdb()

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Push model to target directory if blessed.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model_export: exported model from trainer.
    output_dict: Output dict from key to a list of artifacts, including:
      - model_push: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of pusher_pb2.PushDestination instance,
        providing instruction of destination to push model.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  model_push = artifact_utils.get_single_instance(
      output_dict[PUSHED_MODEL_KEY])
  model_push_uri = model_push.uri
  model_export = artifact_utils.get_single_instance(input_dict[MODEL_KEY])
  model_export_uri = model_export.uri
  logging.info('Model pushing.')

  # Copy the model to pushing uri.
  model_path = path_utils.serving_model_path(model_export_uri)
  model_version = str(int(time.time()))
  # model_version = path_utils.get_serving_model_version(model_export_uri)
  logging.info('Model version is %s', model_version)
  io_utils.copy_dir(model_path, os.path.join(model_push_uri, model_version))
  logging.info('Model written to %s.', model_push_uri)

  push_destination = pusher_pb2.PushDestination()
  json_format.Parse(exec_properties['push_destination'], push_destination)
  serving_path = os.path.join(push_destination.filesystem.base_directory,
                              model_version)
  if tf.io.gfile.exists(serving_path):
    logging.info(
        'Destination directory %s already exists, skipping current push.',
        serving_path)
  else:
    # tf.serving won't load partial model, it will retry until fully copied.
    io_utils.copy_dir(model_path, serving_path)
    logging.info('Model written to serving path %s.', serving_path)

  model_push.set_int_custom_property('pushed', 1)
  model_push.set_string_custom_property('pushed_model', model_export_uri)
  model_push.set_int_custom_property('pushed_model_id', model_export.id)
  logging.info('Model pushed to %s.', serving_path)
def setUp(self):
  super().setUp()

  # List of packages installed.
  self._pip_list = pip_utils.get_package_names()

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
  self.enter_context(
      test_case_utils.override_env_var('AIRFLOW_HOME', self._airflow_home))
  self.enter_context(
      test_case_utils.override_env_var('HOME', self._airflow_home))
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  self._pipeline_name = 'chicago_taxi_simple'
  self._pipeline_path = os.path.join(self._testdata_dir,
                                     'test_pipeline_airflow_1.py')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                  self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.gfile.ListDirectory(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.gfile.IsDirectory(target_data_dir)
  content = tf.gfile.ListDirectory(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()

  # Start scheduler.
  self._scheduler = subprocess.Popen(['airflow', 'scheduler'])
def setUpClass(cls):
  super(ExecutorTest, cls).setUpClass()
  source_example_dir = os.path.join(cls._SOURCE_DATA_DIR, 'csv_example_gen')

  io_utils.copy_dir(source_example_dir, cls._ARTIFACT1_URI)
  io_utils.copy_dir(source_example_dir, cls._ARTIFACT2_URI)

  # Duplicate the number of train and eval records such that
  # second artifact has twice as many as first.
  artifact2_pattern = os.path.join(cls._ARTIFACT2_URI, '*', '*')
  artifact2_files = tf.io.gfile.glob(artifact2_pattern)
  for filepath in artifact2_files:
    directory, filename = os.path.split(filepath)
    io_utils.copy_file(filepath, os.path.join(directory, 'dup_' + filename))
def testCopyDirWithTrailingSlashes(self):
  old_path1 = os.path.join(self._base_dir, 'old1', '')
  old_path_file1 = os.path.join(old_path1, 'child', 'file')
  new_path1 = os.path.join(self._base_dir, 'new1')
  new_path_file1 = os.path.join(new_path1, 'child', 'file')

  io_utils.write_string_file(old_path_file1, 'testing')
  io_utils.copy_dir(old_path1, new_path1)
  self.assertTrue(file_io.file_exists(new_path_file1))

  old_path2 = os.path.join(self._base_dir, 'old2')
  old_path_file2 = os.path.join(old_path2, 'child', 'file')
  new_path2 = os.path.join(self._base_dir, 'new2', '')
  new_path_file2 = os.path.join(new_path2, 'child', 'file')

  io_utils.write_string_file(old_path_file2, 'testing')
  io_utils.copy_dir(old_path2, new_path2)
  self.assertTrue(file_io.file_exists(new_path_file2))
def testCopyDir(self):
  old_path = os.path.join(self._base_dir, 'old')
  old_path_file1 = os.path.join(old_path, 'file1')
  old_path_file2 = os.path.join(old_path, 'dir', 'dir2', 'file2')
  new_path = os.path.join(self._base_dir, 'new')
  new_path_file1 = os.path.join(new_path, 'file1')
  new_path_file2 = os.path.join(new_path, 'dir', 'dir2', 'file2')

  io_utils.write_string_file(old_path_file1, 'testing')
  io_utils.write_string_file(old_path_file2, 'testing2')
  io_utils.copy_dir(old_path, new_path)

  self.assertTrue(file_io.file_exists(new_path_file1))
  f = file_io.FileIO(new_path_file1, mode='r')
  self.assertEqual('testing', f.readline())

  self.assertTrue(file_io.file_exists(new_path_file2))
  f = file_io.FileIO(new_path_file2, mode='r')
  self.assertEqual('testing2', f.readline())
def setUp(self):
  super(CliBeamEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup beam_home in a temp directory
  self._home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._home
  self._old_beam_home = os.environ.get('BEAM_HOME')
  os.environ['BEAM_HOME'] = os.path.join(self._home, 'beam', '')
  self._beam_home = os.environ['BEAM_HOME']

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def run_fn(fn_args: executor.TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = _trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # NOTE: When trained in distributed training cluster, eval_savedmodel must be
  # exported only by the chief worker (check TF_CONFIG).
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  eval_export_dir = path_utils.eval_model_dir(fn_args.model_run_dir)
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_export_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)

  # TODO(b/160795287): Deprecate estimator based executor.
  # Copy serving and eval model from model_run to model artifact directory.
  serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(serving_source, fn_args.serving_model_dir)
  absl.logging.info('Serving model copied to: %s.', fn_args.serving_model_dir)

  eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(eval_source, fn_args.eval_model_dir)
  absl.logging.info('Eval model copied to: %s.', fn_args.eval_model_dir)
def testCopyDir(self):
  self.createFiles({
      'old': {
          'file1.txt': 'testing',
          'dir1': {
              'dir2': {
                  'file2.txt': 'testing2'
              }
          }
      }
  })
  io_utils.copy_dir(self.relpath('old'), self.relpath('new'))
  self.assertDirectoryEqual(self.relpath('new'), {
      'file1.txt': 'testing',
      'dir1': {
          'dir2': {
              'file2.txt': 'testing2'
          }
      }
  })
def setUp(self):
  super().setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup beam_home in a temp directory
  self._home = self.tmp_dir
  self._beam_home = os.path.join(self._home, 'beam')
  self.enter_context(
      test_case_utils.override_env_var('BEAM_HOME', self._beam_home))
  self.enter_context(test_case_utils.override_env_var('HOME', self._home))

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super(PenguinPipelineLocalEndToEndTest, self).setUp()

  self._test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  self._pipeline_name = 'penguin_test'
  self._data_root = os.path.join(os.path.dirname(__file__), 'data')

  # Create a data root for rolling window test
  # - data
  #   - day1
  #     - penguins_processed.csv
  #   - day2
  #     - penguins_processed.csv
  #   - day3
  #     - penguins_processed.csv
  self._data_root_span = os.path.join(self._test_dir, 'data')
  io_utils.copy_dir(self._data_root,
                    os.path.join(self._data_root_span, 'day1'))
  io_utils.copy_dir(self._data_root,
                    os.path.join(self._data_root_span, 'day2'))
  io_utils.copy_dir(self._data_root,
                    os.path.join(self._data_root_span, 'day3'))

  self._module_file = os.path.join(os.path.dirname(__file__),
                                   'penguin_utils.py')
  self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
  self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines',
                                     self._pipeline_name)
  self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata',
                                     self._pipeline_name, 'metadata.db')
def setUp(self):
  super().setUp()
  self._test_dir = self.tmp_dir
  self.enter_context(test_case_utils.change_working_dir(self.tmp_dir))

  self._test_output_dir = 'gs://{}/test_output'.format(self._BUCKET_NAME)

  test_id = test_utils.random_id()

  self._testdata_root = 'gs://{}/test_data/{}'.format(self._BUCKET_NAME,
                                                      test_id)
  io_utils.copy_dir(self._TEST_DATA_ROOT, self._testdata_root)

  self._data_root = os.path.join(self._testdata_root, 'external', 'csv')

  self._transform_module = os.path.join(self._MODULE_ROOT,
                                        'transform_module.py')
  self._trainer_module = os.path.join(self._MODULE_ROOT, 'trainer_module.py')
  self._serving_model_dir = os.path.join(self._testdata_root, 'output')

  self.addCleanup(self._delete_test_dir, test_id)
def setUp(self):
  super().setUp()

  penguin_examples_dir = os.path.join(self._REPO_BASE, 'tfx', 'examples',
                                      'penguin')
  # The location of the penguin test data and schema. The input files are
  # copied to a test-local location for each invocation, and cleaned up at
  # the end of test.
  penguin_test_data_root = os.path.join(penguin_examples_dir, 'data')
  penguin_test_schema_file = os.path.join(penguin_examples_dir, 'schema',
                                          'user_provided', 'schema.pbtxt')

  # The location of the user module for penguin. Will be packaged and copied
  # to under the pipeline root before pipeline execution.
  self._penguin_dependency_file = os.path.join(
      penguin_examples_dir, 'penguin_utils_cloud_tuner.py')

  self._penguin_data_root = os.path.join(self._testdata_root, 'data')
  io_utils.copy_dir(penguin_test_data_root, self._penguin_data_root)

  self._penguin_schema_file = os.path.join(self._testdata_root,
                                           'schema.pbtxt')
  io_utils.copy_file(penguin_test_schema_file, self._penguin_schema_file)
def testCopyDirWithTrailingSlashes(self):
  self.createFiles({
      'old': {
          'dir': {
              'file.txt': 'testing'
          }
      }
  })

  with self.subTest('Copy old/ to new1'):
    io_utils.copy_dir(self.relpath('old', ''), self.relpath('new1'))
    self.assertDirectoryEqual(self.relpath('new1'), {
        'dir': {
            'file.txt': 'testing'
        }
    })

  with self.subTest('Copy old to new2/'):
    io_utils.copy_dir(self.relpath('old'), self.relpath('new2', ''))
    self.assertDirectoryEqual(self.relpath('new2'), {
        'dir': {
            'file.txt': 'testing'
        }
    })
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Push model to target directory if blessed.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model: exported model from trainer.
      - model_blessing: model blessing path from model_validator. A push
        action delivers the model exports produced by Trainer to the
        destination defined in component config.
    output_dict: Output dict from key to a list of artifacts, including:
      - pushed_model: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of pusher_pb2.PushDestination instance,
        providing instruction of destination to push model.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  model_push = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.PUSHED_MODEL_KEY])
  if not self.CheckBlessing(input_dict):
    self._MarkNotPushed(model_push)
    return
  model_export = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.MODEL_KEY])
  model_path = path_utils.serving_model_path(
      model_export.uri, path_utils.is_old_model_artifact(model_export))

  # Push the model to the destination, which a model server can watch.
  #
  # If the model was already successfully copied outside before, stop copying.
  # This is because the model validator might bless the same model twice
  # (check the mv driver) with different blessing outputs; we still want
  # Pusher to handle the mv output again to keep metadata tracking, but there
  # is no need to copy to the outside path again.
  # TODO(jyzhao): support rpc push and verification.
  push_destination = pusher_pb2.PushDestination()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.PUSH_DESTINATION_KEY],
      push_destination)

  destination_kind = push_destination.WhichOneof('destination')
  if destination_kind == 'filesystem':
    fs_config = push_destination.filesystem
    if fs_config.versioning == _Versioning.AUTO:
      fs_config.versioning = _Versioning.UNIX_TIMESTAMP
    if fs_config.versioning == _Versioning.UNIX_TIMESTAMP:
      model_version = str(int(time.time()))
    else:
      raise NotImplementedError(
          'Invalid Versioning {}'.format(fs_config.versioning))
    logging.info('Model version: %s', model_version)
    serving_path = os.path.join(fs_config.base_directory, model_version)

    if fileio.exists(serving_path):
      logging.info(
          'Destination directory %s already exists, skipping current push.',
          serving_path)
    else:
      # tf.serving won't load partial model, it will retry until fully copied.
      io_utils.copy_dir(model_path, serving_path)
      logging.info('Model written to serving path %s.', serving_path)
  else:
    raise NotImplementedError(
        'Invalid push destination {}'.format(destination_kind))

  # Copy the model to pushing uri for archiving.
  io_utils.copy_dir(model_path, model_push.uri)
  self._MarkPushed(
      model_push,
      pushed_destination=serving_path,
      pushed_version=model_version)
  logging.info('Model pushed to %s.', model_push.uri)
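# --- Hedged sketch (not part of the executor above) ---
# The push_destination exec property consumed by the Do() above is a
# pusher_pb2.PushDestination serialized to JSON. A filesystem destination with
# the default (AUTO -> unix-timestamp) versioning might be built like this;
# the base_directory value is a placeholder.
from tfx.proto import pusher_pb2
from tfx.utils import proto_utils

_example_push_destination = pusher_pb2.PushDestination(
    filesystem=pusher_pb2.PushDestination.Filesystem(
        base_directory='/tmp/serving_model_dir'))
_example_push_destination_json = proto_utils.proto_to_json(
    _example_push_destination)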
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
  """Overrides the tfx_pusher_executor.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model_export: exported model from trainer.
      - model_blessing: model blessing path from model_validator.
    output_dict: Output dict from key to a list of artifacts, including:
      - model_push: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: Mostly a passthrough input dict for
      tfx.components.Pusher.executor. custom_config.bigquery_serving_args is
      consumed by this class. For the full set of parameters supported by
      Big Query ML, refer to https://cloud.google.com/bigquery-ml/

  Returns:
    None

  Raises:
    ValueError:
      If bigquery_serving_args is not in exec_properties.custom_config.
      If pipeline_root is not 'gs://...'
    RuntimeError: if the Big Query job failed.
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  model_push = artifact_utils.get_single_instance(
      output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
  if not self.CheckBlessing(input_dict):
    self._MarkNotPushed(model_push)
    return

  model_export = artifact_utils.get_single_instance(
      input_dict[tfx_pusher_executor.MODEL_KEY])
  model_export_uri = model_export.uri

  custom_config = json_utils.loads(
      exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
  if custom_config is not None and not isinstance(custom_config, Dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict.')
  bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY)
  # if configuration is missing error out
  if bigquery_serving_args is None:
    raise ValueError('Big Query ML configuration was not provided')

  bq_model_uri = '.'.join([
      bigquery_serving_args[_PROJECT_ID_KEY],
      bigquery_serving_args[_BQ_DATASET_ID_KEY],
      bigquery_serving_args[_MODEL_NAME_KEY],
  ])

  # Deploy the model.
  io_utils.copy_dir(
      src=path_utils.serving_model_path(model_export_uri), dst=model_push.uri)
  model_path = model_push.uri
  if not model_path.startswith(_GCS_PREFIX):
    raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.')

  logging.info('Deploying the model to BigQuery ML for serving: %s from %s',
               bigquery_serving_args, model_path)

  query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format(
      model_uri=bq_model_uri, model_path=model_path)

  # TODO(zhitaoli): Refactor the executor_class_path creation into a common
  # utility function.
  executor_class_path = '%s.%s' % (self.__class__.__module__,
                                   self.__class__.__name__)
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    default_query_job_config = bigquery.job.QueryJobConfig(
        labels=telemetry_utils.get_labels_dict())
  client = bigquery.Client(default_query_job_config=default_query_job_config)

  try:
    query_job = client.query(query)
    query_job.result()  # Waits for the query to finish
  except Exception as e:
    raise RuntimeError('BigQuery ML Push failed: {}'.format(e))

  logging.info('Successfully deployed model %s serving from %s', bq_model_uri,
               model_path)

  # Setting the push_destination to bigquery uri
  self._MarkPushed(model_push, pushed_destination=bq_model_uri)
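# --- Hedged sketch (not part of the executor above) ---
# The _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE used above is not shown in
# this snippet. BigQuery ML imports a TensorFlow SavedModel with a statement
# of roughly this shape; the exact template in the executor may differ, and
# the identifiers below are placeholders.
_EXAMPLE_BQML_QUERY = """
CREATE OR REPLACE MODEL `my_project.my_dataset.my_model`
OPTIONS (MODEL_TYPE='TENSORFLOW',
         MODEL_PATH='gs://my-pipeline-root/Pusher/pushed_model/123/*')
"""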
def Do(self, input_dict, output_dict, exec_properties):
  """Push model to target if blessed.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model_export: exported model from trainer.
      - model_blessing: model blessing path from model_validator.
    output_dict: Output dict from key to a list of artifacts, including:
      - model_push: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of pusher_pb2.PushDestination instance,
        providing instruction of destination to push model.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  model_export = types.get_single_instance(input_dict['model_export'])
  model_export_uri = model_export.uri
  model_blessing_uri = types.get_single_uri(input_dict['model_blessing'])
  model_push = types.get_single_instance(output_dict['model_push'])
  model_push_uri = model_push.uri

  # TODO(jyzhao): should this be in driver or executor.
  if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
    model_push.set_int_custom_property('pushed', 0)
    # Pass the blessing uri to the log call; the original left the format
    # argument empty.
    tf.logging.info('Model on %s was not blessed', model_blessing_uri)
    return

  tf.logging.info('Model pushing.')
  # Copy the model we are pushing into
  model_path = path_utils.serving_model_path(model_export_uri)
  # Note: we do not have a logical model version right now. This
  # model_version is a timestamp mapped to trainer's exporter.
  model_version = os.path.basename(model_path)
  tf.logging.info('Model version is %s', model_version)
  io_utils.copy_dir(model_path, os.path.join(model_push_uri, model_version))
  tf.logging.info('Model written to %s.', model_push_uri)

  # Copied to a fixed outside path, which can be listened by model server.
  #
  # If the model was already successfully copied outside before, stop copying.
  # This is because the model validator might bless the same model twice
  # (check the mv driver) with different blessing outputs; we still want
  # Pusher to handle the mv output again to keep metadata tracking, but there
  # is no need to copy to the outside path again.
  # TODO(jyzhao): support rpc push and verification.
  push_destination = pusher_pb2.PushDestination()
  json_format.Parse(exec_properties['push_destination'], push_destination)
  serving_path = os.path.join(push_destination.filesystem.base_directory,
                              model_version)
  if tf.gfile.Exists(serving_path):
    tf.logging.info(
        'Destination directory %s already exists, skipping current push.',
        serving_path)
  else:
    # tf.serving won't load partial model, it will retry until fully copied.
    io_utils.copy_dir(model_path, serving_path)
    tf.logging.info('Model written to serving path %s.', serving_path)

  model_push.set_int_custom_property('pushed', 1)
  model_push.set_string_custom_property('pushed_model', model_export_uri)
  model_push.set_int_custom_property('pushed_model_id', model_export.id)
  tf.logging.info('Model pushed to %s.', serving_path)

  if exec_properties.get('custom_config'):
    cmle_serving_args = exec_properties.get('custom_config',
                                            {}).get('cmle_serving_args')
    if cmle_serving_args is not None:
      return cmle_runner.deploy_model_for_serving(serving_path, model_version,
                                                  cmle_serving_args,
                                                  exec_properties['log_root'])