def testCustomStubExecutor(self, mock_publisher):
  # verify whether custom stub executor substitution works
  mock_publisher.return_value.publish_execution.return_value = {}

  component_map = \
      {'_FakeComponent.FakeComponent': CustomStubExecutor}

  my_stub_launcher = \
      stub_component_launcher.get_stub_launcher_class(
          test_data_dir=self.record_dir,
          stubbed_component_ids=[],
          stubbed_component_map=component_map)

  launcher = my_stub_launcher.create(
      component=self.component,
      pipeline_info=self.pipeline_info,
      driver_args=self.driver_args,
      metadata_connection=self.metadata_connection,
      beam_pipeline_args=[],
      additional_pipeline_args={})
  launcher.launch()

  output_path = self.component.outputs['output'].get()[0].uri
  generated_file = os.path.join(output_path, 'result.txt')
  self.assertTrue(tf.io.gfile.exists(generated_file))
  contents = io_utils.read_string_file(generated_file)
  self.assertEqual('custom component', contents)
def testStubExecutor(self, mock_publisher):
  # verify whether base stub executor substitution works
  mock_publisher.return_value.publish_execution.return_value = {}

  record_file = os.path.join(self.record_dir, 'output', 'recorded.txt')
  io_utils.write_string_file(record_file, 'hello world')
  component_ids = ['_FakeComponent.FakeComponent']

  my_stub_launcher = \
      stub_component_launcher.get_stub_launcher_class(
          test_data_dir=self.record_dir,
          stubbed_component_ids=component_ids,
          stubbed_component_map={})

  launcher = my_stub_launcher.create(
      component=self.component,
      pipeline_info=self.pipeline_info,
      driver_args=self.driver_args,
      metadata_connection=self.metadata_connection,
      beam_pipeline_args=[],
      additional_pipeline_args={})
  launcher.launch()

  output_path = self.component.outputs['output'].get()[0].uri
  copied_file = os.path.join(output_path, 'recorded.txt')
  self.assertTrue(tf.io.gfile.exists(copied_file))
  contents = io_utils.read_string_file(copied_file)
  self.assertEqual('hello world', contents)
def testRecordBeamPipelineRunId(self, mock_metadata, mock_config):
  # Tests recording Beam pipeline outputs given a run_id.
  with mock.patch.object(
      pipeline_recorder_utils, '_get_execution_dict',
      return_value=self.execution_dict) as mock_get_execution_dict, \
      mock.patch.object(
          pipeline_recorder_utils, '_get_paths',
          return_value=self.paths) as mock_get_paths:
    pipeline_recorder_utils.record_pipeline(
        output_dir=self._base_dir,
        metadata_db_uri=self.metadata_db_uri,
        run_id=self.run_id)

    mock_config.assert_called_with(self.metadata_db_uri)
    mock_metadata.assert_called()
    mock_get_execution_dict.assert_called()
    mock_get_paths.assert_called()

  # Verifying that test.txt has been copied from src_uri to dest_uri
  files = fileio.listdir(self.dest_uri)
  self.assertLen(files, 1)
  self.assertEqual(
      io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
      self.content)
def testExecutor(self, mock_publisher):
  # verify whether original executors can run
  mock_publisher.return_value.publish_execution.return_value = {}

  io_utils.write_string_file(
      os.path.join(self.input_dir, 'result.txt'), 'test')

  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self.record_dir,
      test_component_ids=[self.component.id])

  launcher = stub_component_launcher.StubComponentLauncher.create(
      component=self.component,
      pipeline_info=self.pipeline_info,
      driver_args=self.driver_args,
      metadata_connection=self.metadata_connection,
      beam_pipeline_args=[],
      additional_pipeline_args={})
  self.assertEqual(
      launcher._component_info.component_type,  # pylint: disable=protected-access
      '.'.join([
          test_utils._FakeComponent.__module__,  # pylint: disable=protected-access
          test_utils._FakeComponent.__name__,  # pylint: disable=protected-access
      ]))
  launcher.launch()

  output_path = self.component.outputs[self.output_key].get()[0].uri
  self.assertTrue(fileio.exists(output_path))
  contents = io_utils.read_string_file(output_path)
  self.assertEqual('test', contents)
def testRecordLatestBeamPipeline(self, mock_get_latest_executions,
                                 mock_metadata, mock_config):
  # Tests recording Beam pipeline outputs for the latest execution.
  with mock.patch.object(
      pipeline_recorder_utils, '_get_paths',
      return_value=self.paths) as mock_get_paths:
    pipeline_recorder_utils.record_pipeline(
        output_dir=self._base_dir,
        metadata_db_uri=self.metadata_db_uri,
        host=None,
        port=None,
        pipeline_name=self.pipeline_name,
        run_id=None)

    mock_config.assert_called_with(self.metadata_db_uri)
    mock_metadata.assert_called()
    mock_get_paths.assert_called()
    mock_get_latest_executions.assert_called()

  # Verifying that test.txt has been copied from src_uri to dest_uri
  files = tf.io.gfile.listdir(self.dest_uri)
  self.assertLen(files, 1)
  self.assertEqual(
      io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
      self.content)
def testStubExecutor(self, mock_publisher):
  # verify whether base stub executor substitution works
  mock_publisher.return_value.publish_execution.return_value = {}

  record_file = os.path.join(self.record_dir, self.component.id,
                             self.output_key, '0', 'recorded.txt')
  io_utils.write_string_file(record_file, 'hello world')

  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self.record_dir,
      test_component_ids=[])

  launcher = stub_component_launcher.StubComponentLauncher.create(
      component=self.component,
      pipeline_info=self.pipeline_info,
      driver_args=self.driver_args,
      metadata_connection=self.metadata_connection,
      beam_pipeline_args=[],
      additional_pipeline_args={})
  launcher.launch()

  output_path = self.component.outputs[self.output_key].get()[0].uri
  copied_file = os.path.join(output_path, 'recorded.txt')
  self.assertTrue(fileio.exists(copied_file))
  contents = io_utils.read_string_file(copied_file)
  self.assertEqual('hello world', contents)
def copy_and_change_pipeline_name(orig_path: str, new_path: str,
                                  origin_pipeline_name: str,
                                  new_pipeline_name: str) -> None:
  """Copy pipeline file to new path with pipeline name changed."""
  contents = io_utils.read_string_file(orig_path)
  assert contents.count(origin_pipeline_name) == 1, (
      'DSL file can only contain one pipeline name')
  contents = contents.replace(origin_pipeline_name, new_pipeline_name)
  io_utils.write_string_file(new_path, contents)
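# A minimal, hypothetical usage sketch for copy_and_change_pipeline_name (the
# helper name below, the temp-file names, and the DSL contents are made up for
# illustration): write a tiny DSL file containing the pipeline name exactly
# once, then copy it with the name swapped.
def _example_copy_and_rename(tmp_dir: str) -> str:
  orig_path = os.path.join(tmp_dir, 'orig_pipeline.py')
  new_path = os.path.join(tmp_dir, 'renamed_pipeline.py')
  io_utils.write_string_file(orig_path, "PIPELINE_NAME = 'my_pipeline'\n")
  copy_and_change_pipeline_name(
      orig_path=orig_path,
      new_path=new_path,
      origin_pipeline_name='my_pipeline',
      new_pipeline_name='my_pipeline_v2')
  # Returns "PIPELINE_NAME = 'my_pipeline_v2'\n".
  return io_utils.read_string_file(new_path)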
def _verify_metafeature_gen_outputs(self):
  self.assertNotEmpty(tf.io.gfile.listdir(self._metafeatures.uri))
  metafeature_path = os.path.join(self._metafeatures.uri,
                                  artifacts.MetaFeatures.DEFAULT_FILE_NAME)
  metafeature = json.loads(io_utils.read_string_file(metafeature_path))
  self.assertEqual(metafeature['num_examples'], 3)
  self.assertEqual(metafeature['num_int_features'], 1)
  self.assertEqual(metafeature['num_float_features'], 1)
  self.assertEqual(metafeature['num_categorical_features'], 2)
def _verify_hparams_outputs(self, algorithm: str):
  path = os.path.join(self._hparams_out.uri, 'meta_hyperparameters.txt')
  self.assertTrue(tf.io.gfile.exists(path))
  hparams_json_list = json.loads(io_utils.read_string_file(path))
  if algorithm == executor.MAJORITY_VOTING:
    self._verify_hparams_values_majority_voting(hparams_json_list)
  elif algorithm == executor.NEAREST_NEIGHBOR:
    self._verify_hparams_values_nearest_neighbor(hparams_json_list)
def test_create_search_space_using_voting(self):
  metadata_indices = [1, 2, 3]
  all_hparams = []
  for dataset_id in metadata_indices:
    hyperparameters_file = os.path.join(
        self._input_data_dir, f'Tuner.train_mockdata_{dataset_id}',
        'best_hyperparameters', 'best_hyperparameters.txt')
    hparams_json = json.loads(io_utils.read_string_file(hyperparameters_file))
    all_hparams.append(hparams_json['values'])

  ex = executor.MetaLearnerExecutor()
  search_space = ex._create_search_space_using_voting(all_hparams)
  self._verify_hparams_values_majority_voting([search_space.get_config()])
def test_create_knn_model_from_metafeatures(self):
  metadata_indices = [1, 2, 3]
  metafeatures_list = []
  for dataset_id in metadata_indices:
    metafeature_uri = os.path.join(
        self._input_data_dir, f'MetaFeatureGen.train_mockdata_{dataset_id}',
        'metafeatures', 'metafeatures.txt')
    metafeatures = json.loads(io_utils.read_string_file(metafeature_uri))
    metafeatures_list.append(metafeatures['metafeature'])

  ex = executor.MetaLearnerExecutor()
  model = ex._create_knn_model_from_metafeatures(metafeatures_list)
  model_weights = model.get_layer('metafeatures').get_weights()
  self._verify_model_weights(model_weights[0])
def testExitHandlerPipelineSuccess(self):
  """End-to-End test for a successful pipeline with exit handler."""
  pipeline_name = 'kubeflow-v2-exit-handler-test-{}'.format(
      orchestration_test_utils.random_id())

  components = test_utils.simple_pipeline_components(_TEST_DATA_ROOT)

  beam_pipeline_args = [
      '--temp_location=' + os.path.join(
          self._pipeline_root(pipeline_name), 'dataflow', 'temp'),
      '--project={}'.format(self._GCP_PROJECT_ID)
  ]

  pipeline = self._create_pipeline(pipeline_name, components,
                                   beam_pipeline_args)

  output_file_dir = os.path.join(self._pipeline_root(pipeline_name),
                                 _success_file_name)

  exit_handler = custom_exit_handler.test_exit_handler(
      final_status=tfx.orchestration.experimental.FinalStatusStr(),
      file_dir=output_file_dir)

  self._run_pipeline(pipeline=pipeline, exit_handler=exit_handler)

  # verify execution results
  actual_final_status_str = io_utils.read_string_file(output_file_dir)
  expected_successful_final_status_str = """
  {
    "state":"SUCCEEDED",
    "error":{}
  }
  """

  expected_successful_final_status = (
      pipeline_spec_pb2.PipelineTaskFinalStatus())
  json_format.Parse(expected_successful_final_status_str,
                    expected_successful_final_status)

  actual_final_status = pipeline_spec_pb2.PipelineTaskFinalStatus()
  json_format.Parse(actual_final_status_str, actual_final_status)

  self.assertProtoPartiallyEquals(
      expected_successful_final_status,
      actual_final_status,
      ignored_fields=['pipeline_job_resource_name'])
def testRecordLatestKfpPipeline(self, mock_get_latest_executions):
  # Tests recording KFP pipeline outputs for the latest execution.
  with mock.patch.object(
      pipeline_recorder_utils, '_get_paths',
      return_value=self.paths) as mock_get_paths:
    pipeline_recorder_utils.record_pipeline(
        output_dir=self._base_dir,
        host=self.host,
        port=self.port,
        pipeline_name=self.pipeline_name)

    mock_get_paths.assert_called()
    mock_get_latest_executions.assert_called()

  files = fileio.listdir(self.dest_uri)
  self.assertLen(files, 1)
  self.assertEqual(
      io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
      self.content)
def testRecordKfpPipelineRunId(self):
  # Tests recording KFP pipeline outputs given a run_id.
  with mock.patch.object(
      pipeline_recorder_utils, '_get_execution_dict',
      return_value=self.execution_dict) as mock_get_execution_dict, \
      mock.patch.object(
          pipeline_recorder_utils, '_get_paths',
          return_value=self.paths) as mock_get_paths:
    pipeline_recorder_utils.record_pipeline(
        output_dir=self._base_dir,
        host=self.host,
        port=self.port,
        run_id=self.run_id)

    mock_get_execution_dict.assert_called()
    mock_get_paths.assert_called()

  # Verifying that test.txt has been copied from src_uri to dest_uri
  files = tf.io.gfile.listdir(self.dest_uri)
  self.assertLen(files, 1)
  self.assertEqual(
      io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
      self.content)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Transform a model with the provided function.

  ...

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_model: A list of type `standard_artifacts.Model`
      - pipeline_configuration: optional PipelineConfiguration artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output_model: A list of type `standard_artifacts.Model`
    exec_properties: A dict of execution properties, including:
      - function_name: The name of the function to apply on the model -
        identity function is used if not specified.
      - instance_name: Optional unique instance_name. Necessary iff multiple
        Hello components are declared in the same pipeline.

  Returns:
    None

  Raises:
    OSError and its subclasses
    ValueError
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  input_model = artifact_utils.get_single_instance(input_dict[INPUT_MODEL_KEY])
  output_model = artifact_utils.get_single_instance(
      output_dict[OUTPUT_MODEL_KEY])
  function_name = exec_properties.get(
      FUNCTION_NAME_KEY, 'tfx_x.components.model.transform.executor.identity')

  pipeline_configuration = {}
  if PIPELINE_CONFIGURATION_KEY in input_dict:
    pipeline_configuration_dir = artifact_utils.get_single_uri(
        input_dict[PIPELINE_CONFIGURATION_KEY])
    pipeline_configuration_file = os.path.join(pipeline_configuration_dir,
                                               'custom_config.json')
    pipeline_configuration_str = io_utils.read_string_file(
        pipeline_configuration_file)
    pipeline_configuration = json.loads(pipeline_configuration_str)

  # check if function_name can be found
  function_name_split = function_name.split('.')
  module_name = '.'.join(function_name_split[0:-1])
  module = importlib.import_module(module_name)
  fn = getattr(module, function_name_split[-1])

  if fn is None:
    raise ValueError('`function_name` not found')

  input_dir = artifact_utils.get_single_uri([input_model])
  output_dir = artifact_utils.get_single_uri([output_model])

  # load the model
  model = tf.keras.models.load_model(os.path.join(input_dir, 'Format-Serving'))

  # transform
  new_model, signatures, options = fn(model, pipeline_configuration)

  # save the transformed model (not the original input model)
  tf.saved_model.save(new_model, os.path.join(output_dir, 'Format-Serving'),
                      signatures, options)
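# A minimal sketch (not from the original codebase) of a transform function
# that the executor above could load via `function_name`. The contract implied
# by the call site is fn(model, pipeline_configuration) ->
# (new_model, signatures, options); this identity-style variant returns the
# model unchanged with no extra signatures or save options.
def example_identity_transform(model, pipeline_configuration):
  del pipeline_configuration  # Unused in this sketch.
  return model, None, None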
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Export a model with the provided function.

  ...

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model: A list of type `standard_artifacts.Model`
      - pipeline_configuration: optional PipelineConfiguration artifact.
      - model_blessing: optional model blessing artifact.
      - infra_blessing: optional infra blessing artifact.
      - pushed_model: optional pushed model artifact.
      - transform_graph: optional transform graph artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: model export artifact.
    exec_properties: A dict of execution properties, including:
      - function_name: The name of the function to apply on the model -
        noop function is used if not specified.
      - instance_name: Optional unique instance_name. Necessary iff multiple
        Hello components are declared in the same pipeline.

  Returns:
    None

  Raises:
    OSError and its subclasses
    ValueError
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if not self.CheckBlessing(input_dict):
    return

  model = artifact_utils.get_single_instance(input_dict[MODEL_KEY])
  output = artifact_utils.get_single_instance(output_dict[OUTPUT_KEY])

  model_push_artifact = None
  if standard_component_specs.PUSHED_MODEL_KEY in input_dict:
    model_push_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.PUSHED_MODEL_KEY])

  transform_graph_artifact = None
  if standard_component_specs.TRANSFORM_GRAPH_KEY in input_dict:
    transform_graph_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.TRANSFORM_GRAPH_KEY])

  function_name = exec_properties.get(
      FUNCTION_NAME_KEY, 'tfx_x.components.model.export.executor.noop')

  pipeline_configuration = {}
  if PIPELINE_CONFIGURATION_KEY in input_dict:
    pipeline_configuration_dir = artifact_utils.get_single_uri(
        input_dict[PIPELINE_CONFIGURATION_KEY])
    pipeline_configuration_file = os.path.join(pipeline_configuration_dir,
                                               'custom_config.json')
    pipeline_configuration_str = io_utils.read_string_file(
        pipeline_configuration_file)
    pipeline_configuration = json.loads(pipeline_configuration_str)

  # check if function_name can be found
  function_name_split = function_name.split('.')
  module_name = '.'.join(function_name_split[0:-1])
  module = importlib.import_module(module_name)
  fn = getattr(module, function_name_split[-1])

  if fn is None:
    raise ValueError('`function_name` not found')

  input_dir = artifact_utils.get_single_uri([model])
  output_dir = artifact_utils.get_single_uri([output])

  model_push_dir = None
  if model_push_artifact is not None:
    model_push_dir = artifact_utils.get_single_uri([model_push_artifact])

  # load the model
  model = tf.keras.models.load_model(os.path.join(input_dir, 'Format-Serving'))

  # export
  fn(model, pipeline_configuration, output_dir, model_push_dir,
     model_push_artifact, transform_graph_artifact)
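# A minimal sketch (not from the original codebase) of an export function
# matching the call site above: fn(model, pipeline_configuration, output_dir,
# model_push_dir, model_push_artifact, transform_graph_artifact). This no-op
# variant just re-saves the loaded model; writing into a 'Format-Serving'
# sub-directory mirrors the layout used when loading the model above and is
# an assumption here.
def example_noop_export(model, pipeline_configuration, output_dir,
                        model_push_dir, model_push_artifact,
                        transform_graph_artifact):
  del pipeline_configuration, model_push_dir, model_push_artifact
  del transform_graph_artifact  # Unused in this sketch.
  tf.saved_model.save(model, os.path.join(output_dir, 'Format-Serving'))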
def testOverrideRegisterExecution(self):
  # Mock all real operations of driver / executor / MLMD accesses.
  mock_targets = (  # (cls, method, return_value)
      (beam_executor_operator.BeamExecutorOperator, '__init__', None),
      (beam_executor_operator.BeamExecutorOperator, 'run_executor',
       execution_result_pb2.ExecutorOutput()),
      (python_driver_operator.PythonDriverOperator, '__init__', None),
      (python_driver_operator.PythonDriverOperator, 'run_driver',
       driver_output_pb2.DriverOutput()),
      (metadata.Metadata, '__init__', None),
      (metadata.Metadata, '__exit__', None),
      (launcher.Launcher, '_publish_successful_execution', None),
      (launcher.Launcher, '_clean_up_stateless_execution_info', None),
      (launcher.Launcher, '_clean_up_stateful_execution_info', None),
      (outputs_utils, 'OutputsResolver', mock.MagicMock()),
      (execution_lib, 'get_executions_associated_with_all_contexts', []),
      (container_entrypoint, '_dump_ui_metadata', None),
  )
  for cls, method, return_value in mock_targets:
    self.enter_context(
        mock.patch.object(
            cls, method, autospec=True, return_value=return_value))

  mock_mlmd = self.enter_context(
      mock.patch.object(metadata.Metadata, '__enter__',
                        autospec=True)).return_value
  mock_mlmd.store.return_value.get_executions_by_id.return_value = [
      metadata_store_pb2.Execution()
  ]

  self._set_required_env_vars({
      'WORKFLOW_ID': 'workflow-id-42',
      'METADATA_GRPC_SERVICE_HOST': 'metadata-grpc',
      'METADATA_GRPC_SERVICE_PORT': '8080',
      container_entrypoint._KFP_POD_NAME_ENV_KEY: 'test_pod_name'
  })

  mock_register_execution = self.enter_context(
      mock.patch.object(
          execution_publish_utils, 'register_execution', autospec=True))

  test_ir_file = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), 'testdata',
      'two_step_pipeline_post_dehydrate_ir.json')
  test_ir = io_utils.read_string_file(test_ir_file)

  argv = [
      '--pipeline_root',
      'dummy',
      '--kubeflow_metadata_config',
      json_format.MessageToJson(
          kubeflow_dag_runner.get_default_kubeflow_metadata_config()),
      '--tfx_ir',
      test_ir,
      '--node_id',
      'BigQueryExampleGen',
      '--runtime_parameter',
      'pipeline-run-id=STRING:my-run-id',
  ]
  container_entrypoint.main(argv)

  mock_register_execution.assert_called_once()
  kwargs = mock_register_execution.call_args[1]
  self.assertEqual(
      kwargs['exec_properties'][
          container_entrypoint._KFP_POD_NAME_PROPERTY_KEY], 'test_pod_name')
def testReadWriteString(self):
  file_path = os.path.join(self._base_dir, 'test_file')
  content = 'testing read/write'
  io_utils.write_string_file(file_path, content)
  read_content = io_utils.read_string_file(file_path)
  self.assertEqual(content, read_content)
def warmup(self, input_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, List[types.Artifact]],
           algorithm: str):
  # Perform warmup tuning if WARMUP_HYPERPARAMETERS given.
  hparams_warmup_config_list = None
  if input_dict.get(WARMUP_HYPERPARAMETERS):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[WARMUP_HYPERPARAMETERS]))
    hparams_warmup_config_list = json.loads(
        io_utils.read_string_file(hyperparameters_file))

  fn_args = fn_args_utils.get_common_fn_args(
      input_dict, exec_properties, working_dir=self._get_tmp_dir() + 'warmup')

  # TODO(nikhilmehta): Currently all algorithms need warmup_hyperparameters.
  # This may not be needed for other algorithms that can predict hyperparams.
  if not hparams_warmup_config_list:
    raise ValueError('Expected warmup_hyperparameters')

  logging.info('Algorithm: %s', algorithm)
  warmup_trials = 0
  if algorithm == 'majority_voting':
    warmup_trials = DEFAULT_WARMUP_TRIALS
    fn_args.custom_config[
        WARMUP_HYPERPARAMETERS] = hparams_warmup_config_list[0]
  elif algorithm == 'nearest_neighbor':
    warmup_trials = DEFAULT_WARMUP_TRIALS

    if input_dict.get('metamodel'):
      metamodel_path = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict['metamodel']))
      logging.info('Meta model path: %s', metamodel_path)
      metamodel = _load_keras_model(metamodel_path)
    else:
      raise ValueError(
          f'Tuner for metalearning_algorithm={algorithm} expects metamodel.')

    if input_dict.get('metafeature'):
      metafeature_path = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict['metafeature']))
      logging.info('Metafeature: %s', metafeature_path)
      metafeature = json.loads(io_utils.read_string_file(metafeature_path))
      metafeature = metafeature['metafeature']
    else:
      raise ValueError(
          f'Tuner for metalearning_algorithm={algorithm} expects metafeature.')

    metafeature = np.array(metafeature, dtype=np.float32)
    metafeature = np.expand_dims(metafeature, axis=0)
    logits = metamodel(metafeature).numpy()[0]
    nearest_configs = [
        hparams_warmup_config_list[ix]
        for ix in np.argsort(logits)[-DEFAULT_K:]
    ]
    nearest_hparam_config = _merge_hparam_configs(nearest_configs)
    fn_args.custom_config[WARMUP_HYPERPARAMETERS] = nearest_hparam_config
  else:
    raise NotImplementedError(
        f'Tuning for metalearning_algorithm={algorithm} is not implemented.')

  # kerastuner doesn't support grid search, setting max_trials large enough.
  # Track issue: https://github.com/keras-team/keras-tuner/issues/340
  fn_args.custom_config['max_trials'] = warmup_trials

  tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
  warmtuner_fn_result = tuner_fn(fn_args)
  warmup_tuner = self.search(warmtuner_fn_result)

  return warmup_tuner, warmup_trials
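# A tiny illustration (with made-up numbers) of the nearest-neighbor selection
# step above: np.argsort(logits)[-DEFAULT_K:] keeps the indices of the K
# largest logits, i.e. the warmup configs of the K most similar training
# datasets.
def _example_top_k_indices():
  logits = np.array([0.1, 0.7, 0.3, 0.9])
  # argsort gives [0, 2, 1, 3]; the last two entries are the top-2 indices.
  return np.argsort(logits)[-2:]  # -> array([1, 3])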
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs stratified sampling on given input examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - pipeline_configuration: optional PipelineConfiguration artifact.
    output_dict: Output dict from output key to a list of Artifacts.
      - stratified_examples: the stratified examples.
    exec_properties: A dict of execution properties.
      - splits_to_transform: list of splits to transform.
      - splits_to_copy: list of splits to copy as is.
      - to_key_fn: the function that will extract the key - must be
        'to_key: Example -> key'.
      - to_key_fn_key: alternate name for the key containing the definition
        of `to_key()`.
      - samples_per_key: the number of samples per key.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  examples = input_dict[EXAMPLES_KEY]

  # Priority is as follows:
  # 1. default value
  # 2. from PipelineConfiguration
  # 3. from exec_properties
  splits_to_transform = []
  samples_per_key = None
  to_key_fn = None
  to_key_fn_key = exec_properties[
      TO_KEY_FN_KEY_KEY] if TO_KEY_FN_KEY_KEY in exec_properties else TO_KEY_FN_KEY
  splits_to_copy = artifact_utils.decode_split_names(
      artifact_utils.get_single_instance(examples).split_names)

  if PIPELINE_CONFIGURATION_KEY in input_dict:
    pipeline_configuration_dir = artifact_utils.get_single_uri(
        input_dict[PIPELINE_CONFIGURATION_KEY])
    pipeline_configuration_file = os.path.join(pipeline_configuration_dir,
                                               'custom_config.json')
    pipeline_configuration_str = io_utils.read_string_file(
        pipeline_configuration_file)
    pipeline_configuration = json.loads(pipeline_configuration_str)
    if SPLITS_TO_TRANSFORM_KEY in pipeline_configuration:
      splits_to_transform = pipeline_configuration[SPLITS_TO_TRANSFORM_KEY]
    else:
      splits_to_transform = []
    if SPLITS_TO_COPY_KEY in pipeline_configuration:
      splits_to_copy = pipeline_configuration[SPLITS_TO_COPY_KEY]
    if to_key_fn_key in pipeline_configuration:
      to_key_fn = pipeline_configuration[to_key_fn_key]
    if SAMPLES_PER_KEY_KEY in pipeline_configuration:
      samples_per_key = pipeline_configuration[SAMPLES_PER_KEY_KEY]

  # Now looking at the exec_properties
  if SPLITS_TO_TRANSFORM_KEY in exec_properties and exec_properties[
      SPLITS_TO_TRANSFORM_KEY] is not None:
    splits_to_transform = json_utils.loads(
        exec_properties[SPLITS_TO_TRANSFORM_KEY])
  if SPLITS_TO_COPY_KEY in exec_properties and exec_properties[
      SPLITS_TO_COPY_KEY] is not None:
    splits_to_copy = json_utils.loads(exec_properties[SPLITS_TO_COPY_KEY])
  if TO_KEY_FN_KEY in exec_properties and exec_properties[
      TO_KEY_FN_KEY] is not None:
    to_key_fn = exec_properties[TO_KEY_FN_KEY]
  if to_key_fn_key in exec_properties and exec_properties[
      to_key_fn_key] is not None:
    to_key_fn = exec_properties[to_key_fn_key]
  if SAMPLES_PER_KEY_KEY in exec_properties and exec_properties[
      SAMPLES_PER_KEY_KEY] is not None:
    samples_per_key = exec_properties[SAMPLES_PER_KEY_KEY]

  # Validate we have all we need
  if to_key_fn is None:
    raise ValueError('\'to_key_fn\' is missing in exec dict.')
  if samples_per_key is None:
    raise ValueError('\'samples_per_key\' is missing in exec dict.')
  if EXAMPLES_KEY not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if STRATIFIED_EXAMPLES_KEY not in output_dict:
    raise ValueError('\'stratified_examples\' is missing in output dict.')

  output_artifact = artifact_utils.get_single_instance(
      output_dict[STRATIFIED_EXAMPLES_KEY])
  output_artifact.split_names = artifact_utils.encode_split_names(
      splits_to_transform + splits_to_copy)

  example_uris = {}
  for split in splits_to_transform:
    data_uri = artifact_utils.get_split_uri(examples, split)
    example_uris[split] = data_uri

  # Copy over the splits we don't want to transform ('splits_to_copy').
  utils.copy_over(examples, output_artifact, splits_to_copy)

  self._run_sampling(
      example_uris,
      output_artifact=output_artifact,
      samples_per_key=samples_per_key,
      to_key_fn=to_key_fn)

  logging.info('StratifiedSampler generates stratified examples to %s',
               output_artifact.uri)
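# A hypothetical value for the 'to_key_fn' execution property described in the
# docstring above: a string holding the definition of `to_key(example)`. The
# feature name 'label' is made up; any feature of the tf.train.Example could
# serve as the stratification key.
_EXAMPLE_TO_KEY_FN = """
def to_key(example):
  return example.features.feature['label'].int64_list.value[0]
"""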
def Do(self, input_dict: Dict[str, List[Artifact]],
       output_dict: Dict[str, List[Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Recommends a tuner config.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - meta_train_features_N: MetaFeatures for the Nth train dataset.
      - hparams_train_N: HParams for the Nth train dataset.
      The maximum value of `N` is _MAX_INPUTS.
    output_dict: Output dict from key to a list of artifacts.
    exec_properties: A dict of execution properties.

  Raises:
    NotImplementedError: If the algorithm is not supported.
  """
  algorithm = exec_properties['algorithm']

  metafeatures_list = []
  # This should be agnostic to meta-feature type.
  for ix in range(MAX_INPUTS):
    metafeature_key = f'meta_train_features_{ix}'
    if metafeature_key in input_dict:
      metafeature_uri = os.path.join(
          artifact_utils.get_single_uri(input_dict[metafeature_key]),
          artifacts.MetaFeatures.DEFAULT_FILE_NAME)
      logging.info('Found %s at %s.', metafeature_key, metafeature_uri)
      metafeatures = json.loads(io_utils.read_string_file(metafeature_uri))
      metafeatures_list.append(metafeatures['metafeature'])

  all_hparams = []
  for ix in range(MAX_INPUTS):
    hparam_key = f'hparams_train_{ix}'
    if hparam_key in input_dict:
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict[hparam_key]))
      logging.info('Found %s at %s.', hparam_key, hyperparameters_file)
      hparams_json = json.loads(
          io_utils.read_string_file(hyperparameters_file))
      all_hparams.append(hparams_json['values'])

  if algorithm == MAJORITY_VOTING:
    discrete_search_space = self._create_search_space_using_voting(all_hparams)
    hparams_config_list = [discrete_search_space.get_config()]
  elif algorithm == NEAREST_NEIGHBOR:
    # Build the nearest_neighbor model.
    output_path = artifact_utils.get_single_uri(output_dict[OUTPUT_MODEL])
    serving_model_dir = path_utils.serving_model_dir(output_path)

    model = self._create_knn_model_from_metafeatures(metafeatures_list)
    # TODO(nikhilmehta): Consider adding signature here.
    model.save(serving_model_dir)

    # Collect all candidate HParams.
    hparams_list = self._convert_to_kerastuner_hyperparameters(all_hparams)
    hparams_config_list = [hparam.get_config() for hparam in hparams_list]
  else:
    raise NotImplementedError(
        f'The algorithm "{algorithm}" is not supported.')

  meta_hparams_path = os.path.join(
      artifact_utils.get_single_uri(output_dict[OUTPUT_HYPERPARAMS]),
      _DEFAULT_FILE_NAME)
  io_utils.write_string_file(meta_hparams_path,
                             json.dumps(hparams_config_list))
  logging.info('Meta HParams saved at %s', meta_hparams_path)