Example #1
    def testCustomStubExecutor(self, mock_publisher):
        # verify whether custom stub executor substitution works
        mock_publisher.return_value.publish_execution.return_value = {}

        component_map = {
            '_FakeComponent.FakeComponent': CustomStubExecutor
        }

        my_stub_launcher = stub_component_launcher.get_stub_launcher_class(
            test_data_dir=self.record_dir,
            stubbed_component_ids=[],
            stubbed_component_map=component_map)

        launcher = my_stub_launcher.create(
            component=self.component,
            pipeline_info=self.pipeline_info,
            driver_args=self.driver_args,
            metadata_connection=self.metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        launcher.launch()

        output_path = self.component.outputs['output'].get()[0].uri
        generated_file = os.path.join(output_path, 'result.txt')
        self.assertTrue(tf.io.gfile.exists(generated_file))
        contents = io_utils.read_string_file(generated_file)
        self.assertEqual('custom component', contents)
Example #2
    def testStubExecutor(self, mock_publisher):
        # verify whether base stub executor substitution works
        mock_publisher.return_value.publish_execution.return_value = {}

        record_file = os.path.join(self.record_dir, 'output', 'recorded.txt')
        io_utils.write_string_file(record_file, 'hello world')
        component_ids = ['_FakeComponent.FakeComponent']

        my_stub_launcher = stub_component_launcher.get_stub_launcher_class(
            test_data_dir=self.record_dir,
            stubbed_component_ids=component_ids,
            stubbed_component_map={})

        launcher = my_stub_launcher.create(
            component=self.component,
            pipeline_info=self.pipeline_info,
            driver_args=self.driver_args,
            metadata_connection=self.metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        launcher.launch()

        output_path = self.component.outputs['output'].get()[0].uri
        copied_file = os.path.join(output_path, 'recorded.txt')
        self.assertTrue(tf.io.gfile.exists(copied_file))
        contents = io_utils.read_string_file(copied_file)
        self.assertEqual('hello world', contents)
Example #3
    def testRecordBeamPipelineRunId(self, mock_metadata, mock_config):
        # Tests recording Beam pipeline outputs given a run_id.
        with mock.patch.object(pipeline_recorder_utils, '_get_execution_dict',
                               return_value=self.execution_dict
                               ) as mock_get_execution_dict,\
            mock.patch.object(pipeline_recorder_utils, '_get_paths',
                              return_value=self.paths
                              ) as mock_get_paths:
            pipeline_recorder_utils.record_pipeline(
                output_dir=self._base_dir,
                metadata_db_uri=self.metadata_db_uri,
                run_id=self.run_id)

            mock_config.assert_called_with(self.metadata_db_uri)
            mock_metadata.assert_called()
            mock_get_execution_dict.assert_called()
            mock_get_paths.assert_called()

            # Verifying that test.txt has been copied from src_uri to dest_uri
            files = fileio.listdir(self.dest_uri)
            self.assertLen(files, 1)
            self.assertEqual(
                io_utils.read_string_file(os.path.join(self.dest_uri,
                                                       files[0])),
                self.content)
Example #4
    def testExecutor(self, mock_publisher):
        # verify whether original executors can run
        mock_publisher.return_value.publish_execution.return_value = {}

        io_utils.write_string_file(os.path.join(self.input_dir, 'result.txt'),
                                   'test')

        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self.record_dir,
            test_component_ids=[self.component.id])

        launcher = stub_component_launcher.StubComponentLauncher.create(
            component=self.component,
            pipeline_info=self.pipeline_info,
            driver_args=self.driver_args,
            metadata_connection=self.metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type,  # pylint: disable=protected-access
            '.'.join([
                test_utils._FakeComponent.__module__,  # pylint: disable=protected-access
                test_utils._FakeComponent.__name__  # pylint: disable=protected-access
            ]))
        launcher.launch()

        output_path = self.component.outputs[self.output_key].get()[0].uri
        self.assertTrue(fileio.exists(output_path))
        contents = io_utils.read_string_file(output_path)
        self.assertEqual('test', contents)
Example #5
    def testRecordLatestBeamPipeline(self, mock_get_latest_executions,
                                     mock_metadata, mock_config):
        # Tests recording Beam pipeline outputs for the latest execution.
        with mock.patch.object(pipeline_recorder_utils,
                               '_get_paths',
                               return_value=self.paths) as mock_get_paths:
            pipeline_recorder_utils.record_pipeline(
                output_dir=self._base_dir,
                metadata_db_uri=self.metadata_db_uri,
                host=None,
                port=None,
                pipeline_name=self.pipeline_name,
                run_id=None)

            mock_config.assert_called_with(self.metadata_db_uri)
            mock_metadata.assert_called()
            mock_get_paths.assert_called()
            mock_get_latest_executions.assert_called()

            # Verifying that test.txt has been copied from src_uri to dest_uri
            files = tf.io.gfile.listdir(self.dest_uri)
            self.assertLen(files, 1)
            self.assertEqual(
                io_utils.read_string_file(os.path.join(self.dest_uri,
                                                       files[0])),
                self.content)
Example #6
    def testStubExecutor(self, mock_publisher):
        # verify whether base stub executor substitution works
        mock_publisher.return_value.publish_execution.return_value = {}

        record_file = os.path.join(self.record_dir, self.component.id,
                                   self.output_key, '0', 'recorded.txt')
        io_utils.write_string_file(record_file, 'hello world')

        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self.record_dir, test_component_ids=[])

        launcher = stub_component_launcher.StubComponentLauncher.create(
            component=self.component,
            pipeline_info=self.pipeline_info,
            driver_args=self.driver_args,
            metadata_connection=self.metadata_connection,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        launcher.launch()

        output_path = self.component.outputs[self.output_key].get()[0].uri
        copied_file = os.path.join(output_path, 'recorded.txt')
        self.assertTrue(fileio.exists(copied_file))
        contents = io_utils.read_string_file(copied_file)
        self.assertEqual('hello world', contents)
Example #7
def copy_and_change_pipeline_name(orig_path: str, new_path: str,
                                  origin_pipeline_name: str,
                                  new_pipeline_name: str) -> None:
    """Copy pipeline file to new path with pipeline name changed."""
    contents = io_utils.read_string_file(orig_path)
    assert contents.count(origin_pipeline_name
                          ) == 1, 'DSL file can only contain one pipeline name'
    contents = contents.replace(origin_pipeline_name, new_pipeline_name)
    io_utils.write_string_file(new_path, contents)
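
A minimal usage sketch; the file paths and pipeline names below are hypothetical:

# Clone a pipeline DSL file, renaming its single pipeline-name occurrence.
copy_and_change_pipeline_name(
    orig_path='/tmp/original_pipeline.py',
    new_path='/tmp/renamed_pipeline.py',
    origin_pipeline_name='original_pipeline',
    new_pipeline_name='renamed_pipeline')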
Example #8
  def _verify_metafeature_gen_outputs(self):
    self.assertNotEmpty(tf.io.gfile.listdir(self._metafeatures.uri))
    metafeature_path = os.path.join(self._metafeatures.uri,
                                    artifacts.MetaFeatures.DEFAULT_FILE_NAME)
    metafeature = json.loads(io_utils.read_string_file(metafeature_path))
    self.assertEqual(metafeature['num_examples'], 3)
    self.assertEqual(metafeature['num_int_features'], 1)
    self.assertEqual(metafeature['num_float_features'], 1)
    self.assertEqual(metafeature['num_categorical_features'], 2)
Example #9
    def _verify_hparams_outputs(self, algorithm: str):

        path = os.path.join(self._hparams_out.uri, 'meta_hyperparameters.txt')
        self.assertTrue(tf.io.gfile.exists(path))
        hparams_json_list = json.loads(io_utils.read_string_file(path))

        if algorithm == executor.MAJORITY_VOTING:
            self._verify_hparams_values_majority_voting(hparams_json_list)
        elif algorithm == executor.NEAREST_NEIGHBOR:
            self._verify_hparams_values_nearest_neighbor(hparams_json_list)
Example #10
  def test_create_search_space_using_voting(self):

    metadata_indices = [1, 2, 3]
    all_hparams = []
    for dataset_id in metadata_indices:
      hyperparameters_file = os.path.join(self._input_data_dir,
                                          f'Tuner.train_mockdata_{dataset_id}',
                                          'best_hyperparameters',
                                          'best_hyperparameters.txt')
      hparams_json = json.loads(io_utils.read_string_file(hyperparameters_file))
      all_hparams.append(hparams_json['values'])
    ex = executor.MetaLearnerExecutor()
    search_space = ex._create_search_space_using_voting(all_hparams)
    self._verify_hparams_values_majority_voting([search_space.get_config()])
Example #11
  def test_create_knn_model_from_metafeatures(self):

    metadata_indices = [1, 2, 3]
    metafeatures_list = []
    for dataset_id in metadata_indices:
      metafeature_uri = os.path.join(
          self._input_data_dir, f'MetaFeatureGen.train_mockdata_{dataset_id}',
          'metafeatures', 'metafeatures.txt')
      metafeatures = json.loads(io_utils.read_string_file(metafeature_uri))
      metafeatures_list.append(metafeatures['metafeature'])

    ex = executor.MetaLearnerExecutor()
    model = ex._create_knn_model_from_metafeatures(metafeatures_list)
    model_weights = model.get_layer('metafeatures').get_weights()
    self._verify_model_weights(model_weights[0])
Example #12
    def testExitHandlerPipelineSuccess(self):
        """End-to-End test for a successful pipeline with exit handler."""
        pipeline_name = 'kubeflow-v2-exit-handler-test-{}'.format(
            orchestration_test_utils.random_id())

        components = test_utils.simple_pipeline_components(_TEST_DATA_ROOT)

        beam_pipeline_args = [
            '--temp_location=' + os.path.join(
                self._pipeline_root(pipeline_name), 'dataflow', 'temp'),
            '--project={}'.format(self._GCP_PROJECT_ID)
        ]

        pipeline = self._create_pipeline(pipeline_name, components,
                                         beam_pipeline_args)

        output_file_dir = os.path.join(self._pipeline_root(pipeline_name),
                                       _success_file_name)

        exit_handler = custom_exit_handler.test_exit_handler(
            final_status=tfx.orchestration.experimental.FinalStatusStr(),
            file_dir=output_file_dir)

        self._run_pipeline(pipeline=pipeline, exit_handler=exit_handler)

        # verify execution results
        actual_final_status_str = io_utils.read_string_file(output_file_dir)
        expected_successful_final_status_str = """
      {
        "state":"SUCCEEDED",
        "error":{}
      }
    """

        expected_successful_final_status = (
            pipeline_spec_pb2.PipelineTaskFinalStatus())
        json_format.Parse(expected_successful_final_status_str,
                          expected_successful_final_status)

        actual_final_status = pipeline_spec_pb2.PipelineTaskFinalStatus()
        json_format.Parse(actual_final_status_str, actual_final_status)

        self.assertProtoPartiallyEquals(
            expected_successful_final_status,
            actual_final_status,
            ignored_fields=['pipeline_job_resource_name'])
Example #13
  def testRecordLatestKfpPipeline(self, mock_get_latest_executions):
    # Tests recording KFP pipeline outputs for the latest execution.
    with mock.patch.object(
        pipeline_recorder_utils, '_get_paths',
        return_value=self.paths) as mock_get_paths:
      pipeline_recorder_utils.record_pipeline(
          output_dir=self._base_dir,
          host=self.host,
          port=self.port,
          pipeline_name=self.pipeline_name)
      mock_get_paths.assert_called()
      mock_get_latest_executions.assert_called()

      files = fileio.listdir(self.dest_uri)
      self.assertLen(files, 1)
      self.assertEqual(
          io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
          self.content)
Example #14
    def testRecordKfpPipelineRunId(self):
        # Tests recording KFP pipeline outputs given a run_id.
        with mock.patch.object(pipeline_recorder_utils, '_get_execution_dict',
                               return_value=self.execution_dict
                               ) as mock_get_execution_dict,\
            mock.patch.object(pipeline_recorder_utils, '_get_paths',
                              return_value=self.paths) as mock_get_paths:
            pipeline_recorder_utils.record_pipeline(output_dir=self._base_dir,
                                                    host=self.host,
                                                    port=self.port,
                                                    run_id=self.run_id)

            mock_get_execution_dict.assert_called()
            mock_get_paths.assert_called()

            # Verifying that test.txt has been copied from src_uri to dest_uri
            files = tf.io.gfile.listdir(self.dest_uri)
            self.assertLen(files, 1)
            self.assertEqual(
                io_utils.read_string_file(os.path.join(self.dest_uri,
                                                       files[0])),
                self.content)
Example #15
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Transform a model with the provided function.

    ...

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_model: A list of type `standard_artifacts.Model`
        - pipeline_configuration: optional PipelineConfiguration artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output_model: A list of type `standard_artifacts.Model`
      exec_properties: A dict of execution properties, including:
        - function_name: The name of the function to apply on the model;
          the identity function is used if not specified.
        - instance_name: Optional unique instance_name. Necessary iff multiple
          Hello components are declared in the same pipeline.

    Returns:
      None

    Raises:
      OSError and its subclasses
      ValueError
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        input_model = artifact_utils.get_single_instance(
            input_dict[INPUT_MODEL_KEY])
        output_model = artifact_utils.get_single_instance(
            output_dict[OUTPUT_MODEL_KEY])
        function_name = exec_properties.get(
            FUNCTION_NAME_KEY,
            'tfx_x.components.model.transform.executor.identity')

        pipeline_configuration = {}
        if PIPELINE_CONFIGURATION_KEY in input_dict:
            pipeline_configuration_dir = artifact_utils.get_single_uri(
                input_dict[PIPELINE_CONFIGURATION_KEY])
            pipeline_configuration_file = os.path.join(
                pipeline_configuration_dir, 'custom_config.json')
            pipeline_configuration_str = io_utils.read_string_file(
                pipeline_configuration_file)
            pipeline_configuration = json.loads(pipeline_configuration_str)

        # check if function_name can be found
        function_name_split = function_name.split('.')
        module_name = '.'.join(function_name_split[0:-1])
        module = importlib.import_module(module_name)

        fn = getattr(module, function_name_split[-1], None)

        if fn is None:
            raise ValueError('`function_name` not found')

        input_dir = artifact_utils.get_single_uri([input_model])
        output_dir = artifact_utils.get_single_uri([output_model])

        # load the model
        model = tf.keras.models.load_model(
            os.path.join(input_dir, 'Format-Serving'))

        # transform
        new_model, signatures, options = fn(model, pipeline_configuration)

        # save the transformed model
        tf.saved_model.save(new_model,
                            os.path.join(output_dir, 'Format-Serving'),
                            signatures, options)
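
The transform function resolved from `function_name` receives the loaded Keras model and the pipeline configuration, and must return the `(new_model, signatures, options)` tuple that the executor unpacks and passes to `tf.saved_model.save`. A minimal sketch of such a function, assuming identity-style behavior consistent with the default `...executor.identity` name:

import tensorflow as tf

def identity(model: tf.keras.Model, pipeline_configuration: dict):
    # Return the model unchanged, with no extra signatures or save options
    # (tf.saved_model.save accepts None for both).
    return model, None, None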
Example #16
  def Do(self,
         input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Export a model with the provided function.

    ...

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model: A list of type `standard_artifacts.Model`
        - pipeline_configuration: optional PipelineConfiguration artifact.
        - model_blessing: optional model blessing artifact.
        - infra_blessing: optional infra blessing artifact.
        - pushed_model: optional pushed model artifact.
        - transform_graph: optional transform graph artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: model export artifact.
      exec_properties: A dict of execution properties, including:
        - function_name: The name of the function to apply on the model;
          a noop function is used if not specified.
        - instance_name: Optional unique instance_name. Necessary iff multiple
          Hello components are declared in the same pipeline.

    Returns:
      None

    Raises:
      OSError and its subclasses
      ValueError
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    if not self.CheckBlessing(input_dict):
      return

    model = artifact_utils.get_single_instance(
      input_dict[MODEL_KEY])

    output = artifact_utils.get_single_instance(
      output_dict[OUTPUT_KEY])

    model_push_artifact = None
    if standard_component_specs.PUSHED_MODEL_KEY in input_dict:
      model_push_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.PUSHED_MODEL_KEY])

    transform_graph_artifact = None
    if standard_component_specs.TRANSFORM_GRAPH_KEY in input_dict:
      transform_graph_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.TRANSFORM_GRAPH_KEY])

    function_name = exec_properties.get(
        FUNCTION_NAME_KEY, 'tfx_x.components.model.export.executor.noop')

    pipeline_configuration = {}
    if PIPELINE_CONFIGURATION_KEY in input_dict:
      pipeline_configuration_dir = artifact_utils.get_single_uri(
          input_dict[PIPELINE_CONFIGURATION_KEY])
      pipeline_configuration_file = os.path.join(
          pipeline_configuration_dir, 'custom_config.json')
      pipeline_configuration_str = io_utils.read_string_file(
          pipeline_configuration_file)
      pipeline_configuration = json.loads(pipeline_configuration_str)

    # check if function_name can be found
    function_name_split = function_name.split('.')
    module_name = '.'.join(function_name_split[0:-1])
    module = importlib.import_module(module_name)

    fn = getattr(module, function_name_split[-1], None)

    if fn is None:
      raise ValueError('`function_name` not found')

    input_dir = artifact_utils.get_single_uri([model])
    output_dir = artifact_utils.get_single_uri([output])

    model_push_dir = None
    if model_push_artifact is not None:
      model_push_dir = artifact_utils.get_single_uri([model_push_artifact])

    # load the model
    model = tf.keras.models.load_model(
        os.path.join(input_dir, 'Format-Serving'))

    # export
    fn(model, pipeline_configuration, output_dir, model_push_dir,
       model_push_artifact, transform_graph_artifact)
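
The export executor delegates to a user function with the six-argument signature used in the final call above. A sketch matching that signature; the body is illustrative and not the library's actual `noop`:

import os
import tensorflow as tf

def export_fn(model, pipeline_configuration, output_dir, model_push_dir,
              model_push_artifact, transform_graph_artifact):
    # Illustrative body: re-export the loaded model into the output artifact.
    tf.saved_model.save(model, os.path.join(output_dir, 'Format-Serving'))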
Example #17
    def testOverrideRegisterExecution(self):
        # Mock all real operations of driver / executor / MLMD accesses.
        mock_targets = (  # (cls, method, return_value)
            (beam_executor_operator.BeamExecutorOperator, '__init__', None),
            (beam_executor_operator.BeamExecutorOperator, 'run_executor',
             execution_result_pb2.ExecutorOutput()),
            (python_driver_operator.PythonDriverOperator, '__init__', None),
            (python_driver_operator.PythonDriverOperator, 'run_driver',
             driver_output_pb2.DriverOutput()),
            (metadata.Metadata, '__init__', None),
            (metadata.Metadata, '__exit__', None),
            (launcher.Launcher, '_publish_successful_execution', None),
            (launcher.Launcher, '_clean_up_stateless_execution_info', None),
            (launcher.Launcher, '_clean_up_stateful_execution_info', None),
            (outputs_utils, 'OutputsResolver', mock.MagicMock()),
            (execution_lib, 'get_executions_associated_with_all_contexts', []),
            (container_entrypoint, '_dump_ui_metadata', None),
        )
        for cls, method, return_value in mock_targets:
            self.enter_context(
                mock.patch.object(cls,
                                  method,
                                  autospec=True,
                                  return_value=return_value))

        mock_mlmd = self.enter_context(
            mock.patch.object(metadata.Metadata, '__enter__',
                              autospec=True)).return_value
        mock_mlmd.store.return_value.get_executions_by_id.return_value = [
            metadata_store_pb2.Execution()
        ]

        self._set_required_env_vars({
            'WORKFLOW_ID': 'workflow-id-42',
            'METADATA_GRPC_SERVICE_HOST': 'metadata-grpc',
            'METADATA_GRPC_SERVICE_PORT': '8080',
            container_entrypoint._KFP_POD_NAME_ENV_KEY: 'test_pod_name'
        })

        mock_register_execution = self.enter_context(
            mock.patch.object(execution_publish_utils,
                              'register_execution',
                              autospec=True))

        test_ir_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'testdata',
            'two_step_pipeline_post_dehydrate_ir.json')
        test_ir = io_utils.read_string_file(test_ir_file)

        argv = [
            '--pipeline_root',
            'dummy',
            '--kubeflow_metadata_config',
            json_format.MessageToJson(
                kubeflow_dag_runner.get_default_kubeflow_metadata_config()),
            '--tfx_ir',
            test_ir,
            '--node_id',
            'BigQueryExampleGen',
            '--runtime_parameter',
            'pipeline-run-id=STRING:my-run-id',
        ]
        container_entrypoint.main(argv)

        mock_register_execution.assert_called_once()
        kwargs = mock_register_execution.call_args[1]
        self.assertEqual(
            kwargs['exec_properties']
            [container_entrypoint._KFP_POD_NAME_PROPERTY_KEY], 'test_pod_name')
Example #18
  def testReadWriteString(self):
    file_path = os.path.join(self._base_dir, 'test_file')
    content = 'testing read/write'
    io_utils.write_string_file(file_path, content)
    read_content = io_utils.read_string_file(file_path)
    self.assertEqual(content, read_content)
Example #19
    def warmup(self, input_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any], algorithm: str):

        # Perform warmup tuning if WARMUP_HYPERPARAMETERS given.
        hparams_warmup_config_list = None
        if input_dict.get(WARMUP_HYPERPARAMETERS):
            hyperparameters_file = io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[WARMUP_HYPERPARAMETERS]))
            hparams_warmup_config_list = json.loads(
                io_utils.read_string_file(hyperparameters_file))

        fn_args = fn_args_utils.get_common_fn_args(
            input_dict,
            exec_properties,
            working_dir=self._get_tmp_dir() + 'warmup')

        # TODO(nikhilmehta): Currently all algorithms need warmup_hyperparameters.
        # This may not be needed for other algorithms that can predict hyperparams.
        if not hparams_warmup_config_list:
            raise ValueError('Expected warmup_hyperparameters')

        logging.info('Algorithm: %s', algorithm)
        warmup_trials = 0
        if algorithm == 'majority_voting':
            warmup_trials = DEFAULT_WARMUP_TRIALS
            fn_args.custom_config[
                WARMUP_HYPERPARAMETERS] = hparams_warmup_config_list[0]
        elif algorithm == 'nearest_neighbor':
            warmup_trials = DEFAULT_WARMUP_TRIALS

            if input_dict.get('metamodel'):
                metamodel_path = io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict['metamodel']))
                logging.info('Meta model path: %s', metamodel_path)
                metamodel = _load_keras_model(metamodel_path)
            else:
                raise ValueError(
                    f'Tuner for metalearning_algorithm={algorithm} expects metamodel.'
                )

            if input_dict.get('metafeature'):
                metafeature_path = io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict['metafeature']))
                logging.info('Metafeature: %s', metafeature_path)
                metafeature = json.loads(
                    io_utils.read_string_file(metafeature_path))
                metafeature = metafeature['metafeature']
            else:
                raise ValueError(
                    f'Tuner for metalearning_algorithm={algorithm} expects metafeature.'
                )

            metafeature = np.array(metafeature, dtype=np.float32)
            metafeature = np.expand_dims(metafeature, axis=0)
            logits = metamodel(metafeature).numpy()[0]
            nearest_configs = [
                hparams_warmup_config_list[ix]
                for ix in np.argsort(logits)[-DEFAULT_K:]
            ]
            nearest_hparam_config = _merge_hparam_configs(nearest_configs)
            fn_args.custom_config[
                WARMUP_HYPERPARAMETERS] = nearest_hparam_config
        else:
            raise NotImplementedError(
                f'Tuning for metalearning_algorithm={algorithm} is not implemented.'
            )

        # kerastuner doesn't support grid search, setting max_trials large enough.
        # Track issue: https://github.com/keras-team/keras-tuner/issues/340
        fn_args.custom_config['max_trials'] = warmup_trials
        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        warmtuner_fn_result = tuner_fn(fn_args)
        warmup_tuner = self.search(warmtuner_fn_result)

        return warmup_tuner, warmup_trials
Example #20
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs stratified sampling on given input examples.
    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples for inference.
        - pipeline_configuration: optional PipelineConfiguration artifact.
      output_dict: Output dict from output key to a list of Artifacts.
        - stratified_examples: the stratified examples.
      exec_properties: A dict of execution properties.
        - splits_to_transform: list of splits to transform.
        - splits_to_copy: list of splits to copy as is.
      - to_key_fn: the function that will extract the key; must be
        `to_key: Example -> key`.
      - to_key_fn_key: alternate name for the key containing the definition
        of `to_key()`.
      - samples_per_key: the number of samples per key.
    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        examples = input_dict[EXAMPLES_KEY]

        # Precedence, from lowest to highest:
        # 1. default values
        # 2. values from the PipelineConfiguration
        # 3. values from exec_properties

        splits_to_transform = []
        samples_per_key = None
        to_key_fn = None
        to_key_fn_key = exec_properties.get(TO_KEY_FN_KEY_KEY, TO_KEY_FN_KEY)

        splits_to_copy = artifact_utils.decode_split_names(
            artifact_utils.get_single_instance(examples).split_names)

        if PIPELINE_CONFIGURATION_KEY in input_dict:
            pipeline_configuration_dir = artifact_utils.get_single_uri(
                input_dict[PIPELINE_CONFIGURATION_KEY])
            pipeline_configuration_file = os.path.join(
                pipeline_configuration_dir, 'custom_config.json')
            pipeline_configuration_str = io_utils.read_string_file(
                pipeline_configuration_file)
            pipeline_configuration = json.loads(pipeline_configuration_str)

            if SPLITS_TO_TRANSFORM_KEY in pipeline_configuration:
                splits_to_transform = pipeline_configuration[
                    SPLITS_TO_TRANSFORM_KEY]
            else:
                splits_to_transform = []

            if SPLITS_TO_COPY_KEY in pipeline_configuration:
                splits_to_copy = pipeline_configuration[SPLITS_TO_COPY_KEY]

            if to_key_fn_key in pipeline_configuration:
                to_key_fn = pipeline_configuration[to_key_fn_key]

            if SAMPLES_PER_KEY_KEY in pipeline_configuration:
                samples_per_key = pipeline_configuration[SAMPLES_PER_KEY_KEY]

        # Now looking at the exec_properties
        if SPLITS_TO_TRANSFORM_KEY in exec_properties and exec_properties[
                SPLITS_TO_TRANSFORM_KEY] is not None:
            splits_to_transform = json_utils.loads(
                exec_properties[SPLITS_TO_TRANSFORM_KEY])

        if SPLITS_TO_COPY_KEY in exec_properties and exec_properties[
                SPLITS_TO_COPY_KEY] is not None:
            splits_to_copy = json_utils.loads(
                exec_properties[SPLITS_TO_COPY_KEY])

        if TO_KEY_FN_KEY in exec_properties and exec_properties[
                TO_KEY_FN_KEY] is not None:
            to_key_fn = exec_properties[TO_KEY_FN_KEY]

        if to_key_fn_key in exec_properties and exec_properties[
                to_key_fn_key] is not None:
            to_key_fn = exec_properties[to_key_fn_key]

        if SAMPLES_PER_KEY_KEY in exec_properties and exec_properties[
                SAMPLES_PER_KEY_KEY] is not None:
            samples_per_key = exec_properties[SAMPLES_PER_KEY_KEY]

        # Validate we have all we need
        if to_key_fn is None:
            raise ValueError('\'to_key_fn\' is missing in exec dict.')

        if samples_per_key is None:
            raise ValueError('\'samples_per_key\' is missing in exec dict.')

        if EXAMPLES_KEY not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')

        if STRATIFIED_EXAMPLES_KEY not in output_dict:
            raise ValueError(
                '\'stratified_examples\' is missing in output dict.')

        output_artifact = artifact_utils.get_single_instance(
            output_dict[STRATIFIED_EXAMPLES_KEY])
        output_artifact.split_names = artifact_utils.encode_split_names(
            splits_to_transform + splits_to_copy)

        example_uris = {}

        for split in splits_to_transform:
            data_uri = artifact_utils.get_split_uri(examples, split)
            example_uris[split] = data_uri

        # Copy over the splits we don't want to transform ('splits_to_copy').
        utils.copy_over(examples, output_artifact, splits_to_copy)

        self._run_sampling(example_uris,
                           output_artifact=output_artifact,
                           samples_per_key=samples_per_key,
                           to_key_fn=to_key_fn)

        logging.info('StratifiedSampler generates stratified examples to %s',
                     output_artifact.uri)
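
Per the docstring, `to_key_fn` carries the source of a `to_key(example)` function and `samples_per_key` caps how many examples are kept per key. A hedged sketch of how the exec properties might be populated; the key constants mirror those used above, and the function body is hypothetical:

exec_properties = {
    SPLITS_TO_TRANSFORM_KEY: json_utils.dumps(['train']),
    TO_KEY_FN_KEY: """
def to_key(example):
    # Hypothetical: stratify on the integer 'label' feature.
    return example.features.feature['label'].int64_list.value[0]
""",
    SAMPLES_PER_KEY_KEY: 1000,
}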
Example #21
    def Do(self, input_dict: Dict[str, List[Artifact]],
           output_dict: Dict[str, List[Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """Recommends a tuner config.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - meta_train_features_N: MetaFeatures for Nth train dataset.
        - hparams_train_N: HParams for the Nth train dataset. The maximum
          value of `N` is _MAX_INPUTS.
      output_dict: Output dict from key to a list of artifacts.
      exec_properties: A dict of execution properties.

    Raises:
      NotImplementedError: If the requested algorithm is not supported.
    """

        algorithm = exec_properties['algorithm']
        metafeatures_list = []
        # This should be agnostic to meta-feature type.
        for ix in range(MAX_INPUTS):
            metafeature_key = f'meta_train_features_{ix}'
            if metafeature_key in input_dict:
                metafeature_uri = os.path.join(
                    artifact_utils.get_single_uri(input_dict[metafeature_key]),
                    artifacts.MetaFeatures.DEFAULT_FILE_NAME)
                logging.info('Found %s at %s.', metafeature_key,
                             metafeature_uri)
                metafeatures = json.loads(
                    io_utils.read_string_file(metafeature_uri))
                metafeatures_list.append(metafeatures['metafeature'])

        all_hparams = []
        for ix in range(MAX_INPUTS):
            hparam_key = f'hparams_train_{ix}'
            if hparam_key in input_dict:
                hyperparameters_file = io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict[hparam_key]))
                logging.info('Found %s at %s.', hparam_key,
                             hyperparameters_file)
                hparams_json = json.loads(
                    io_utils.read_string_file(hyperparameters_file))
                all_hparams.append(hparams_json['values'])

        if algorithm == MAJORITY_VOTING:
            discrete_search_space = self._create_search_space_using_voting(
                all_hparams)
            hparams_config_list = [discrete_search_space.get_config()]
        elif algorithm == NEAREST_NEIGHBOR:
            # Build nearest_neighbor model
            output_path = artifact_utils.get_single_uri(
                output_dict[OUTPUT_MODEL])
            serving_model_dir = path_utils.serving_model_dir(output_path)
            model = self._create_knn_model_from_metafeatures(metafeatures_list)
            # TODO(nikhilmehta): Consider adding signature here.
            model.save(serving_model_dir)

            # Collect all Candidate HParams
            hparams_list = self._convert_to_kerastuner_hyperparameters(
                all_hparams)
            hparams_config_list = [
                hparam.get_config() for hparam in hparams_list
            ]
        else:
            raise NotImplementedError(
                f'The algorithm "{algorithm}" is not supported.')

        meta_hparams_path = os.path.join(
            artifact_utils.get_single_uri(output_dict[OUTPUT_HYPERPARAMS]),
            _DEFAULT_FILE_NAME)
        io_utils.write_string_file(meta_hparams_path,
                                   json.dumps(hparams_config_list))
        logging.info('Meta HParams saved at %s', meta_hparams_path)