Example #1
def testEphemeralPackage(self, mock_mkdtemp):
  mock_mkdtemp.return_value = self._tmp_dir
  if os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR'):
    # Skip: this test requires setuptools, which is not available in this
    # environment.
    logging.info('Skipping testEphemeralPackage')
    return
  package = dependency_utils.build_ephemeral_package()
  self.assertRegex(os.path.basename(package), r'tfx_ephemeral-.*\.tar\.gz')
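For context, a minimal sketch of the test fixture these methods appear to rely on; the @mock.patch wiring, the setUp that creates self._tmp_dir, and the import paths are assumptions not shown in the snippet:

import os
import tempfile
import unittest
from unittest import mock

from absl import logging  # assumed logging module
from tfx.utils import dependency_utils  # assumed import path


class DependencyUtilsTest(unittest.TestCase):

  def setUp(self):
    super().setUp()
    # Created before tempfile.mkdtemp is patched; each test redirects the
    # patched mkdtemp back to this real directory.
    self._tmp_dir = tempfile.mkdtemp()

  @mock.patch('tempfile.mkdtemp')
  def testEphemeralPackage(self, mock_mkdtemp):
    ...  # body as in the example above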
Example #2
def testEphemeralPackageMocked(self, mock_subprocess_call, mock_mkdtemp):
  source_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')
  test_file = os.path.join(source_data_dir, 'test.csv')
  expected_package = 'mypackage.tar.gz'

  def side_effect(cmd):
    # build_ephemeral_package is expected to shell out with a three-element
    # command of the form `<python> <setup_file> sdist`.
    self.assertEqual(3, len(cmd))
    self.assertEqual(sys.executable, cmd[0])
    self.assertEqual('sdist', cmd[2])
    setup_file = cmd[1]
    dist_dir = os.path.join(os.path.dirname(setup_file), 'dist')
    tf.io.gfile.makedirs(dist_dir)
    dest_file = os.path.join(dist_dir, expected_package)
    tf.io.gfile.copy(test_file, dest_file)

  mock_subprocess_call.side_effect = side_effect
  mock_mkdtemp.return_value = self._tmp_dir
  package = dependency_utils.build_ephemeral_package()
  self.assertEqual(expected_package, os.path.basename(package))
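A short sketch of the decorator stack implied by the mocked test's signature; mock arguments are injected bottom-up, and the exact patch target for the subprocess call (here subprocess.call) is an assumption about dependency_utils internals:

import subprocess
from unittest import mock

# The bottom-most decorator supplies the first mock argument after self,
# so mock_subprocess_call maps to the subprocess patch and mock_mkdtemp
# to the tempfile patch.
@mock.patch('tempfile.mkdtemp')
@mock.patch.object(subprocess, 'call')  # assumed call used internally
def testEphemeralPackageMocked(self, mock_subprocess_call, mock_mkdtemp):
  ...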
Example #3
def start_cmle_training(input_dict: Dict[Text, List[types.Artifact]],
                        output_dict: Dict[Text, List[types.Artifact]],
                        exec_properties: Dict[Text, Any],
                        executor_class_path: Text,
                        training_inputs: Dict[Text, Any]):
    """Start a trainer job on CMLE.

  This is done by forwarding the inputs/outputs/exec_properties to the
  tfx.scripts.run_executor module on a CMLE training job interpreter.

  Args:
    input_dict: Passthrough input dict for tfx.components.Trainer.executor.
    output_dict: Passthrough input dict for tfx.components.Trainer.executor.
    exec_properties: Passthrough input dict for tfx.components.Trainer.executor.
    executor_class_path: class path for TFX core default trainer.
    training_inputs: Training input for CMLE training job. 'pythonModule',
      'pythonVersion' and 'runtimeVersion' will be inferred by the runner. For
      the full set of parameters supported, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

  Returns:
    None
  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed.
  """
    training_inputs = training_inputs.copy()
    # Remove cmle_args from exec_properties so the CMLE trainer doesn't call
    # itself recursively.
    for gaip_training_key in ['cmle_training_args', 'gaip_training_args']:
        if gaip_training_key in exec_properties.get('custom_config', {}):
            exec_properties['custom_config'].pop(gaip_training_key)

    json_inputs = artifact_utils.jsonify_artifact_dict(input_dict)
    tf.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = artifact_utils.jsonify_artifact_dict(output_dict)
    tf.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties)
    tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure CMLE job
    api_client = discovery.build('ml', 'v1')
    job_args = [
        '--executor_class_path', executor_class_path, '--inputs', json_inputs,
        '--outputs', json_outputs, '--exec-properties', json_exec_properties
    ]
    training_inputs['args'] = job_args
    training_inputs['pythonModule'] = 'tfx.scripts.run_executor'
    training_inputs['pythonVersion'] = _get_caip_python_version()
    # runtimeVersion should match the <major>.<minor> version of the
    # currently installed TensorFlow.
    training_inputs['runtimeVersion'] = _get_tf_runtime_version()

    # Pop project_id so CMLE doesn't complain about an unexpected parameter.
    # It's been a stowaway in cmle_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)

    package_uris = training_inputs.get('packageUris', [])
    if package_uris:
        tf.logging.info('Using user-provided packageUris \'%s\'.', package_uris)
    else:
        local_package = dependency_utils.build_ephemeral_package()
        # TODO(b/125451545): Use a safe temp dir instead of jobDir.
        cloud_package = os.path.join(training_inputs['jobDir'],
                                     os.path.basename(local_package))
        io_utils.copy_file(local_package, cloud_package, True)
        training_inputs['packageUris'] = [cloud_package]
        tf.logging.info('Package %s will be used',
                        training_inputs['packageUris'])

    job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

    # Submit job to CMLE
    tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
        job_name, project))
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for CMLE job to finish
    job_id = '{}/jobs/{}'.format(project_id, job_name)
    request = api_client.projects().jobs().get(name=job_id)
    response = request.execute()
    while response['state'] not in ('SUCCEEDED', 'FAILED'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] == 'FAILED':
        err_msg = 'Job \'{}\' did not succeed. Detailed response: {}.'.format(
            job_name, response)
        tf.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # CMLE training complete
    tf.logging.info('Job \'{}\' successful.'.format(job_name))
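To round out the example, a hedged sketch of how a caller might invoke start_cmle_training; the custom_config layout, project id, and bucket below are illustrative assumptions, not values from the original source:

# Hypothetical call site, e.g. inside a Trainer executor's Do():
training_inputs = exec_properties['custom_config']['cmle_training_args'].copy()
# e.g. {'project': 'my-gcp-project',         # hypothetical project id
#       'jobDir': 'gs://my-bucket/job-dir',  # hypothetical staging bucket
#       'scaleTier': 'BASIC'}
start_cmle_training(
    input_dict=input_dict,
    output_dict=output_dict,
    exec_properties=exec_properties,
    executor_class_path='tfx.components.trainer.executor.Executor',
    training_inputs=training_inputs)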