def test_job_id(self, mock_serialize_assets, mock_submit_job):
        """Checks that an explicit job_id ends up in the submitted job spec.

        Args:
            mock_serialize_assets: Injected mock; not inspected by this test.
            mock_submit_job: Injected mock whose recorded call is inspected to
                read back the job spec handed to the submit step.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(self._model,
                                 x=self._dataset,
                                 validation_data=self._dataset,
                                 remote_dir=self._remote_dir,
                                 job_spec=self._job_spec,
                                 batch_size=1,
                                 epochs=2,
                                 verbose=3)
            return

        test_job_id = 'test_job_id'
        client.cloud_fit(self._model,
                         x=self._dataset,
                         validation_data=self._dataset,
                         remote_dir=self._remote_dir,
                         job_spec=self._job_spec,
                         job_id=test_job_id,
                         batch_size=1,
                         epochs=2,
                         verbose=3)

        # call_args unpacks to (positional args, keyword args). The submit
        # step is expected to have received exactly two positional arguments,
        # the first being the job spec.
        positional_args, _ = mock_submit_job.call_args
        body, _ = positional_args
        # Direct comparison instead of assertDictContainsSubset, which is
        # deprecated in unittest and removed in Python 3.12.
        self.assertEqual(body['job_id'], test_job_id)
    def test_serialize_assets(self):
        """Serializes training assets and verifies callbacks can be restored.

        After serialization both the 'training_assets' and 'model' folders
        under the remote dir must be non-empty, and the pickled callbacks
        stored in the training assets must unpickle to a TensorBoard callback.
        """
        # TF 1.x is not supported, only the error path is exercised there.
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(
                    self._model,
                    x=self._dataset,
                    validation_data=self._dataset,
                    remote_dir=self._remote_dir,
                    job_spec=self._job_spec,
                    batch_size=1,
                    epochs=2,
                    verbose=3)
            return

        tb_callback = tf.keras.callbacks.TensorBoard(log_dir=self._remote_dir)
        # The fixture dict itself is extended (not copied) with the callback.
        fit_args = self._scalar_fit_kwargs
        fit_args['callbacks'] = [tb_callback]

        client._serialize_assets(self._remote_dir, self._model, **fit_args)

        assets_entries = tf.io.gfile.listdir(
            os.path.join(self._remote_dir, 'training_assets'))
        self.assertGreaterEqual(len(assets_entries), 1)

        model_entries = tf.io.gfile.listdir(
            os.path.join(self._remote_dir, 'model'))
        self.assertGreaterEqual(len(model_entries), 1)

        # Reload the saved assets and round-trip the callbacks through pickle.
        restored_assets = tf.saved_model.load(
            os.path.join(self._remote_dir, 'training_assets'))
        callbacks_blob = tfds.as_numpy(restored_assets.callbacks_fn())
        restored_callbacks = cloudpickle.loads(callbacks_blob)

        self.assertIsInstance(restored_callbacks[0],
                              tf.keras.callbacks.TensorBoard)
    def test_run(self):
        """Runs the remote entry point and checks that its outputs are usable.

        The output and logs directories must be non-empty afterwards, and the
        model saved in the output directory must load and evaluate.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            return

        remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)

        output_entries = tf.io.gfile.listdir(self._output_dir)
        self.assertGreaterEqual(len(output_entries), 1)

        log_entries = tf.io.gfile.listdir(self._logs_dir)
        self.assertGreaterEqual(len(log_entries), 1)

        restored_model = tf.keras.models.load_model(self._output_dir)

        # Test saved model load and works properly: the first value returned
        # by evaluate() (presumably the loss) must be strictly positive.
        first_metric = restored_model.evaluate(self._x, self._y)[0]
        zero = np.array([0.0], dtype=np.float32)
        self.assertGreater(first_metric, zero)
    def test_in_memory_data(self):
        """Submits a cloud_fit job on in-memory NumPy data and waits for it.

        The test creates a unique sub-folder of the remote dir, submits the
        job, then polls the AI Platform job until it reaches a terminal state
        and requires that state to be 'SUCCEEDED'.
        """
        # Create a folder under remote dir for this test's data
        tmp_folder = str(uuid.uuid4())
        remote_dir = os.path.join(self.remote_dir, tmp_folder)

        # Keep track of test folders created for final clean up
        self.test_folders.append(remote_dir)

        x = np.random.random((2, 3))
        y = np.random.randint(0, 2, (2, 2))

        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(self.model(),
                                 x=x,
                                 y=y,
                                 remote_dir=remote_dir,
                                 region=self.region,
                                 project_id=self.project_id,
                                 image_uri=self.image_uri,
                                 epochs=2)
            return

        job_id = client.cloud_fit(self.model(),
                                  x=x,
                                  y=y,
                                  remote_dir=remote_dir,
                                  region=self.region,
                                  project_id=self.project_id,
                                  image_uri=self.image_uri,
                                  job_id='cloud_fit_e2e_test_{}_{}'.format(
                                      BUILD_ID.replace('-', '_'),
                                      'test_in_memory_data'),
                                  epochs=2)

        # Fully qualified name of the AI Platform Training job to poll
        job_name = 'projects/{}/jobs/{}'.format(self.project_id, job_id)

        # Build the request used to query the job's state
        api_client = discovery.build('ml', 'v1')
        request = api_client.projects().jobs().get(name=job_name)

        # Wait for AIP Training job to finish. 'CANCELLED' is also a terminal
        # state; without it a cancelled job would keep this loop polling
        # forever instead of failing the assertion below.
        terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELLED')
        response = request.execute()
        while response['state'] not in terminal_states:
            time.sleep(POLLING_INTERVAL_IN_SECONDS)
            response = request.execute()
        self.assertEqual(response['state'], 'SUCCEEDED')
    def test_distribution_strategy(self, mock_serialize_assets,
                                   mock_submit_job):
        """Checks how the distribution strategy reaches the job's arguments.

        Covers three cases: no strategy given (the multi-worker mirrored
        strategy name is expected in the args), an explicit mirrored strategy
        with a custom job spec, and an unknown strategy name, which must raise
        ValueError.

        Args:
            mock_serialize_assets: Injected mock; not inspected by this test.
            mock_submit_job: Injected mock whose recorded call is inspected to
                read back the job spec handed to the submit step.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(self._model,
                                 x=self._dataset,
                                 remote_dir=self._remote_dir)
            return

        client.cloud_fit(self._model,
                         x=self._dataset,
                         remote_dir=self._remote_dir)

        # call_args unpacks to (positional args, keyword args); the job spec
        # is the first of the two positional arguments.
        positional_args, _ = mock_submit_job.call_args
        body, _ = positional_args
        # Direct comparison instead of assertDictContainsSubset, which is
        # deprecated in unittest and removed in Python 3.12.
        self.assertEqual(
            body['trainingInput']['args'],
            [
                '--remote_dir', self._remote_dir,
                '--distribution_strategy',
                MULTI_WORKER_MIRRORED_STRATEGY_NAME
            ])

        client.cloud_fit(self._model,
                         x=self._dataset,
                         remote_dir=self._remote_dir,
                         distribution_strategy=MIRRORED_STRATEGY_NAME,
                         job_spec=self._job_spec)

        positional_args, _ = mock_submit_job.call_args
        body, _ = positional_args
        self.assertEqual(
            body['trainingInput']['args'],
            [
                '--remote_dir', self._remote_dir,
                '--distribution_strategy', MIRRORED_STRATEGY_NAME
            ])

        with self.assertRaises(ValueError):
            client.cloud_fit(self._model,
                             x=self._dataset,
                             remote_dir=self._remote_dir,
                             distribution_strategy='not_implemented_strategy',
                             job_spec=self._job_spec)
    def test_custom_callback(self):
        """Verifies a custom callback fires only once the remote run executes.

        Serializing the assets must not invoke the callback's mock; running
        the remote entry point must invoke it exactly once with no arguments.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            return

        # Start from a clean mock so earlier calls cannot leak into this test.
        _MockCallable.reset()

        custom_callbacks = [CustomCallbackExample()]
        self._fit_kwargs['callbacks'] = custom_callbacks
        client._serialize_assets(
            self._remote_dir, self._model, **self._fit_kwargs)

        # Serialization alone must not have triggered the callback.
        _MockCallable.mock_callable.assert_not_called()

        remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)

        # The remote run must have triggered the callback exactly once.
        _MockCallable.mock_callable.assert_called_once_with()
Example #7
0
  def test_fit_kwargs(self, mock_submit_job):
    """Checks that scalar fit() kwargs survive serialization unchanged.

    After cloud_fit runs with the submit step mocked, the training assets are
    loaded back from the remote dir recorded in the job arguments and the
    restored kwargs are compared against the values passed in.

    Args:
      mock_submit_job: Injected mock whose recorded call is inspected to read
        back the job spec handed to the submit step.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
      with self.assertRaises(RuntimeError):
        client.cloud_fit(
            self._model,
            x=self._dataset,
            validation_data=self._dataset,
            remote_dir=self._remote_dir,
            job_spec=self._job_spec,
            batch_size=1,
            epochs=2,
            verbose=3)
      return
    job_id = client.cloud_fit(
        self._model,
        x=self._dataset,
        validation_data=self._dataset,
        remote_dir=self._remote_dir,
        region=self._region,
        project_id=self._project_id,
        image_uri=self._image_uri,
        batch_size=1,
        epochs=2,
        verbose=3)

    # call_args unpacks to (positional args, keyword args); the job spec is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    self.assertEqual(body['job_id'], job_id)
    # args is ['--remote_dir', <dir>, ...], so index 1 is the directory.
    remote_dir = body['trainingInput']['args'][1]

    training_assets_graph = tf.saved_model.load(
        os.path.join(remote_dir, 'training_assets'))
    restored_kwargs = tfds.as_numpy(training_assets_graph.fit_kwargs_fn())
    expected_kwargs = {
        'batch_size': 1,
        'epochs': 2,
        'verbose': 3,
    }
    # Compare in both directions. Passing the restored kwargs as the "subset"
    # alone would succeed vacuously if nothing had been restored at all.
    self.assertEqual(sorted(restored_kwargs), sorted(expected_kwargs))
    for name, expected_value in expected_kwargs.items():
      self.assertEqual(restored_kwargs[name], expected_value)
Example #8
0
  def test_custom_job_spec(self, mock_submit_job):
    """Checks that a caller-provided job spec is submitted with updated args.

    The submitted training input must keep the custom master image and carry
    the remote dir plus the multi-worker mirrored strategy name as arguments.

    Args:
      mock_submit_job: Injected mock whose recorded call is inspected to read
        back the job spec handed to the submit step.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
      with self.assertRaises(RuntimeError):
        client.cloud_fit(
            self._model,
            x=self._dataset,
            validation_data=self._dataset,
            remote_dir=self._remote_dir,
            job_spec=self._job_spec,
            batch_size=1,
            epochs=2,
            verbose=3)
      return

    client.cloud_fit(
        self._model,
        x=self._dataset,
        validation_data=self._dataset,
        remote_dir=self._remote_dir,
        job_spec=self._job_spec,
        batch_size=1,
        epochs=2,
        verbose=3)

    # call_args unpacks to (positional args, keyword args); the job spec is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    training_input = body['trainingInput']
    # Per-key comparisons instead of assertDictContainsSubset, which is
    # deprecated in unittest and removed in Python 3.12.
    self.assertEqual(training_input['masterConfig'], {
        'imageUri': self._image_uri,
    })
    self.assertEqual(training_input['args'], [
        '--remote_dir', self._remote_dir, '--distribution_strategy',
        MULTI_WORKER_MIRRORED_STRATEGY_NAME
    ])
Example #9
0
def run(remote_dir, distribution_strategy_text):
    """Loads the serialized model and fit() arguments, trains, and saves.

    Training assets are read from '<remote_dir>/training_assets' and the model
    from '<remote_dir>/model'. The trained model is written to
    '<remote_dir>/output', except on non-chief workers under the multi-worker
    mirrored strategy, which save to a throwaway folder instead.

    Args:
        remote_dir: Temporary cloud storage folder that contains model and
            Dataset graph. This folder is also used for job output.
        distribution_strategy_text: Specifies the distribution strategy for
            remote execution when a jobspec is provided. Accepted values are
            strategy names as specified by
            'tf.distribute.<strategy>.__name__'.
    """
    logging.info('Setting distribution strategy to %s',
                 distribution_strategy_text)

    is_mwms = distribution_strategy_text == MULTI_WORKER_MIRRORED_STRATEGY_NAME

    strategy_factory = SUPPORTED_DISTRIBUTION_STRATEGIES[
        distribution_strategy_text]
    distribution_strategy = strategy_factory()

    with distribution_strategy.scope():
        assets_dir = os.path.join(remote_dir, 'training_assets')
        if utils.is_tf_v1():
            training_assets_graph = tf.compat.v2.saved_model.load(
                export_dir=assets_dir, tags=None)
        else:
            training_assets_graph = tf.saved_model.load(assets_dir)

        # Every asset is optional; only the functions present on the loaded
        # object contribute an entry to the fit() keyword arguments.
        fit_kwargs = {}
        if hasattr(training_assets_graph, 'fit_kwargs_fn'):
            fit_kwargs = tfds.as_numpy(training_assets_graph.fit_kwargs_fn())
            logging.info('fit_kwargs were loaded successfully.')

        if hasattr(training_assets_graph, 'x_fn'):
            fit_kwargs['x'] = training_assets_graph.x_fn()
            logging.info('x was loaded successfully.')

        if hasattr(training_assets_graph, 'y_fn'):
            fit_kwargs['y'] = training_assets_graph.y_fn()
            logging.info('y was loaded successfully.')

        if hasattr(training_assets_graph, 'validation_data_fn'):
            validation_data = training_assets_graph.validation_data_fn()
            fit_kwargs['validation_data'] = validation_data

        if hasattr(training_assets_graph, 'callbacks_fn'):
            # NOTE(review): unpickling executes code embedded in the payload,
            # so remote_dir must only ever hold trusted assets.
            callbacks_blob = tfds.as_numpy(
                training_assets_graph.callbacks_fn())
            fit_kwargs['callbacks'] = cloudpickle.loads(callbacks_blob)
            logging.info('callbacks were loaded successfully.')

        model_dir = os.path.join(remote_dir, 'model')
        model = tf.keras.models.load_model(model_dir)
        logging.info('Model was loaded from %s successfully.', model_dir)
        model.fit(**fit_kwargs)

    if _is_current_worker_chief() or not is_mwms:
        model.save(os.path.join(remote_dir, 'output'))
        return

    # We need to set a different directory on workers when using MWMS since we
    # will run into errors due to concurrent writes to the same directory.
    # This is a workaround for the issue described in b/148619319.
    worker_folder = 'output/tmp/workers_' + str(uuid.uuid4())
    tmp_worker_dir = os.path.join(remote_dir, worker_folder)
    logging.info('Saving model from worker in temporary folder %s.',
                 tmp_worker_dir)
    model.save(tmp_worker_dir)

    logging.info('Removing temporary folder %s.', tmp_worker_dir)
    _delete_dir(tmp_worker_dir)
def cloud_fit(model,
              remote_dir,
              region=None,
              project_id=None,
              image_uri=None,
              distribution_strategy=DEFAULT_DISTRIBUTION_STRATEGY,
              job_spec=None,
              job_id=None,
              **fit_kwargs):
    """Serializes a model and its fit() arguments and submits a training job.

    Args:
        model: A compiled Keras Model.
        remote_dir: Google Cloud Storage path for temporary assets and AI
            Platform training output. Will overwrite value in job_spec.
        region: Target region for running the AI Platform Training job.
        project_id: Project id where the training should be deployed to.
        image_uri: Base image to use for AI Platform Training.
        distribution_strategy: Specifies the distribution strategy for remote
            execution when a jobspec is provided. Accepted values are strategy
            names as specified by 'tf.distribute.<strategy>.__name__'.
        job_spec: AI Platform Training job_spec, will take precedence over all
            other provided values except for remote_dir. If none is provided a
            default cluster spec and distribution strategy will be used. A
            provided job_spec is updated in place.
        job_id: A name to use for the AI Platform Training job (mixed-case
            letters, numbers, and underscores only, starting with a letter).
        **fit_kwargs: Args to pass to model.fit() including training and eval
            data. Only keyword arguments are supported. Callback functions
            will be serialized as is, they must be available in run time
            environment.

    Returns:
        AI Platform job ID, read from the 'job_id' entry of the submitted
        job spec.

    Raises:
        ValueError: If distribution_strategy is not one of the supported
            strategy names.
        NotImplementedError: Tensorflow v1.x is not supported.
        RuntimeError: If executing in graph mode, eager execution is required
            for cloud_fit.
    """
    logging.set_verbosity(logging.INFO)

    if distribution_strategy not in SUPPORTED_DISTRIBUTION_STRATEGIES:
        supported_names = list(SUPPORTED_DISTRIBUTION_STRATEGIES.keys())
        raise ValueError(
            '{} is not supported. Supported Strategies are {}'.format(
                distribution_strategy, supported_names))

    if utils.is_tf_v1():
        raise NotImplementedError('Tensorflow v1.x is not supported.')

    # Can only export Datasets which were created executing eagerly
    # Raise an error if eager execution is not enabled.
    if not tf.executing_eagerly():
        raise RuntimeError('Eager execution is required for cloud_fit.')

    # Command line handed to the remote entry point; identical whether the
    # job spec is supplied by the caller or built from defaults.
    entry_point_args = [
        '--remote_dir', remote_dir,
        '--distribution_strategy', distribution_strategy,
    ]
    if not job_spec:
        job_spec = _default_job_spec(region=region,
                                     image_uri=image_uri,
                                     entry_point_args=entry_point_args)
    else:
        job_spec['trainingInput']['args'] = entry_point_args

    _serialize_assets(remote_dir, model, **fit_kwargs)

    # Setting AI Platform Training to use chief in TF_CONFIG environment variable
    # https://cloud.google.com/ai-platform/training/docs/distributed-training-details#chief-versus-master
    job_spec['trainingInput']['useChiefInTfConfig'] = 'True'

    # If job_id is provided overwrite the job_id value.
    if job_id:
        job_spec['job_id'] = job_id

    _submit_job(job_spec, project_id)
    return job_spec['job_id']