def test_job_id(self, mock_serialize_assets, mock_submit_job):
    """Checks that an explicit `job_id` ends up in the submitted job body.

    Args:
        mock_serialize_assets: Mock supplied by the test harness (presumably
            a patch of asset serialization; the decorator is not visible
            here). Not inspected by this test.
        mock_submit_job: Mock whose recorded call is inspected to retrieve
            the submitted job body.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
        with self.assertRaises(RuntimeError):
            client.cloud_fit(
                self._model,
                x=self._dataset,
                validation_data=self._dataset,
                remote_dir=self._remote_dir,
                job_spec=self._job_spec,
                batch_size=1,
                epochs=2,
                verbose=3,
            )
        return

    test_job_id = "test_job_id"
    client.cloud_fit(
        self._model,
        x=self._dataset,
        validation_data=self._dataset,
        remote_dir=self._remote_dir,
        job_spec=self._job_spec,
        job_id=test_job_id,
        batch_size=1,
        epochs=2,
        verbose=3,
    )

    # call_args unpacks to (positional args, keyword args); the job body is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args

    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12, so the key is checked explicitly instead.
    self.assertIn("job_id", body)
    self.assertEqual(body["job_id"], test_job_id)
def test_distribution_strategy(self, mock_serialize_assets, mock_submit_job):
    """Checks how `distribution_strategy` is passed to the submitted job.

    Covers three cases: the default strategy when none is given, an
    explicitly requested strategy, and an unsupported strategy name, which
    must raise ValueError.

    Args:
        mock_serialize_assets: Mock supplied by the test harness (presumably
            a patch of asset serialization; the decorator is not visible
            here). Not inspected by this test.
        mock_submit_job: Mock whose most recent recorded call is inspected
            to retrieve the submitted job body.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
        with self.assertRaises(RuntimeError):
            client.cloud_fit(
                self._model, x=self._dataset, remote_dir=self._remote_dir)
        return

    # Case 1: no strategy given.
    client.cloud_fit(
        self._model, x=self._dataset, remote_dir=self._remote_dir)

    # call_args unpacks to (positional args, keyword args); the job body is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    training_input = body["trainingInput"]
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12, so the key is checked explicitly instead.
    self.assertIn("args", training_input)
    self.assertEqual(
        training_input["args"],
        [
            "--remote_dir",
            self._remote_dir,
            "--distribution_strategy",
            MULTI_WORKER_MIRRORED_STRATEGY_NAME,
        ],
    )

    # Case 2: strategy requested explicitly together with a job spec.
    client.cloud_fit(
        self._model,
        x=self._dataset,
        remote_dir=self._remote_dir,
        distribution_strategy=MIRRORED_STRATEGY_NAME,
        job_spec=self._job_spec,
    )

    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    training_input = body["trainingInput"]
    self.assertIn("args", training_input)
    self.assertEqual(
        training_input["args"],
        [
            "--remote_dir",
            self._remote_dir,
            "--distribution_strategy",
            MIRRORED_STRATEGY_NAME,
        ],
    )

    # Case 3: an unknown strategy name is rejected.
    with self.assertRaises(ValueError):
        client.cloud_fit(
            self._model,
            x=self._dataset,
            remote_dir=self._remote_dir,
            distribution_strategy="not_implemented_strategy",
            job_spec=self._job_spec,
        )
def test_run(self):
    """Runs the remote entry point and checks its output, logs and model."""
    if utils.is_tf_v1():
        # TF 1.x is not supported
        return

    remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)

    # Both the output and the logs directories must contain something.
    output_entries = tf.io.gfile.listdir(self._output_dir)
    self.assertGreaterEqual(len(output_entries), 1)
    log_entries = tf.io.gfile.listdir(self._logs_dir)
    self.assertGreaterEqual(len(log_entries), 1)

    # Test saved model load and works properly
    restored_model = tf.keras.models.load_model(self._output_dir)
    first_metric = restored_model.evaluate(self._x, self._y)[0]
    zero = np.array([0.0], dtype=np.float32)
    self.assertGreater(first_metric, zero)
def test_custom_callback(self):
    """Checks that a serialized custom callback fires once during a run."""
    if utils.is_tf_v1():
        # TF 1.x is not supported
        return

    # Setting up custom callback with mock calls
    _MockCallable.reset()
    fit_kwargs = self._fit_kwargs
    fit_kwargs["callbacks"] = [CustomCallbackExample()]

    client._serialize_assets(self._remote_dir, self._model, **fit_kwargs)

    # Serialization alone must not trigger the callback.
    _MockCallable.mock_callable.assert_not_called()

    remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)

    # After the run, the callback must have fired exactly once, without
    # arguments.
    _MockCallable.mock_callable.assert_called_once_with()
def test_custom_job_spec(self, mock_submit_job):
    """Checks that a caller-provided job spec reaches job submission.

    The submitted `trainingInput` must keep the spec's `masterConfig` and
    carry the entry-point args with the default distribution strategy.

    Args:
        mock_submit_job: Mock whose recorded call is inspected to retrieve
            the submitted job body.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
        with self.assertRaises(RuntimeError):
            client.cloud_fit(
                self._model,
                x=self._dataset,
                validation_data=self._dataset,
                remote_dir=self._remote_dir,
                job_spec=self._job_spec,
                batch_size=1,
                epochs=2,
                verbose=3,
            )
        return

    client.cloud_fit(
        self._model,
        x=self._dataset,
        validation_data=self._dataset,
        remote_dir=self._remote_dir,
        job_spec=self._job_spec,
        batch_size=1,
        epochs=2,
        verbose=3,
    )

    # call_args unpacks to (positional args, keyword args); the job body is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    training_input = body["trainingInput"]

    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12, so each expected key is checked explicitly instead.
    self.assertIn("masterConfig", training_input)
    self.assertEqual(
        training_input["masterConfig"], {"imageUri": self._image_uri})
    self.assertIn("args", training_input)
    self.assertEqual(
        training_input["args"],
        [
            "--remote_dir",
            self._remote_dir,
            "--distribution_strategy",
            MULTI_WORKER_MIRRORED_STRATEGY_NAME,
        ],
    )
def test_in_memory_data(self):
    """Trains on in-memory numpy data remotely and evaluates the result."""
    if utils.is_tf_v1():
        # This test should only run in tf 2.x
        return

    # Create a folder under remote dir for this test's data
    remote_dir = os.path.join(self._remote_dir, str(uuid.uuid4()))
    # Keep track of test folders created for final clean up
    self._test_folders.append(remote_dir)

    features = np.random.random((2, 3))
    labels = np.random.randint(0, 2, (2, 2))

    submitted_job_id = client.cloud_fit(
        self._model(),
        x=features,
        y=labels,
        remote_dir=remote_dir,
        region=self._region,
        project_id=self._project_id,
        image_uri=self._image_uri,
        job_id="cloud_fit_e2e_test_{}_{}".format(
            _BUILD_ID.replace("-", "_"), "test_in_memory_data"
        ),
        epochs=2,
    )

    # TODO(b/169297404) Replace AIP job status logic with utils wrapper
    # Wait for AIP Training job to finish successfully
    job_succeeded = google_api_client.wait_for_api_training_job_completion(
        submitted_job_id, self._project_id)
    self.assertTrue(job_succeeded)

    # load model from remote dir
    output_dir = os.path.join(remote_dir, "output")
    trained_model = tf.keras.models.load_model(output_dir)
    eval_results = trained_model.evaluate(features, labels)

    # Accuracy should be better than zero
    self.assertListEqual(trained_model.metrics_names, ["loss", "accuracy"])
    self.assertGreater(eval_results[1], 0)
def test_client_with_tf_1x_raises_error(self):
    """Checks that `cloud_fit` raises RuntimeError when running on TF 1.x."""
    if not utils.is_tf_v1():
        # This test is only applicable to TF 1.x
        return

    features = np.random.random((2, 3))
    labels = np.random.randint(0, 2, (2, 2))

    # TF 1.x is not supported, verify proper error is raised for TF 1.x.
    with self.assertRaises(RuntimeError):
        client.cloud_fit(
            self._model(),
            x=features,
            y=labels,
            remote_dir="gs://some_test_dir",
            region=self._region,
            project_id=self._project_id,
            image_uri=self._image_uri,
            epochs=2,
        )
def test_fit_kwargs(self, mock_submit_job):
    """Checks that scalar fit kwargs are serialized to the remote directory.

    Also checks that the job id returned by `cloud_fit` matches the one in
    the submitted job body.

    Args:
        mock_submit_job: Mock whose recorded call is inspected to retrieve
            the submitted job body.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
        with self.assertRaises(RuntimeError):
            client.cloud_fit(
                self._model,
                x=self._dataset,
                validation_data=self._dataset,
                remote_dir=self._remote_dir,
                job_spec=self._job_spec,
                batch_size=1,
                epochs=2,
                verbose=3,
            )
        return

    job_id = client.cloud_fit(
        self._model,
        x=self._dataset,
        validation_data=self._dataset,
        remote_dir=self._remote_dir,
        region=self._region,
        project_id=self._project_id,
        image_uri=self._image_uri,
        batch_size=1,
        epochs=2,
        verbose=3,
    )

    # call_args unpacks to (positional args, keyword args); the job body is
    # the first of the two positional arguments.
    positional_args, _ = mock_submit_job.call_args
    body, _ = positional_args
    self.assertEqual(body["job_id"], job_id)

    # The entry-point args are ["--remote_dir", <dir>, ...], so index 1 is
    # the remote directory the job was configured with.
    remote_dir = body["trainingInput"]["args"][1]
    training_assets_graph = tf.saved_model.load(
        os.path.join(remote_dir, "training_assets"))
    loaded_kwargs = tfds.as_numpy(training_assets_graph.fit_kwargs_fn())

    expected_kwargs = {"batch_size": 1, "epochs": 2, "verbose": 3}
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12. This loop keeps the original check's direction: every loaded
    # kwarg must appear in the expected dict with an equal value.
    # NOTE(review): this does not verify that all expected kwargs were
    # actually loaded -- confirm whether the stricter check is intended.
    for key, value in loaded_kwargs.items():
        self.assertIn(key, expected_kwargs)
        self.assertEqual(value, expected_kwargs[key])
def test_serialize_assets(self):
    """Checks that `_serialize_assets` writes the model and training assets.

    Both the "training_assets" and "model" folders must be non-empty, and
    the serialized callbacks must unpickle back to a TensorBoard callback.
    """
    # TF 1.x is not supported
    if utils.is_tf_v1():
        with self.assertRaises(RuntimeError):
            client.cloud_fit(
                self._model,
                x=self._dataset,
                validation_data=self._dataset,
                remote_dir=self._remote_dir,
                job_spec=self._job_spec,
                batch_size=1,
                epochs=2,
                verbose=3,
            )
        return

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=self._remote_dir)
    # The fixture dict itself is updated, not a copy.
    fit_kwargs = self._scalar_fit_kwargs
    fit_kwargs["callbacks"] = [tensorboard_callback]

    client._serialize_assets(self._remote_dir, self._model, **fit_kwargs)

    assets_entries = tf.io.gfile.listdir(
        os.path.join(self._remote_dir, "training_assets"))
    self.assertGreaterEqual(len(assets_entries), 1)

    model_entries = tf.io.gfile.listdir(
        os.path.join(self._remote_dir, "model"))
    self.assertGreaterEqual(len(model_entries), 1)

    # Round-trip the callbacks through the saved training assets.
    training_assets_graph = tf.saved_model.load(
        os.path.join(self._remote_dir, "training_assets"))
    pickled_callbacks = tfds.as_numpy(training_assets_graph.callbacks_fn())
    restored_callbacks = cloudpickle.loads(pickled_callbacks)
    self.assertIsInstance(
        restored_callbacks[0], tf.keras.callbacks.TensorBoard)
def run(remote_dir: Text, distribution_strategy_text: Text) -> None:
    """Loads the serialized model and training assets, fits, and saves.

    Args:
        remote_dir: Temporary cloud storage folder that contains model and
            Dataset graph. This folder is also used for job output.
        distribution_strategy_text: Specifies the distribution strategy for
            remote execution when a jobspec is provided. Accepted values are
            strategy names as specified by
            'tf.distribute.<strategy>.__name__'.
    """
    logging.info("Setting distribution strategy to %s",
                 distribution_strategy_text)

    uses_mwms = (
        distribution_strategy_text == MULTI_WORKER_MIRRORED_STRATEGY_NAME)
    strategy_factory = SUPPORTED_DISTRIBUTION_STRATEGIES[
        distribution_strategy_text]
    strategy = strategy_factory()

    # NOTE(review): the nesting below was reconstructed from source with no
    # usable indentation; confirm that the save step belongs inside the
    # strategy scope.
    with strategy.scope():
        assets_path = os.path.join(remote_dir, "training_assets")
        if utils.is_tf_v1():
            assets = tf.compat.v2.saved_model.load(
                export_dir=assets_path, tags=None)
        else:
            assets = tf.saved_model.load(assets_path)

        # Each training asset is optional; only the ones present in the
        # saved graph are forwarded to model.fit().
        if hasattr(assets, "fit_kwargs_fn"):
            fit_kwargs = tfds.as_numpy(assets.fit_kwargs_fn())
            logging.info("fit_kwargs were loaded successfully.")
        else:
            fit_kwargs = {}

        if hasattr(assets, "x_fn"):
            fit_kwargs["x"] = assets.x_fn()
            logging.info("x was loaded successfully.")

        if hasattr(assets, "y_fn"):
            fit_kwargs["y"] = assets.y_fn()
            logging.info("y was loaded successfully.")

        if hasattr(assets, "validation_data_fn"):
            fit_kwargs["validation_data"] = assets.validation_data_fn()

        if hasattr(assets, "callbacks_fn"):
            callbacks_blob = tfds.as_numpy(assets.callbacks_fn())
            # NOTE: pickle.loads can execute arbitrary code, so the contents
            # of remote_dir must come from a trusted source.
            fit_kwargs["callbacks"] = pickle.loads(callbacks_blob)
            logging.info("callbacks were loaded successfully.")

        model_path = os.path.join(remote_dir, "model")
        model = tf.keras.models.load_model(model_path)
        logging.info("Model was loaded from %s successfully.", model_path)
        model.fit(**fit_kwargs)

        # We need to set a different directory on workers when using MWMS
        # since we will run into errors due to concurrent writes to the same
        # directory. This is a workaround for the issue described in
        # b/148619319.
        if _is_current_worker_chief() or not uses_mwms:
            model.save(os.path.join(remote_dir, "output"))
        else:
            worker_tmp_dir = os.path.join(
                remote_dir, "output/tmp/workers_" + str(uuid.uuid4()))
            logging.info("Saving model from worker in temporary folder %s.",
                         worker_tmp_dir)
            model.save(worker_tmp_dir)
            logging.info("Removing temporary folder %s.", worker_tmp_dir)
            _delete_dir(worker_tmp_dir)
def cloud_fit(model,
              remote_dir,
              region=None,
              project_id=None,
              image_uri=None,
              distribution_strategy=DEFAULT_DISTRIBUTION_STRATEGY,
              job_spec=None,
              job_id=None,
              **fit_kwargs):
    """Serializes a model with its fit arguments and submits a training job.

    Args:
        model: A compiled Keras Model.
        remote_dir: Google Cloud Storage path for temporary assets and AI
            Platform training output. Will overwrite value in job_spec.
        region: Target region for running the AI Platform Training job.
        project_id: Project id where the training should be deployed to.
        image_uri: Base image to use for AI Platform Training.
        distribution_strategy: Specifies the distribution strategy for
            remote execution when a jobspec is provided. Accepted values are
            strategy names as specified by
            'tf.distribute.<strategy>.__name__'.
        job_spec: AI Platform Training job_spec, will take precedence over
            all other provided values except for remote_dir. If none is
            provided a default cluster spec and distribution strategy will
            be used. A provided job_spec is modified in place.
        job_id: A name to use for the AI Platform Training job (mixed-case
            letters, numbers, and underscores only, starting with a letter).
        **fit_kwargs: Args to pass to model.fit() including training and
            eval data. Only keyword arguments are supported. Callback
            functions will be serialized as is, they must be available in
            run time environment.

    Returns:
        AI Platform job ID

    Raises:
        ValueError: If distribution_strategy is not one of the supported
            strategies.
        NotImplementedError: Tensorflow v1.x is not supported.
        RuntimeError: If executing in graph mode, eager execution is
            required for cloud_fit.
    """
    logging.set_verbosity(logging.INFO)

    if distribution_strategy not in SUPPORTED_DISTRIBUTION_STRATEGIES:
        supported_names = list(SUPPORTED_DISTRIBUTION_STRATEGIES.keys())
        raise ValueError(
            "{} is not supported. Supported Strategies are {}".format(
                distribution_strategy, supported_names))

    if utils.is_tf_v1():
        raise NotImplementedError("Tensorflow v1.x is not supported.")

    # Can only export Datasets which were created executing eagerly
    # Raise an error if eager execution is not enabled.
    if not tf.executing_eagerly():
        raise RuntimeError("Eager execution is required for cloud_fit.")

    # The same entry-point arguments are used whether or not the caller
    # supplied a job spec.
    entry_point_args = [
        "--remote_dir",
        remote_dir,
        "--distribution_strategy",
        distribution_strategy,
    ]
    if job_spec:
        job_spec["trainingInput"]["args"] = entry_point_args
    else:
        job_spec = _default_job_spec(
            region=region,
            image_uri=image_uri,
            entry_point_args=entry_point_args,
        )

    _serialize_assets(remote_dir, model, **fit_kwargs)

    # Setting AI Platform Training to use chief in TF_CONFIG environment
    # variable
    # https://cloud.google.com/ai-platform/training/docs/distributed-training-details#chief-versus-master
    job_spec["trainingInput"]["useChiefInTfConfig"] = "True"

    # If job_id is provided overwrite the job_id value.
    if job_id:
        job_spec["job_id"] = job_id

    _submit_job(job_spec, project_id)
    return job_spec["job_id"]