def test_job_id(self, mock_serialize_assets, mock_submit_job):
        """Checks that an explicit ``job_id`` ends up in the submitted job body.

        Args:
            mock_serialize_assets: Not used in the body; presumably a mock
                injected by a patch decorator outside this view.
            mock_submit_job: Mock whose recorded call is inspected to read the
                job body that ``client.cloud_fit`` submitted.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(
                    self._model,
                    x=self._dataset,
                    validation_data=self._dataset,
                    remote_dir=self._remote_dir,
                    job_spec=self._job_spec,
                    batch_size=1,
                    epochs=2,
                    verbose=3,
                )
            return

        test_job_id = "test_job_id"
        client.cloud_fit(
            self._model,
            x=self._dataset,
            validation_data=self._dataset,
            remote_dir=self._remote_dir,
            job_spec=self._job_spec,
            job_id=test_job_id,
            batch_size=1,
            epochs=2,
            verbose=3,
        )

        # call_args unpacks to (positional args, keyword args); the job body is
        # the first positional argument of the submit call.
        submit_args, _ = mock_submit_job.call_args
        body, _ = submit_args

        # Explicit assertions instead of assertDictContainsSubset, which is
        # deprecated in unittest and removed in Python 3.12.
        self.assertIn("job_id", body)
        self.assertEqual(body["job_id"], test_job_id)
    def test_distribution_strategy(self, mock_serialize_assets,
                                   mock_submit_job):
        """Checks how the distribution strategy is forwarded to the job args.

        Covers three cases: no strategy passed, an explicit mirrored strategy,
        and an unknown strategy name that must raise ``ValueError``.

        Args:
            mock_serialize_assets: Not used in the body; presumably a mock
                injected by a patch decorator outside this view.
            mock_submit_job: Mock whose most recent call is inspected to read
                the submitted job body.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(self._model,
                                 x=self._dataset,
                                 remote_dir=self._remote_dir)
            return

        # Case 1: no strategy is passed, so the multi-worker mirrored strategy
        # name is expected in the entry point args.
        client.cloud_fit(self._model,
                         x=self._dataset,
                         remote_dir=self._remote_dir)

        submit_args, _ = mock_submit_job.call_args
        body, _ = submit_args
        training_input = body["trainingInput"]
        # Explicit assertions instead of assertDictContainsSubset, which is
        # deprecated in unittest and removed in Python 3.12.
        self.assertIn("args", training_input)
        self.assertEqual(
            training_input["args"],
            [
                "--remote_dir",
                self._remote_dir,
                "--distribution_strategy",
                MULTI_WORKER_MIRRORED_STRATEGY_NAME,
            ],
        )

        # Case 2: an explicitly requested strategy is forwarded unchanged.
        client.cloud_fit(
            self._model,
            x=self._dataset,
            remote_dir=self._remote_dir,
            distribution_strategy=MIRRORED_STRATEGY_NAME,
            job_spec=self._job_spec,
        )

        submit_args, _ = mock_submit_job.call_args
        body, _ = submit_args
        training_input = body["trainingInput"]
        self.assertIn("args", training_input)
        self.assertEqual(
            training_input["args"],
            [
                "--remote_dir",
                self._remote_dir,
                "--distribution_strategy",
                MIRRORED_STRATEGY_NAME,
            ],
        )

        # Case 3: an unknown strategy name is rejected.
        with self.assertRaises(ValueError):
            client.cloud_fit(
                self._model,
                x=self._dataset,
                remote_dir=self._remote_dir,
                distribution_strategy="not_implemented_strategy",
                job_spec=self._job_spec,
            )
Esempio n. 3
0
def run(remote_dir: Text, distribution_strategy_text: Text) -> None:
    """Restores the serialized training assets and model, then calls fit.

    Args:
        remote_dir: Folder holding the "training_assets" and "model"
            sub-directories that are loaded here.
        distribution_strategy_text: Key into
            SUPPORTED_DISTRIBUTION_STRATEGIES selecting the strategy whose
            scope wraps loading and training.
    """
    logging.info("Setting distribution strategy to %s",
                 distribution_strategy_text)

    strategy_factory = SUPPORTED_DISTRIBUTION_STRATEGIES[
        distribution_strategy_text]

    with strategy_factory().scope():
        assets_path = os.path.join(remote_dir, "training_assets")
        if cloud_fit_utils.is_tf_v1():
            assets = tf.compat.v2.saved_model.load(export_dir=assets_path,
                                                   tags=None)
        else:
            assets = tf.saved_model.load(assets_path)

        # Start from the serialized keyword arguments when present; the
        # remaining pieces are stored as separate functions on the module.
        if hasattr(assets, "fit_kwargs_fn"):
            fit_kwargs = tfds.as_numpy(assets.fit_kwargs_fn())
            logging.info("fit_kwargs were loaded successfully.")
        else:
            fit_kwargs = {}

        if hasattr(assets, "x_fn"):
            fit_kwargs["x"] = assets.x_fn()
            logging.info("x was loaded successfully.")

        if hasattr(assets, "y_fn"):
            fit_kwargs["y"] = assets.y_fn()
            logging.info("y was loaded successfully.")

        if hasattr(assets, "validation_data_fn"):
            fit_kwargs["validation_data"] = assets.validation_data_fn()

        if hasattr(assets, "callbacks_fn"):
            callbacks_blob = tfds.as_numpy(assets.callbacks_fn())
            # NOTE: pickle.loads can execute arbitrary code, so remote_dir
            # must only ever point at trusted assets.
            fit_kwargs["callbacks"] = pickle.loads(callbacks_blob)
            logging.info("callbacks were loaded successfully.")

        model_path = os.path.join(remote_dir, "model")
        model = tf.keras.models.load_model(model_path)
        logging.info("Model was loaded from %s successfully.", model_path)
        model.fit(**fit_kwargs)
Esempio n. 4
0
    def test_run(self):
        """Runs remote.run and checks that its outputs exist and load."""
        if utils.is_tf_v1():
            # TF 1.x is not supported
            return

        remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)

        # Both the output and the logs folders must contain at least one entry.
        output_entries = tf.io.gfile.listdir(self._output_dir)
        self.assertGreaterEqual(len(output_entries), 1)
        log_entries = tf.io.gfile.listdir(self._logs_dir)
        self.assertGreaterEqual(len(log_entries), 1)

        # The saved model must load and evaluate to a positive first value.
        restored_model = tf.keras.models.load_model(self._output_dir)
        first_result = restored_model.evaluate(self._x, self._y)[0]
        zero = np.array([0.0], dtype=np.float32)
        self.assertGreater(first_result, zero)
Esempio n. 5
0
    def test_custom_callback(self):
        """Checks that a serialized custom callback fires once in remote.run."""
        if utils.is_tf_v1():
            # TF 1.x is not supported
            return

        # Reset the shared mock before wiring up the custom callback.
        _MockCallable.reset()

        custom_callbacks = [CustomCallbackExample()]
        self._fit_kwargs["callbacks"] = custom_callbacks
        client._serialize_assets(
            self._remote_dir, self._model, **self._fit_kwargs)

        # Serializing the assets alone must not have invoked the callback.
        _MockCallable.mock_callable.assert_not_called()

        # Running the remote entry point must trigger it exactly once.
        remote.run(self._remote_dir, MIRRORED_STRATEGY_NAME)
        _MockCallable.mock_callable.assert_called_once_with()
    def test_custom_job_spec(self, mock_submit_job):
        """Checks that a caller-supplied job spec is what gets submitted.

        Args:
            mock_submit_job: Mock whose recorded call is inspected to read the
                job body that ``client.cloud_fit`` submitted.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(
                    self._model,
                    x=self._dataset,
                    validation_data=self._dataset,
                    remote_dir=self._remote_dir,
                    job_spec=self._job_spec,
                    batch_size=1,
                    epochs=2,
                    verbose=3,
                )
            return

        client.cloud_fit(
            self._model,
            x=self._dataset,
            validation_data=self._dataset,
            remote_dir=self._remote_dir,
            job_spec=self._job_spec,
            batch_size=1,
            epochs=2,
            verbose=3,
        )

        # call_args unpacks to (positional args, keyword args); the job body is
        # the first positional argument of the submit call.
        submit_args, _ = mock_submit_job.call_args
        body, _ = submit_args
        training_input = body["trainingInput"]

        expected_training_input = {
            "masterConfig": {
                "imageUri": self._image_uri,
            },
            "args": [
                "--remote_dir",
                self._remote_dir,
                "--distribution_strategy",
                MULTI_WORKER_MIRRORED_STRATEGY_NAME,
            ],
        }
        # Explicit per-key assertions instead of assertDictContainsSubset,
        # which is deprecated in unittest and removed in Python 3.12.
        for key, expected_value in expected_training_input.items():
            self.assertIn(key, training_input)
            self.assertEqual(training_input[key], expected_value)
Esempio n. 7
0
    def test_in_memory_data(self):
        """Submits a job with in-memory numpy data and evaluates the result."""
        if utils.is_tf_v1():
            # This test should only run in tf 2.x
            return

        # Give this test its own sub-folder of the remote dir and register it
        # in the list of test folders kept for final clean up.
        remote_dir = os.path.join(self._remote_dir, str(uuid.uuid4()))
        self._test_folders.append(remote_dir)

        x = np.random.random((2, 3))
        y = np.random.randint(0, 2, (2, 2))

        requested_job_id = "cloud_fit_e2e_test_{}_{}".format(
            _BUILD_ID.replace("-", "_"), "test_in_memory_data")
        job_id = client.cloud_fit(
            self._model(),
            x=x,
            y=y,
            remote_dir=remote_dir,
            region=self._region,
            project_id=self._project_id,
            image_uri=self._image_uri,
            job_id=requested_job_id,
            epochs=2,
        )
        logging.info("test_in_memory_data submitted with job id: %s", job_id)

        # The AIP Training job has to finish successfully before evaluating.
        job_succeeded = google_api_client.wait_for_aip_training_job_completion(
            job_id, self._project_id)
        self.assertTrue(job_succeeded)

        # Load the trained model from the "checkpoint" folder of remote_dir.
        checkpoint_dir = os.path.join(remote_dir, "checkpoint")
        trained_model = tf.keras.models.load_model(checkpoint_dir)
        eval_results = trained_model.evaluate(x, y)

        # Accuracy should be better than zero
        self.assertListEqual(trained_model.metrics_names, ["loss", "accuracy"])
        self.assertGreater(eval_results[1], 0)
Esempio n. 8
0
    def test_client_with_tf_1x_raises_error(self):
        """Checks that cloud_fit raises RuntimeError when running on TF 1.x."""
        if not utils.is_tf_v1():
            # This test is only applicable to TF 1.x
            return

        x = np.random.random((2, 3))
        y = np.random.randint(0, 2, (2, 2))

        # TF 1.x is not supported, so the call itself is expected to fail.
        with self.assertRaises(RuntimeError):
            client.cloud_fit(self._model(),
                             x=x,
                             y=y,
                             remote_dir="gs://some_test_dir",
                             region=self._region,
                             project_id=self._project_id,
                             image_uri=self._image_uri,
                             epochs=2)
    def test_fit_kwargs(self, mock_submit_job):
        """Checks the returned job id and the fit kwargs that were serialized.

        Args:
            mock_submit_job: Mock whose recorded call is inspected to read the
                job body that ``client.cloud_fit`` submitted.
        """
        # TF 1.x is not supported
        if utils.is_tf_v1():
            with self.assertRaises(RuntimeError):
                client.cloud_fit(
                    self._model,
                    x=self._dataset,
                    validation_data=self._dataset,
                    remote_dir=self._remote_dir,
                    job_spec=self._job_spec,
                    batch_size=1,
                    epochs=2,
                    verbose=3,
                )
            return
        job_id = client.cloud_fit(
            self._model,
            x=self._dataset,
            validation_data=self._dataset,
            remote_dir=self._remote_dir,
            region=self._region,
            project_id=self._project_id,
            image_uri=self._image_uri,
            batch_size=1,
            epochs=2,
            verbose=3,
        )

        # call_args unpacks to (positional args, keyword args); the job body is
        # the first positional argument of the submit call.
        submit_args, _ = mock_submit_job.call_args
        body, _ = submit_args
        self.assertEqual(body["job_id"], job_id)
        # The args list is ["--remote_dir", <dir>, ...], so index 1 is the dir.
        remote_dir = body["trainingInput"]["args"][1]

        training_assets_graph = tf.saved_model.load(
            os.path.join(remote_dir, "training_assets"))
        loaded_kwargs = tfds.as_numpy(training_assets_graph.fit_kwargs_fn())
        expected_kwargs = {
            "batch_size": 1,
            "epochs": 2,
            "verbose": 3,
        }

        # Explicit assertions instead of assertDictContainsSubset, which is
        # deprecated in unittest and removed in Python 3.12. The direction of
        # the original check is kept: every loaded kwarg must be an expected
        # one with the same value.
        # NOTE(review): this passes when nothing was loaded at all; if the
        # loaded kwargs are meant to match exactly, assert equality instead.
        # Confirm against _serialize_assets.
        for key, value in loaded_kwargs.items():
            self.assertIn(key, expected_kwargs)
            self.assertEqual(value, expected_kwargs[key])
    def test_serialize_assets(self):
        """Serializes assets with a TensorBoard callback and reloads them."""
        if utils.is_tf_v1():
            # TF 1.x is not supported
            with self.assertRaises(RuntimeError):
                client.cloud_fit(self._model,
                                 x=self._dataset,
                                 validation_data=self._dataset,
                                 remote_dir=self._remote_dir,
                                 job_spec=self._job_spec,
                                 batch_size=1,
                                 epochs=2,
                                 verbose=3)
            return

        # The callback is added to the shared scalar fit kwargs in place.
        self._scalar_fit_kwargs["callbacks"] = [
            tf.keras.callbacks.TensorBoard(log_dir=self._remote_dir)
        ]
        client._serialize_assets(
            self._remote_dir, self._model, **self._scalar_fit_kwargs)

        # Both export folders must contain at least one entry.
        assets_dir = os.path.join(self._remote_dir, "training_assets")
        self.assertGreaterEqual(len(tf.io.gfile.listdir(assets_dir)), 1)
        model_dir = os.path.join(self._remote_dir, "model")
        self.assertGreaterEqual(len(tf.io.gfile.listdir(model_dir)), 1)

        # The callbacks are stored pickled; the first one restored must be the
        # TensorBoard callback.
        restored_assets = tf.saved_model.load(
            os.path.join(self._remote_dir, "training_assets"))
        callbacks_blob = tfds.as_numpy(restored_assets.callbacks_fn())
        restored_callbacks = pickle.loads(callbacks_blob)
        self.assertIsInstance(restored_callbacks[0],
                              tf.keras.callbacks.TensorBoard)
Esempio n. 11
0
def run(remote_dir: Text, distribution_strategy_text: Text) -> None:
    """Deserializes Model and Dataset and runs them.

    Besides the generic fit kwargs, this variant understands a set of TFX
    tuner specific entries ("train_files", "eval_files", "label_key",
    "transform_graph_path", "train_batch_size", "eval_batch_size") and turns
    them into ``x`` / ``validation_data`` inputs through ``_input_fn``.

    Args:
        remote_dir: Temporary cloud storage folder that contains model and
            Dataset graph. This folder is also used for job output.
        distribution_strategy_text: Specifies the distribution strategy for
            remote execution when a jobspec is provided. Accepted values are
            strategy names as specified by 'tf.distribute.<strategy>.__name__'.

    Raises:
        KeyError: If ``distribution_strategy_text`` is not a key of
            SUPPORTED_DISTRIBUTION_STRATEGIES.
    """
    logging.info("Setting distribution strategy to %s",
                 distribution_strategy_text)

    distribution_strategy = SUPPORTED_DISTRIBUTION_STRATEGIES[
        distribution_strategy_text]()

    # Specific fit_kwargs for TFX AIP Tuner component. They are consumed
    # below and must not be forwarded to model.fit() as is.
    tfx_fit_kwargs = (
        "train_files",
        "eval_files",
        "label_key",
        "transform_graph_path",
        "train_batch_size",
        "eval_batch_size",
    )

    with distribution_strategy.scope():
        if cloud_fit_utils.is_tf_v1():
            training_assets_graph = tf.compat.v2.saved_model.load(
                export_dir=os.path.join(remote_dir, "training_assets"),
                tags=None)
        else:
            training_assets_graph = tf.saved_model.load(
                os.path.join(remote_dir, "training_assets"))

        fit_kwargs = {}
        if hasattr(training_assets_graph, "fit_kwargs_fn"):
            # Evaluate the serialized function once and reuse the result
            # instead of re-running it for every lookup.
            serialized_kwargs = training_assets_graph.fit_kwargs_fn()

            # Specific fit_kwargs required for TFX tuner_fn.
            train_files = None
            eval_files = None
            transform_graph = None
            label_key = None
            train_batch_size = None
            eval_batch_size = None

            # String values come back as bytes and are decoded to str.
            if "label_key" in serialized_kwargs:
                label_key_byte = tfds.as_numpy(serialized_kwargs["label_key"])
                label_key = label_key_byte.decode("ASCII")
            if "transform_graph_path" in serialized_kwargs:
                transform_graph_path = tfds.as_numpy(
                    serialized_kwargs["transform_graph_path"])
                # Decode the path from byte to string object.
                transform_graph = tft.TFTransformOutput(
                    transform_graph_path.decode("ASCII"))
                logging.info("transform_graph was loaded successfully.")
            if "train_files" in serialized_kwargs:
                train_files_byte = tfds.as_numpy(
                    serialized_kwargs["train_files"])
                train_files = [x.decode("ASCII") for x in train_files_byte]
            if "eval_files" in serialized_kwargs:
                eval_files_byte = tfds.as_numpy(
                    serialized_kwargs["eval_files"])
                eval_files = [x.decode("ASCII") for x in eval_files_byte]

            if "train_batch_size" in serialized_kwargs:
                train_batch_size = tfds.as_numpy(
                    serialized_kwargs["train_batch_size"])
            if "eval_batch_size" in serialized_kwargs:
                eval_batch_size = tfds.as_numpy(
                    serialized_kwargs["eval_batch_size"])

            # Inputs are only built when every piece they need is available.
            if train_files and transform_graph and label_key and train_batch_size:  # pylint: disable=line-too-long
                fit_kwargs["x"] = _input_fn(train_files,
                                            transform_graph,
                                            label_key,
                                            batch_size=train_batch_size)
                logging.info("x was loaded successfully.")

            if eval_files and transform_graph and label_key and eval_batch_size:
                fit_kwargs["validation_data"] = _input_fn(
                    eval_files,
                    transform_graph,
                    label_key,
                    batch_size=eval_batch_size)
                logging.info("validation data was loaded successfully.")

            # deserialize the rest of the fit_kwargs
            for k in serialized_kwargs.keys():
                if k not in tfx_fit_kwargs:
                    fit_kwargs[k] = tfds.as_numpy(serialized_kwargs[k])
            logging.info("fit_kwargs were loaded successfully.")

        # Explicitly serialized x / y / validation_data take precedence over
        # anything derived from the TFX entries above.
        if hasattr(training_assets_graph, "x_fn"):
            fit_kwargs["x"] = training_assets_graph.x_fn()
            logging.info("x was loaded successfully.")

        if hasattr(training_assets_graph, "y_fn"):
            fit_kwargs["y"] = training_assets_graph.y_fn()
            logging.info("y was loaded successfully.")

        if hasattr(training_assets_graph, "validation_data_fn"):
            fit_kwargs["validation_data"] = (
                training_assets_graph.validation_data_fn())

        if hasattr(training_assets_graph, "callbacks_fn"):
            pickled_callbacks = tfds.as_numpy(
                training_assets_graph.callbacks_fn())
            # NOTE: pickle.loads can execute arbitrary code, so remote_dir
            # must only ever point at trusted assets.
            fit_kwargs["callbacks"] = pickle.loads(pickled_callbacks)
            logging.info("callbacks were loaded successfully.")

        model = tf.keras.models.load_model(os.path.join(remote_dir, "model"))
        logging.info("Model was loaded from %s successfully.",
                     os.path.join(remote_dir, "model"))
        model.fit(**fit_kwargs)
Esempio n. 12
0
def cloud_fit(model: tf.keras.Model,
              remote_dir: Text,
              region: Optional[Text] = None,
              project_id: Optional[Text] = None,
              image_uri: Optional[Text] = None,
              distribution_strategy: Text = DEFAULT_DISTRIBUTION_STRATEGY,
              job_spec: Optional[Dict[str, Any]] = None,
              job_id: Optional[Text] = None,
              **fit_kwargs) -> Text:
    """Serializes a model with its fit arguments and submits a training job.

    Args:
        model: A compiled Keras Model.
        remote_dir: Google Cloud Storage path for temporary assets and
            AI Platform training output. Always written into the job args,
            including when a job_spec is supplied.
        region: Target region for running the AI Platform Training job. Only
            used to build the default job spec.
        project_id: Project id where the training should be deployed to.
        image_uri: Base image to use for AI Platform Training. Only used to
            build the default job spec.
        distribution_strategy: Name of the distribution strategy for remote
            execution; must be a key of SUPPORTED_DISTRIBUTION_STRATEGIES.
        job_spec: AI Platform Training job_spec. When given it is used in
            place of the default spec and is updated in place (its
            "trainingInput" args, "useChiefInTfConfig" flag and, if job_id is
            set, its "job_id"). When falsy, a default spec is built.
        job_id: A name to use for the AI Platform Training job (mixed-case
            letters, numbers, and underscores only, starting with a letter).
        **fit_kwargs: Args to pass to model.fit() including training and eval
            data. Only keyword arguments are supported. Callback functions will
            be serialized as is, they must be available in run time environment.

    Returns:
        AI Platform job ID

    Raises:
        ValueError: If distribution_strategy is not a supported strategy.
        NotImplementedError: Tensorflow v1.x is not supported.
        RuntimeError: If executing in graph mode, eager execution is required
            for cloud_fit.
    """
    logging.set_verbosity(logging.INFO)

    strategy_is_supported = (
        distribution_strategy in SUPPORTED_DISTRIBUTION_STRATEGIES)
    if not strategy_is_supported:
        supported_names = list(SUPPORTED_DISTRIBUTION_STRATEGIES.keys())
        raise ValueError(
            "{} is not supported. Supported Strategies are {}".format(
                distribution_strategy, supported_names))

    if cloud_fit_utils.is_tf_v1():
        raise NotImplementedError("Tensorflow v1.x is not supported.")

    # Datasets can only be exported when they were created executing eagerly,
    # so graph mode is rejected up front.
    if not tf.executing_eagerly():
        raise RuntimeError("Eager execution is required for cloud_fit.")

    # The same entry point arguments are used for a custom and a default spec.
    entry_point_args = [
        "--remote_dir",
        remote_dir,
        "--distribution_strategy",
        distribution_strategy,
    ]
    if job_spec:
        job_spec["trainingInput"]["args"] = entry_point_args
    else:
        job_spec = _default_job_spec(region=region,
                                     image_uri=image_uri,
                                     entry_point_args=entry_point_args)

    _serialize_assets(remote_dir, model, **fit_kwargs)

    # Make AI Platform Training use "chief" in the TF_CONFIG environment
    # variable.
    # https://cloud.google.com/ai-platform/training/docs/distributed-training-details#chief-versus-master  # pylint: disable=line-too-long
    training_input = job_spec["trainingInput"]
    training_input["useChiefInTfConfig"] = "True"

    # An explicitly provided job_id replaces the one in the spec.
    if job_id:
        job_spec["job_id"] = job_id

    _submit_job(job_spec, project_id)
    return job_spec["job_id"]
def run(remote_dir: Text, distribution_strategy_text: Text) -> None:
    """Restores the serialized assets and model, trains, and saves the model.

    Args:
        remote_dir: Folder holding the "training_assets" and "model"
            sub-directories that are loaded here; the trained model is saved
            under its "output" sub-directory.
        distribution_strategy_text: Key into
            SUPPORTED_DISTRIBUTION_STRATEGIES selecting the strategy whose
            scope wraps loading and training.
    """
    logging.info("Setting distribution strategy to %s",
                 distribution_strategy_text)

    uses_multi_worker = (
        distribution_strategy_text == MULTI_WORKER_MIRRORED_STRATEGY_NAME)

    strategy_factory = SUPPORTED_DISTRIBUTION_STRATEGIES[
        distribution_strategy_text]

    with strategy_factory().scope():
        assets_path = os.path.join(remote_dir, "training_assets")
        if cloud_fit_utils.is_tf_v1():
            assets = tf.compat.v2.saved_model.load(export_dir=assets_path,
                                                   tags=None)
        else:
            assets = tf.saved_model.load(assets_path)

        # Start from the serialized keyword arguments when present; the
        # remaining pieces are stored as separate functions on the module.
        if hasattr(assets, "fit_kwargs_fn"):
            fit_kwargs = tfds.as_numpy(assets.fit_kwargs_fn())
            logging.info("fit_kwargs were loaded successfully.")
        else:
            fit_kwargs = {}

        if hasattr(assets, "x_fn"):
            fit_kwargs["x"] = assets.x_fn()
            logging.info("x was loaded successfully.")

        if hasattr(assets, "y_fn"):
            fit_kwargs["y"] = assets.y_fn()
            logging.info("y was loaded successfully.")

        if hasattr(assets, "validation_data_fn"):
            fit_kwargs["validation_data"] = assets.validation_data_fn()

        if hasattr(assets, "callbacks_fn"):
            callbacks_blob = tfds.as_numpy(assets.callbacks_fn())
            # NOTE: pickle.loads can execute arbitrary code, so remote_dir
            # must only ever point at trusted assets.
            fit_kwargs["callbacks"] = pickle.loads(callbacks_blob)
            logging.info("callbacks were loaded successfully.")

        model_path = os.path.join(remote_dir, "model")
        model = tf.keras.models.load_model(model_path)
        logging.info("Model was loaded from %s successfully.", model_path)
        model.fit(**fit_kwargs)

    # The chief, and every process when MWMS is not in use, saves straight to
    # the shared output folder.
    if _is_current_worker_chief() or not uses_multi_worker:
        model.save(os.path.join(remote_dir, "output"))
        return

    # Non-chief MWMS workers need a different directory, since concurrent
    # writes to the same directory lead to errors. This is a workaround for
    # the issue described in b/148619319.
    tmp_worker_dir = os.path.join(
        remote_dir, "output/tmp/workers_" + str(uuid.uuid4()))
    logging.info("Saving model from worker in temporary folder %s.",
                 tmp_worker_dir)
    model.save(tmp_worker_dir)

    logging.info("Removing temporary folder %s.", tmp_worker_dir)
    _delete_dir(tmp_worker_dir)