Example #1
0
    def testMaxCheckpointVersions(self):
        """Run a short training job that checkpoints every 2 steps and
        verify that only the 5 most recent checkpoint files survive."""
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
            os.makedirs(chkp_dir)
            # Save a checkpoint every 2 steps; retain at most 5 of them.
            ckpt_service = CheckpointService(chkp_dir, 2, 5, False)
            self.assertTrue(ckpt_service.is_enabled())

            minibatch = 2
            # Launch the training worker.
            worker_args = parse_worker_args(
                [
                    "--worker_id",
                    1,
                    "--job_type",
                    JobType.TRAINING_ONLY,
                    "--minibatch_size",
                    minibatch,
                    "--model_zoo",
                    _model_zoo_path,
                    "--model_def",
                    "test_module.custom_model",
                ]
            )
            worker = Worker(worker_args)

            record_file = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
            dispatcher = _TaskDispatcher(
                {record_file: (0, 128)},
                {},
                {},
                records_per_task=64,
                num_epochs=1,
            )
            master = MasterServicer(
                2,
                minibatch,
                worker._opt_fn(),
                dispatcher,
                init_var=worker._model.trainable_variables,
                checkpoint_filename_for_init="",
                checkpoint_service=ckpt_service,
                evaluation_service=None,
            )

            worker._stub = InProcessMaster(master)
            worker.run()

            # Only the 5 newest checkpoints should remain after training.
            expected_versions = [24, 26, 28, 30, 32]
            self.assertEqual(
                sorted(os.listdir(ckpt_service._directory)),
                ["model_v%d.chkpt" % v for v in expected_versions],
            )
            # The latest checkpointed version should be 32.
            self.assertEqual(32, ckpt_service.get_latest_checkpoint_version())
            # Every surviving checkpoint loads back with its own version.
            for version in expected_versions:
                self.assertEqual(
                    version,
                    ckpt_service.get_checkpoint_model(version).version,
                )
            # Asking for a pruned/unknown version must fail loudly.
            self.assertRaisesRegex(
                RuntimeError,
                "Failed to read model checkpoint from file",
                ckpt_service.get_checkpoint_model,
                100,
            )
Example #2
0
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation.

    Holds the global model variables, aggregates gradients reported by
    workers (synchronous or asynchronous SGD), serves tasks and model
    snapshots, and triggers checkpointing/evaluation as the model
    version advances.
    """

    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        embedding_service_endpoint=None,
        embedding_dims=None,
        lr_staleness_modulation=False,
        use_async=False,
    ):
        """Initialize the servicer.

        Args:
            grads_to_wait: number of gradient reports to accumulate
                before applying them (sync SGD only).
            minibatch_size: minibatch size sent back to workers in tasks.
            optimizer: optimizer used to apply gradients.
            task_d: task dispatcher providing training/evaluation tasks.
            init_var: variable list used to initialize the model when no
                checkpoint file is given.
            checkpoint_filename_for_init: checkpoint file to restore the
                model (and its version) from; takes precedence over
                `init_var`.
            checkpoint_service: service used to save/load checkpoints.
            evaluation_service: optional evaluation service.
            embedding_service_endpoint: endpoint of the distributed
                embedding storage; non-None means ElasticDL embedding
                layers are used.
            embedding_dims: mapping of embedding layer name to dimension;
                defaults to an empty dict.
            lr_staleness_modulation: modulate the learning rate by
                gradient staleness (async SGD only).
            use_async: use asynchronous SGD.
        """
        # Fix: the previous `embedding_dims={}` default was a shared
        # mutable default argument; create a fresh dict per call instead.
        if embedding_dims is None:
            embedding_dims = {}
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._embedding_service_endpoint = embedding_service_endpoint
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(
            optimizer, embedding_service_endpoint, embedding_dims, use_async
        )

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set model variable. Value should be a float32 ndarray"""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name)
        )

    def _modulate_lr_if_needed(self, opt):
        """Attach staleness-aware LR modulation to `opt` when enabled.

        Only active for async SGD with `lr_staleness_modulation`;
        otherwise `self._lr_modulation` is set to None.
        """
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        """Initialize model variables from a list of tf variables."""
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_dict(self, tensor_dict):
        """Initialize model variables from a name -> tensor pb mapping."""
        assert tensor_dict
        for name, val in tensor_dict.items():
            self.set_model_var(name, tensor_to_ndarray(val))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        """Initialize the model from a checkpoint file, a variable list,
        or leave it empty to be initialized by the first worker update."""
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_dict(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            logger.info(
                "Model is not initialized. It will be "
                "initialized by the first update from "
                "the worker."
            )

    def _init_optimizer(
        self, opt, embedding_service_endpoint, embedding_dims, use_async
    ):
        """Return the optimizer to use, wrapping it when ElasticDL
        embedding layers are in play."""
        # `embedding_service_endpoint` is not None means ElasticDL embedding
        # layers are used
        self._modulate_lr_if_needed(opt)
        if embedding_service_endpoint:
            return OptimizerWrapper(
                opt, embedding_service_endpoint, embedding_dims, use_async
            )
        return opt

    @staticmethod
    def var_name_encode(name):
        """Make a tf variable name safe for use as a model key."""
        return name.replace(":", "-")

    def GetTask(self, request, _):
        """gRPC handler: hand the next task to the requesting worker."""
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        task_id, task = self._task_d.get(request.worker_id)
        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            # For evaluation task, it will use the fixed version model
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif not self._task_d.finished():
            # Not all tasks are finished, wait in case of new tasks later.
            res.type = elasticdl_pb2.WAIT
        return res

    def GetModel(self, request, _):
        """gRPC handler: return the current model, or a checkpointed one
        for a fixed requested version."""
        if not self._use_async:
            self._validate_model_version(request.version)

        if (
            request.method == elasticdl_pb2.MINIMUM
            or request.version == self._version
        ):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version
            )
        except Exception:
            logger.error(
                "Failed to fetch checkpoint model for "
                "model version {}".format(request.version)
            )
        return pb_model

    def _update_model_version(self):
        # Caller must hold self._lock.
        assert self._lock.locked()
        self._version += 1

    def _update_edl_embedding_table(self, name_var_list):
        """
            Put updated embedding vectors' ids and values together
            and use EmbeddingService.update_embedding() to update
            embedding table in the distributed storage
        """
        keys = []
        embeddings = []
        for layer_name, unique_ids, embedding_var in name_var_list:
            keys.extend(
                [
                    Embedding.get_key([layer_name, i])
                    for i in unique_ids.numpy()
                ]
            )
            embeddings.extend([i for i in embedding_var.numpy()])

        if embeddings:
            EmbeddingService.update_embedding(
                keys=keys,
                embedding_vectors=embeddings,
                embedding_service_endpoint=self._embedding_service_endpoint,
            )

    def _update_model(self):
        """Apply accumulated gradients, bump the model version, and run
        the evaluation/checkpoint hooks."""
        grad_var = []

        # (grad, var) pairs excluding keras Embedding layer and
        # ElasticDL Embedding layer
        for k in self._gradient_sum:
            if not self._use_async:
                # Sync SGD averages over the accumulated reports.
                self._gradient_sum[k] = (
                    self._gradient_sum[k] / self._grad_to_wait
                )
            grad_var.append((self._gradient_sum[k], self._model[k]))

        # (grad, var) pair of Keras Embedding layer
        for k in self._gradient_sum_indexed:
            grad_var.append((self._gradient_sum_indexed[k], self._model[k]))

        # (grad, var) pair of ElasticDL Embedding layer
        if self._edl_embedding_gradients:
            for layer_name, grads in self._edl_embedding_gradients.items():
                grad_var.append((grads, layer_name))

        self._opt.apply_gradients(grad_var)

        # Need the lock for model version update in async SGD. Fix: use
        # try/finally so an exception in the hooks cannot leave the lock
        # held forever.
        if self._use_async:
            self._lock.acquire()
        try:
            self._update_model_version()
            self._update_evaluation()
            self._update_checkpoint()
        finally:
            if self._use_async:
                self._lock.release()
        if not self._use_async:
            # Sync SGD starts a fresh accumulation round.
            self._gradient_sum.clear()
            self._gradient_sum_indexed.clear()
            self._edl_embedding_gradients.clear()
            self._grad_n = 0

    def get_model_version(self):
        """Return the current model version."""
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        """Save a checkpoint of the current model.

        Returns the checkpointed version, or None if saving failed.
        Fix: release the lock in a `finally` block — previously an
        exception raised while `locking=True` left the lock held.
        """
        try:
            logger.info(
                "Saving checkpoint for model version %d" % self._version
            )
            if locking:
                self._lock.acquire()
            try:
                pb_model = self._get_model_no_lock()
                self._checkpoint_service.save(
                    self._version, pb_model, is_eval_checkpoint
                )
                checkpoint_version = self._version
            finally:
                if locking:
                    self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d"
                % self._version
            )

    def save_latest_checkpoint(self, output_path):
        """Checkpoint the current model and copy it to `output_path`."""
        if self._checkpoint_service is None:
            # One-shot checkpoint service just for this export.
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version()
        )
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        """Kick off an evaluation task if the evaluation service wants one."""
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False
            )

    def _update_checkpoint(self):
        """Save a checkpoint when the checkpoint service says it's due."""
        if (
            self._checkpoint_service
            and self._checkpoint_service.need_to_checkpoint(self._version)
        ):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        # Caller is responsible for locking (or running in async mode).
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            pb_model.param[k].CopyFrom(ndarray_to_tensor(v.numpy()))
        return pb_model

    def _validate_model_version(self, request_model_version):
        """Raise ValueError for a future version; return whether the
        requested version equals the current one."""
        if request_model_version > self._version:
            err_msg = (
                "Model version %d not available yet, "
                "current version: %d" % (request_model_version, self._version)
            )
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        """gRPC handler: lazily initialize the model from a worker's
        variables (only the first report wins)."""
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_dict(request.variable)
        return empty_pb2.Empty()

    def ReportGradient(self, request, _):
        """gRPC handler: validate and accumulate a worker's gradients."""
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version
        )

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        tmp = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for k, v in request.gradient.items():
            if k not in self._model:
                if v.indices:
                    # grads of ElasticDL Embedding layer
                    # TODO: check arr.shape[1] = embedding_dim of this
                    # EdlEmbedding layer
                    arr = tensor_to_ndarray(v)
                    edl_embedding_gradients[k] = arr
                    continue
                else:
                    # Fix: format the message (the trailing ", k" form
                    # put a tuple into ValueError instead of the key).
                    raise ValueError(
                        "Gradient key: %s is not part of model" % k
                    )

            arr = tensor_to_ndarray(v)
            if isinstance(arr, tf.IndexedSlices):
                if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d"
                        % (
                            k,
                            arr.values.shape[1],
                            self._model[k].numpy().shape[1],
                        )
                    )

                max_index = tf.math.reduce_max(arr.indices).numpy()
                if max_index >= self._model[k].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d"
                        % (k, max_index, self._model[k].numpy().shape[0] - 1)
                    )
                indexed_grads[k] = arr
            else:
                if arr.shape != self._model[k].numpy().shape:
                    # Fix: same printf-style formatting bug as above.
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % k
                    )
                tmp[k] = arr

        # Fix: release the lock in a `finally` block so an exception in
        # _process_gradients cannot leave the master deadlocked.
        if not self._use_async:
            self._lock.acquire()
        try:
            self._process_gradients(
                edl_embedding_gradients, indexed_grads, tmp,
                request.model_version,
            )
        finally:
            if not self._use_async:
                self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res

    def _process_gradients(
        self, edl_embedding_gradients, indexed_grads, grads, request_version
    ):
        """Accumulate validated gradients and apply them when enough
        reports have arrived (sync) or immediately (async)."""
        if not self._use_async:
            # grads of ElasticDL Embedding layer
            for k, v in edl_embedding_gradients.items():
                if k in self._edl_embedding_gradients:
                    self._edl_embedding_gradients[k] = merge_indexed_slices(
                        self._edl_embedding_gradients[k], v
                    )
                else:
                    self._edl_embedding_gradients[k] = v

            # grads of Keras Embedding layer
            for k, v in indexed_grads.items():
                if k not in self._gradient_sum_indexed:
                    self._gradient_sum_indexed[k] = v
                else:
                    grads_s = self._gradient_sum_indexed[k]
                    self._gradient_sum_indexed[k] = merge_indexed_slices(
                        grads_s, v
                    )

            # other grads (the redundant `not self._use_async` re-check
            # inside this sync-only branch was removed)
            for k, v in grads.items():
                if k in self._gradient_sum:
                    self._gradient_sum[k] = self._gradient_sum[k] + v
                else:
                    self._gradient_sum[k] = v
            self._grad_n += 1
        else:
            # TODO: do not accumulate gradients but apply directly.
            pass

        # staleness-aware learning rate modulation
        if self._lr_modulation:
            staleness = max(1, self._version - request_version)
            self._lr_modulation.set_multiplier(1.0 / staleness)
        if self._use_async or self._grad_n >= self._grad_to_wait:
            self._update_model()

    def ReportTaskResult(self, request, _):
        """gRPC handler: mark a task finished (or failed) with the
        dispatcher."""
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request.task_id, False)
        else:
            self._task_d.report(request.task_id, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        """gRPC handler: forward evaluation metrics to the evaluation
        service."""
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.evaluation_metrics
        )
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res
Example #3
0
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation.

    Holds the global model variables, serves tasks and model snapshots
    to workers, and drives checkpointing and evaluation as the model
    version advances.
    """

    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        lr_staleness_modulation=False,
        use_async=False,
    ):
        """Initialize the servicer.

        Args:
            grads_to_wait: number of gradient reports to wait for before
                applying them (sync SGD only).
            minibatch_size: minibatch size sent back to workers in tasks.
            optimizer: optimizer to wrap/apply; may be falsy for workers
                that never update the model.
            task_d: task dispatcher providing training/evaluation tasks.
            init_var: variable list used to initialize the model when no
                checkpoint file is given.
            checkpoint_filename_for_init: checkpoint file to restore the
                model (and its version) from; takes precedence over
                `init_var`.
            checkpoint_service: service used to save/load checkpoints.
            evaluation_service: optional evaluation service.
            lr_staleness_modulation: modulate the learning rate by
                gradient staleness (async SGD only).
            use_async: use asynchronous SGD.
        """
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead of ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(optimizer, use_async)

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set model variable. Value should be a float32 ndarray"""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name))

    def _modulate_lr_if_needed(self, opt):
        """Attach staleness-aware LR modulation to `opt` when enabled;
        otherwise `self._lr_modulation` is None."""
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        """Initialize model variables from a list of tf variables."""
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_pb_list(self, tensor_pb_list):
        """Initialize model variables from a list of tensor protobufs."""
        assert tensor_pb_list
        for pb in tensor_pb_list:
            self.set_model_var(pb.name, tensor_pb_to_ndarray(pb))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        """Initialize the model from a checkpoint file, a variable list,
        or leave it empty to be initialized by the first worker update."""
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_pb_list(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            # Fix: "intialized" typo in the log message.
            logger.info("Model is not initialized. It will be "
                        "initialized by the first update from "
                        "the worker.")

    # TODO: remove this function
    def _init_optimizer(self, opt, use_async):
        """Wrap a truthy optimizer in OptimizerWrapper; pass through a
        falsy one unchanged."""
        self._modulate_lr_if_needed(opt)
        if opt:
            return OptimizerWrapper(opt, use_async)
        return opt

    @staticmethod
    def var_name_encode(name):
        """Make a tf variable name safe for use as a model key."""
        return name.replace(":", "-")

    def GetTask(self, request, _):
        """gRPC handler: hand the next (training or evaluation) task to
        the requesting worker, or tell it to wait."""
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        if request.task_type == elasticdl_pb2.EVALUATION:
            task_id, task = self._task_d.get_eval_task(request.worker_id)
        else:
            task_id, task = self._task_d.get(request.worker_id)

        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            for k, v in task.extended_config.items():
                res.extended_config[k] = v

            # For evaluation task, it will use the fixed version model
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif (not self._task_d.finished()) or (
                self._task_d.invoke_deferred_callback()):
            # If the todo and doing tasks are not empty,
            # Otherwise if the callback list is not empty,
            # we are trying to pop and invoke the callback.
            # Then the master tells the worker to wait
            # in case of new tasks later.
            res.type = elasticdl_pb2.WAIT

        return res

    def GetModel(self, request, _):
        """gRPC handler: return the current model, or a checkpointed one
        for a fixed requested version."""
        if not self._use_async:
            self._validate_model_version(request.version)

        if (request.method == elasticdl_pb2.MINIMUM
                or request.version == self._version):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version)
        except Exception:
            logger.error("Failed to fetch checkpoint model for "
                         "model version {}".format(request.version))
        return pb_model

    def get_model_version(self):
        """Return the current model version."""
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        """Save a checkpoint of the current model.

        Returns the checkpointed version, or None if saving failed.
        Fix: release the lock in a `finally` block — previously an
        exception raised while `locking=True` left the lock held.
        """
        try:
            logger.info("Saving checkpoint for model version %d" %
                        self._version)
            if locking:
                self._lock.acquire()
            try:
                pb_model = self._get_model_no_lock()
                self._checkpoint_service.save(self._version, pb_model,
                                              is_eval_checkpoint)
                checkpoint_version = self._version
            finally:
                if locking:
                    self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d" %
                self._version)

    def save_latest_checkpoint(self, output_path):
        """Checkpoint the current model and copy it to `output_path`."""
        if self._checkpoint_service is None:
            # One-shot checkpoint service just for this export.
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version())
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        """Kick off an evaluation task for the current version if the
        evaluation service wants one."""
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=self._version)

    def _update_checkpoint(self):
        """Save a checkpoint when the checkpoint service says it's due."""
        if (self._checkpoint_service and
                self._checkpoint_service.need_to_checkpoint(self._version)):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        # Caller is responsible for locking (or running in async mode).
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            emplace_tensor_pb_from_ndarray(pb_model.param, v.numpy(), name=k)
        return pb_model

    def _validate_model_version(self, request_model_version):
        """Raise ValueError for a future version; return whether the
        requested version equals the current one."""
        if request_model_version > self._version:
            err_msg = ("Model version %d not available yet, "
                       "current version: %d" %
                       (request_model_version, self._version))
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        """gRPC handler: lazily initialize the model from a worker's
        variables (only the first report wins)."""
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_pb_list(request.variable)
        return empty_pb2.Empty()

    def ReportTaskResult(self, request, _):
        """gRPC handler: mark a task finished (or failed) with the
        dispatcher."""
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request, False)
        else:
            self._task_d.report(request, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        """gRPC handler: forward model outputs/labels to the evaluation
        service for metric computation."""
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.model_outputs, request.labels)
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res

    def ReportVersion(self, request, _):
        """gRPC handler: a worker reports its model version; possibly
        schedule an evaluation task at that version."""
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=request.model_version)
        return empty_pb2.Empty()