Example #1
    def test_delete_variables(self):
        params = Parameters()
        embed_layers = ["test_1", "test_2"]
        slot_names = ["m", "v"]
        dim = 8
        # Register two embedding tables plus an "m" and "v" slot table for
        # each, matching the slots that Adam keeps per variable.
        for layer in embed_layers:
            params.embedding_params[layer] = EmbeddingTable(layer, dim)
            for slot in slot_names:
                slot_key = get_slot_table_name(layer, slot)
                params.embedding_params[slot_key] = EmbeddingTable(
                    slot_key, dim, "0.0", True)

        opt = Adam()
        opt_wrapper = OptimizerWrapper(opt, None, params.get_embedding_param,
                                       params.set_embedding_param)

        opt_wrapper._init_thread_local()
        for name in embed_layers:
            opt_wrapper._tls._unique_ids_all_layers[name] = np.ndarray(
                [2], np.int32)
            opt_wrapper._create_embedding_variable(
                name, np.ndarray([2, dim], np.float32))
            opt_wrapper._get_slot_and_set_to_optimizer(name)

        self.assertEqual(len(opt._weights), 4)
        self.assertEqual(len(opt._slots), 2)
        for slot_dict in opt._slots.values():
            self.assertEqual(len(slot_dict), 2)

        opt_wrapper._delete_slots_and_weights_in_optimizer()
        self.assertEqual(len(opt._weights), 0)
        self.assertEqual(len(opt._slots), 0)
Example #2
 def wrap_optimizer(self):
     self._optimizer = OptimizerWrapper(
         self._optimizer,
         self._use_async,
         self._parameters.get_embedding_param,
         self._parameters.set_embedding_param,
     )
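Example #2 only shows the wrapping step. Below is a hedged sketch of how such a wrapped optimizer is then driven, mirroring the `apply_gradients` calls in Examples #5 and #8: dense parameters go in as ordinary (gradient, variable) pairs, while embedding gradients are paired with the embedding-table name. The method name `_apply_gradients_sketch`, the tensor values, and the 8-wide `user_emb` table are assumptions for illustration, not project code; `tf` is assumed to be imported as in the surrounding examples.

 def _apply_gradients_sketch(self):
     # Dense parameters are passed as ordinary (gradient, tf.Variable) pairs.
     dense_var = tf.Variable(tf.zeros((3,)))
     dense_grad = tf.constant([0.1, 0.2, 0.3])
     # Embedding gradients are tf.IndexedSlices paired with the table's name
     # instead of a tf.Variable; the wrapper looks the rows up through the
     # lookup function it was given and writes the updated rows back.
     embed_grad = tf.IndexedSlices(
         values=tf.ones((2, 8)),       # one row per unique id
         indices=tf.constant([1, 5]),  # ids in the hypothetical "user_emb" table
     )
     self._optimizer.apply_gradients(
         [(dense_grad, dense_var), (embed_grad, "user_emb")]
     )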
Example #3
    def test_set_slot_to_optimizer(self):
        embed_name = "test_emb"
        indices = np.ndarray([2], dtype=np.int32)
        embed_values = np.ndarray([2, 2], dtype=np.float32)
        slot_values = {
            "m": np.ndarray([2, 2], dtype=np.float32),
            "v": np.ndarray([2, 2], dtype=np.float32),
        }
        params = Parameters()
        params.embedding_params[embed_name] = EmbeddingTable(embed_name, 8)
        for slot in ["m", "v"]:
            slot_table_name = get_slot_table_name(embed_name, slot)
            params.embedding_params[slot_table_name] = EmbeddingTable(
                slot_table_name, 2, "0.0", True)

        opt = Adam()
        opt_wrapper = OptimizerWrapper(opt, None, params.get_embedding_param)
        opt_wrapper._init_thread_local()

        opt_wrapper._tls._unique_ids_all_layers[embed_name] = indices
        opt_wrapper._create_embedding_variable(embed_name, embed_values)
        opt_wrapper._get_slot_and_set_to_optimizer(embed_name)

        self.assertEqual(len(opt._slots), 1)
        opt_slots = list(opt._slots.values())[0]
        self.assertEqual(sorted(opt_slots.keys()), ["m", "v"])
        for name in ["m", "v"]:
            self.assertTrue(
                np.allclose(opt_slots[name].numpy(), slot_values[name]))
Example #4
    def test_update_embedding_param(self):
        params = Parameters()
        for name in ["test_1", "test_2"]:
            params.embedding_params[name] = EmbeddingTable(name, 8)
            slot_key = get_slot_table_name(name, "momentum")
            params.embedding_params[slot_key] = EmbeddingTable(
                slot_key, 8, "0.0", True)

        indices = {
            "test_1": np.array([1, 5]),
            "test_2": np.array([10]),
        }
        embed_vars = {
            "test_1": tf.Variable(np.random.rand(2, 8).astype(np.float32)),
            "test_2": tf.Variable(np.random.rand(1, 8).astype(np.float32)),
        }
        slot_vars = {
            "test_1": {
                "momentum":
                tf.Variable(np.random.rand(2, 8).astype(np.float32))
            },
            "test_2": {
                "momentum":
                tf.Variable(np.random.rand(1, 8).astype(np.float32))
            },
        }

        opt = SGD(momentum=0.1)
        opt_wrapper = OptimizerWrapper(opt, None, None,
                                       params.set_embedding_param)
        opt_wrapper._tls._unique_ids_all_layers = indices
        opt_wrapper._tls._embed_variables = embed_vars
        opt_wrapper._tls._slot_variables = slot_vars
        opt_wrapper._update_embedding_param()

        for name in ["test_1", "test_2"]:
            self.assertTrue(
                np.allclose(
                    embed_vars[name].numpy(),
                    params.get_embedding_param(name, indices[name]),
                ))

            slot = "momentum"
            slot_table_name = get_slot_table_name(name, slot)
            self.assertTrue(
                np.allclose(
                    slot_vars[name][slot].numpy(),
                    params.get_embedding_param(slot_table_name, indices[name]),
                ))
Example #5
def _train_edl_embedding_with_optimizer_wrapper(model, opt_keras, X, Y,
                                                loss_fn, params, random_seed):
    """Train model with optimizer wrapper."""
    tf.random.set_seed(random_seed)
    opt_wrapper = OptimizerWrapper(
        opt_keras,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    embed_layers = find_layer(model, Embedding)

    # initialize slot params
    params.create_slot_params(opt_wrapper.allowed_slot_names,
                              opt_wrapper.slot_initial_value)

    # initialize ElasticDL embedding layer
    for layer in embed_layers:
        layer.set_lookup_embedding_func(params.get_embedding_param)

    # training process
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Get non-embedding variables inside the loop because the model only
        # creates its variables after the first call to `model.call`
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers)
        embed_items = []
        for layer in embed_layers:
            embed_items.extend([(bet, layer.name, ids)
                                for bet, ids in layer.embedding_and_ids])

        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items])

        # TODO: gradients from the same embedding layer will not need to be
        # merged here once `optimizer_wrapper` supports grads_and_vars with
        # duplicated layer names
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(embed_items,
                                              grads[non_embed_vars_n:]):
            if layer_name in embed_grads_dict:
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids)

        opt_wrapper.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars)) +
            [(grad, layer_name)
             for layer_name, grad in embed_grads_dict.items()])

        for layer in embed_layers:
            layer.reset()
Example #6
    def _test_async_correctness(
        self,
        grads_and_vars_batches,
        embed_values,
        expected_non_embed_values,
        expected_embed_values=None,
    ):
        """Checks the correctness of async OptimizerWrapper. This function
        creates many threads and these threads call
        `OptimizerWrapper.apply_gradients` simultaneously.

        Args:
            grads_and_vars_batches: A python list of `grads_and_vars`. Every
                thread takes a `grads_and_vars` and calls `apply_gradients`.
            embed_values: A python dictionary of
                `(layer_name, embedding table)`.
            expected_non_embed_values: A python list of expected non-embedding
                values after applying gradients.
            expected_embed_values: A python dictionary of expected embedding
                values after applying gradients. If None, the embedding values
                are not checked.
        """
        thread_num = len(grads_and_vars_batches)
        input_dims = {}
        embed_var_n = len(embed_values)
        params = Parameters()
        for layer, values in embed_values.items():
            embed_dim = values.shape[1]
            input_dims[layer] = values.shape[0]
            embed_table = EmbeddingTable(layer, embed_dim)
            embed_table.set(range(input_dims[layer]), values)
            params.embedding_params[layer] = embed_table

        opt = SGD(0.1)
        opt_wrapper = OptimizerWrapper(
            opt,
            True,
            lookup_embedding_func=params.get_embedding_param,
            update_embedding_func=params.set_embedding_param,
        )

        # call `opt_wrapper.apply_gradients` asynchronously
        def _apply_gradients(opt_wrapper, grads_and_vars):
            # sleep 1s so that all threads reach this point before applying
            # gradients
            time.sleep(1)
            opt_wrapper.apply_gradients(grads_and_vars)

        executor = ThreadPoolExecutor(max_workers=thread_num)
        tasks = [
            executor.submit(_apply_gradients, opt_wrapper, grads_and_vars)
            for grads_and_vars in grads_and_vars_batches
        ]
        _ = [task.result() for task in tasks]

        # check updated results of non-embedding variables
        non_embed_vars = [
            var for grad, var in grads_and_vars_batches[0][:-embed_var_n]
        ]
        for var, expected_value in zip(non_embed_vars,
                                       expected_non_embed_values):
            self.assertTrue(np.isclose(var.numpy(), expected_value).all())

        # `expected_embed_values=None` means there is no need to check the
        # embedding table
        if not expected_embed_values:
            return
        # check updated results of embedding table
        for layer, expected_values in expected_embed_values.items():
            value = params.get_embedding_param(layer, range(input_dims[layer]))

            self.assertTrue(
                any([
                    np.isclose(value, expected).all()
                    for expected in expected_values
                ]))
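A possible invocation of this helper from a test method in the same class is sketched below. The gradient values, the `test_emb` table, and the expected result are illustrative assumptions: the helper itself creates `SGD(0.1)`, so two threads each applying a gradient of 0.1 to a variable starting at [1.0, 2.0] would be expected to leave it at [0.98, 1.98].

        embed_values = {"test_emb": np.ones((4, 2), dtype=np.float32)}
        dense_var = tf.Variable([1.0, 2.0])
        grads_and_vars = [
            # one non-embedding pair followed by one embedding pair, matching
            # the `[:-embed_var_n]` split inside the helper
            (tf.constant([0.1, 0.1]), dense_var),
            (tf.IndexedSlices(tf.ones((2, 2)), tf.constant([0, 1])),
             "test_emb"),
        ]
        self._test_async_correctness(
            grads_and_vars_batches=[grads_and_vars, grads_and_vars],
            embed_values=embed_values,
            expected_non_embed_values=[np.array([0.98, 1.98])],
        )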
Example #7
 def _compare_slot_names(self, opt, expected):
     tmp = OptimizerWrapper(opt)
     self.assertTrue(sorted(tmp.allowed_slot_names) == sorted(expected))
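A hedged illustration of how this comparison helper might be exercised inside the same test class, using the slot names that appear elsewhere in these examples ("m"/"v" for Adam in Examples #1 and #3, "momentum" for SGD with momentum in Example #4); treating plain SGD as having no slots is an assumption.

     # Hypothetical checks (slot names inferred from the other examples):
     self._compare_slot_names(SGD(), [])
     self._compare_slot_names(SGD(momentum=0.1), ["momentum"])
     self._compare_slot_names(Adam(), ["m", "v"])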
Example #8
class PserverServicer(elasticdl_pb2_grpc.PserverServicer):
    """PS service implementation"""
    def __init__(
        self,
        parameters,
        grads_to_wait,
        optimizer,
        lr_staleness_modulation=False,
        sync_version_tolerance=0,
        use_async=False,
        evaluation_steps=0,
        master_channel=None,
        checkpoint_saver=None,
        ps_id=None,
        num_ps_pods=None,
    ):
        if master_channel is None:
            self._master_stub = None
        else:
            self._master_stub = elasticdl_pb2_grpc.MasterStub(master_channel)

        self._parameters = parameters
        self._grads_to_wait = grads_to_wait
        self._optimizer = optimizer
        self._lr_staleness_modulation = lr_staleness_modulation
        self._sync_version_tolerance = sync_version_tolerance
        self._use_async = use_async
        self._eval_steps = evaluation_steps
        self._checkpoint_saver = checkpoint_saver
        self._ps_id = ps_id
        self._num_ps_pods = num_ps_pods
        self._version_lock = threading.Lock()
        self._lock = threading.Lock()
        self._use_wrap_opt = False

        self._grads_n = 0
        self._grads_buffer = {}

    def pull_dense_parameters(self, request, _):
        """
        Responds with all non-embedding parameters if they have been
        initialized.
        """
        res = elasticdl_pb2.PullDenseParametersResponse()
        if not self._parameters.initialized:
            res.initialized = False
            return res

        # Only sync-SGD needs the lock
        # TODO: use a read-write lock to support multiple concurrent reads
        if not self._use_async:
            self._lock.acquire()
        res.version = self._parameters.version
        # No need to send variables if the requester has the latest version.
        if self._parameters.version > request.version:
            for name, var in self._parameters.non_embedding_params.items():
                serialize_ndarray(var.numpy(), res.dense_parameters[name])
        if not self._use_async:
            self._lock.release()
        res.initialized = True
        return res

    def pull_embedding_vectors(self, request, _):
        result = tensor_pb2.TensorProto()
        if not request.ids:
            return result
        embedding_vectors = self._parameters.get_embedding_param(
            request.name, request.ids)
        serialize_ndarray(embedding_vectors, result)
        return result

    def push_model(self, request, _):
        with self._lock:
            accepted = self._parameters.init_from_model_pb(request)
        if accepted and self._parameters.has_embedding_params():
            self.wrap_optimizer_and_set_slot()
        return empty_pb2.Empty()

    def push_embedding_table_infos(self, request, _):
        with self._lock:
            self._parameters.init_embedding_params(
                request.embedding_table_infos)
            self.wrap_optimizer_and_set_slot()
        return empty_pb2.Empty()

    def push_gradients(self, request, _):
        res = elasticdl_pb2.PushGradientsResponse()
        if self._use_async:
            grad_vars = []

            for name, pb in request.gradients.dense_parameters.items():
                grad = pb_to_ndarray(pb)
                self._parameters.check_grad(Tensor(name, grad, None))
                grad = tf.constant(grad)
                var = self._parameters.get_non_embedding_param(name)
                grad_vars.append((grad, var))

            for name, pb in request.gradients.embedding_tables.items():
                grad = pb_to_indexed_slices(pb)
                self._parameters.check_grad(
                    Tensor(name, grad.values, grad.indices))
                if name in self._parameters.non_embedding_params:
                    var = self._parameters.get_non_embedding_param(name)
                    grad_vars.append((grad, var))
                else:
                    grad_vars.append((grad, name))

            learning_rate = request.learning_rate
            # TODO: if request.learning_rate == 0.0, modulate learning_rate
            #       in self._optimizer with staleness
            if self._lr_staleness_modulation and learning_rate > 0.0:
                staleness = max(
                    1, self._parameters.version - request.gradients.version)
                # Modulate learning rate by staleness
                learning_rate /= staleness

            self._set_optimizer_learning_rate(learning_rate)
            self._optimizer.apply_gradients(grad_vars)
            with self._version_lock:
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
            self._report_version_if_needed(version)

            res.accepted = True
            res.version = self._parameters.version
            return res
        else:
            if (request.gradients.version <
                    self._parameters.version - self._sync_version_tolerance):
                res.accepted = False
                res.version = self._parameters.version
                return res

            with self._lock:
                for name, pb in request.gradients.dense_parameters.items():
                    grad = pb_to_ndarray(pb)
                    self._parameters.check_grad(Tensor(name, grad, None))
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = (self._grads_buffer[name] +
                                                    grad)
                    else:
                        self._grads_buffer[name] = grad

                for name, pb in request.gradients.embedding_tables.items():
                    grad = pb_to_indexed_slices(pb)
                    self._parameters.check_grad(
                        Tensor(name, grad.values, grad.indices))
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = merge_indexed_slices(
                            self._grads_buffer[name], grad)
                    else:
                        self._grads_buffer[name] = grad

                self._grads_n += 1
                res.accepted = True

                updated_version = False
                version = self._parameters.version
                if self._grads_n == self._grads_to_wait:
                    grad_vars = []
                    for name, grad in self._grads_buffer.items():
                        # Dense gradients are averaged,
                        # while sparse gradients are summed
                        if not isinstance(grad, tf.IndexedSlices):
                            grad = grad / self._grads_to_wait
                            grad = tf.constant(grad)
                        var = self._parameters.get_non_embedding_param(name)
                        if var is None:
                            grad_vars.append((grad, name))
                        else:
                            grad_vars.append((grad, var))

                    self._set_optimizer_learning_rate(request.learning_rate)
                    self._optimizer.apply_gradients(grad_vars)
                    self._grads_n = 0
                    self._grads_buffer.clear()
                    self._parameters.version += 1
                    self._save_params_to_checkpoint_if_needed()
                    version = self._parameters.version
                    updated_version = True

            if updated_version:
                self._report_version_if_needed(version)
            res.version = version
            return res

    def wrap_optimizer(self):
        self._optimizer = OptimizerWrapper(
            self._optimizer,
            self._use_async,
            self._parameters.get_embedding_param,
            self._parameters.set_embedding_param,
        )

    def _report_version_if_needed(self, version):
        if self._eval_steps and version % self._eval_steps == 0:
            self._report_version(version)

    def _report_version(self, version):
        req = elasticdl_pb2.ReportVersionRequest()
        req.model_version = version
        self._master_stub.report_version(req)

    def wrap_optimizer_and_set_slot(self):
        if not self._use_wrap_opt:
            self.wrap_optimizer()
            self._parameters.create_slot_params(
                self._optimizer.allowed_slot_names,
                self._optimizer.slot_initial_value,
            )
            self._use_wrap_opt = True

    def _save_params_to_checkpoint_if_needed(self):
        """Save a checkpoint of parameters to a protobuf file"""
        if (self._checkpoint_saver and
                self._parameters.version % self._checkpoint_saver._steps == 0):
            model_pb = self._parameters.to_model_pb()

            logger.info("Save checkpoint for version %s" % model_pb.version)
            self._checkpoint_saver.save(
                model_pb.version,
                model_pb,
                is_eval_checkpoint=False,
                shard_index=self._ps_id,
                shard_num=self._num_ps_pods,
            )

    def _set_optimizer_learning_rate(self, learning_rate):
        if learning_rate == 0.0:
            return

        if self._use_wrap_opt:
            self._optimizer.set_learning_rate(learning_rate)
        else:
            K.set_value(self._optimizer.lr, K.get_value(learning_rate))
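Finally, a minimal construction sketch based only on the `__init__` signature above; `params`, `opt`, and the argument values are placeholders rather than project code. The optimizer is wrapped lazily by `wrap_optimizer_and_set_slot` once embedding tables are pushed.

servicer = PserverServicer(
    parameters=params,   # a Parameters instance, as in the tests above
    grads_to_wait=2,     # sync mode: wait for two workers' gradients
    optimizer=opt,       # e.g. a Keras SGD or Adam optimizer
    use_async=False,
)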