Beispiel #1
0
def get_params_shard_from_pb(model_pb, shard_index, shard_num):
    """Get parameters including variables values and embedding table
    from a model protobuf.

    Args:
        model_pb: A Model protobuf instance.
        shard_index: Model shard index.
        shard_num: The total number of model shards.

    Return:
        non_embedding_vars: A Python dict in which the key is a variable
            name and the value is a `tf.Variable` object.
        embedding_table_values: A Python dict in which the key is an embedding
            table name and the value is a tuple with 2 elements. The value[0]
            is indices and value[1] is the corresponding embedding vector.
    """
    non_embedding_vars = {}
    embedding_table_values = {}

    for pb in model_pb.param:
        t = Tensor.from_tensor_pb(pb)
        if t.indices is None:
            # Dense variable: keep it only if its name hashes to this shard.
            if string_to_id(t.name, shard_num) == shard_index:
                non_embedding_vars[t.name] = tf.Variable(
                    initial_value=t.values, trainable=True)
            continue
        # Sparse embedding rows: keep each id assigned to this shard.
        ids, vectors = embedding_table_values.setdefault(t.name, ([], []))
        for embedding_id, vector in zip(t.indices, t.values):
            if int_to_id(embedding_id, shard_num) == shard_index:
                ids.append(embedding_id)
                vectors.append(vector)
    return non_embedding_vars, embedding_table_values
Beispiel #2
0
    def get_model(self):
        """Pull the latest non-embedding variables from every PS pod.

        Issues one async pull_variable RPC per PS shard this worker has
        variables on, pushes local variables to any PS that reports it is
        uninitialized (then pulls again), copies the returned tensors into
        the local variables, and records the per-PS model versions.
        """
        self._timing.start_record_time("get_model")
        if self._use_multi_ps:
            self.init_ps_var_partition()

        def make_request(ps_id):
            # Tell the PS which model version this worker already holds.
            request = elasticdl_pb2.PullVariableRequest()
            request.current_model_version = self._model_versions_from_ps[ps_id]
            return request

        # async grpc call
        pending = []
        for ps_id, stub in enumerate(self._ps_stubs):
            if ps_id not in self._ps_vars:
                continue
            pending.append(
                (stub.pull_variable.future(make_request(ps_id)), ps_id))

        for future, ps_id in pending:
            response = future.result()
            if not response.model_init_status:
                # push variable to ps for initialization, then pull again
                self.report_variable_to_ps(ps_id)
                response = self._ps_stubs[ps_id].pull_variable(
                    make_request(ps_id))
                if not response.model_init_status:
                    # TODO: support PS fault-tolerance
                    raise RuntimeError("PS pod %d cannot be initialized" %
                                       ps_id)

            for tensor_pb in response.model.param:
                tensor = Tensor.from_tensor_pb(tensor_pb)
                self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())
            self._model_versions_from_ps[ps_id] = response.model.version

        self._model_version = max(self._model_versions_from_ps)
        self._timing.end_record_time("get_model")
Beispiel #3
0
    def get_model(self):
        """Pull variables from all PS pods and refresh the local model.

        Sends one async pull_variable RPC per PS shard this worker has
        variables on, pushes local variables to any uninitialized PS before
        re-pulling, assigns the returned tensors to the local variables, and
        sets the local model version to the largest version returned.
        """
        latest_version = -1
        req = empty_pb2.Empty()
        if self._use_multi_ps:
            self.init_ps_var_partition()

        # async grpc call
        pending = [
            (stub.pull_variable.future(req), ps_id)
            for ps_id, stub in enumerate(self._ps_stubs)
            if ps_id in self._ps_vars
        ]

        for future, ps_id in pending:
            response = future.result()
            if not response.model_init_status:
                # push variable to ps for initialization, then pull again
                self.report_variable_to_ps(ps_id)
                response = self._ps_stubs[ps_id].pull_variable(req)
                if not response.model_init_status:
                    # TODO: support PS fault-tolerance
                    raise RuntimeError("PS pod %d cannot be initialized" %
                                       ps_id)

            for tensor_pb in response.model.param:
                tensor = Tensor.from_tensor_pb(tensor_pb)
                self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())

            latest_version = max(latest_version, response.model.version)
        self._model_version = latest_version
Beispiel #4
0
 def _get_non_embedding_variables(self, version, method):
     """Fetch the non-embedding variables from the master.

     Args:
         version: Model version to request.
         method: Method field of the GetModelRequest.

     Returns:
         A dict mapping each variable name to its ndarray value.
     """
     request = elasticdl_pb2.GetModelRequest()
     request.version = version
     request.method = method
     model = self._stub.GetModel(request, None)
     return {
         tensor.name: tensor.to_ndarray()
         for tensor in map(Tensor.from_tensor_pb, model.param)
     }
Beispiel #5
0
    def get_model_from_master(self, version, method):
        """Fetch the model from the master and update the local copy.

        Assigns every tensor in the response to the matching local
        non-embedding variable, then records the returned model version.
        """
        request = elasticdl_pb2.GetModelRequest()
        request.version = version
        request.method = method
        model = self._stub.GetModel(request)

        # Assumes all trainable variables exist in model.param.
        for pb in model.param:
            t = Tensor.from_tensor_pb(pb)
            self._non_embed_vars[t.name].assign(t.to_ndarray())
        self._model_version = model.version
Beispiel #6
0
    def test_tensor_data_structure(self):
        """Accessor and protobuf round-trip tests for Tensor."""
        # Test tensor values, without indices.
        # Use deterministic data: np.ndarray(shape=...) returns an
        # uninitialized array, which made the round-trip data random.
        arr = np.arange(24, dtype=np.int32).reshape(3, 1, 2, 4)
        tensor = Tensor(arr)
        self.assertTrue(np.array_equal(arr, tensor.values))
        self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor()))
        self.assertFalse(tensor.is_indexed_slices())

        # Test tensor values, with indices
        indices = np.array([2, 0, 1])
        tensor = Tensor(arr, indices)
        self.assertTrue(np.array_equal(arr, tensor.values))
        self.assertTrue(np.array_equal(indices, tensor.indices))
        self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor().values))
        self.assertTrue(np.array_equal(indices, tensor.to_tf_tensor().indices))
        self.assertTrue(tensor.is_indexed_slices())

        # Test round trip
        # tensor to tensor PB
        tensor = Tensor(arr, indices, name="test")
        pb = tensor.to_tensor_pb()
        self.assertEqual(pb.name, "test")
        self.assertEqual(pb.dim, [3, 1, 2, 4])
        self.assertEqual(pb.dtype, tensor_dtype_pb2.DT_INT32)
        np.testing.assert_array_equal(pb.indices, indices)

        # tensor PB to tensor
        tensor_new = Tensor.from_tensor_pb(pb)
        # Bug fix: assert on the round-tripped tensor_new, not the original
        # tensor, so the name restoration is actually verified.
        self.assertEqual(tensor_new.name, "test")
        np.testing.assert_array_equal(tensor_new.values, arr)
        np.testing.assert_array_equal(tensor_new.indices, indices)

        # Test Tensor().to_ndarray()
        values = np.array([[1.0, 2.0], [3.0, 4.0]])
        indices = np.array([0, 2])
        name = "test"
        tensor = Tensor(values, indices, name)
        # to_ndarray is not supported for indexed-slices tensors.
        self.assertRaises(NotImplementedError, tensor.to_ndarray)
        tensor = Tensor(values, name=name)
        self.assertTrue(np.allclose(values, tensor.to_ndarray()))
Beispiel #7
0
    def push_gradient(self, request, _):
        """Apply gradients pushed by a worker.

        In async mode every push is applied immediately and bumps the model
        version. In sync mode gradients are accumulated until
        `_grads_to_wait` pushes have arrived, then applied in one step;
        pushes older than `_sync_version_tolerance` versions are rejected.

        Args:
            request: A PushGradientRequest carrying serialized gradients.
            _: Unused gRPC servicer context.

        Returns:
            A PushGradientResponse with `accepted` and `model_version` set.
        """
        res = elasticdl_pb2.PushGradientResponse()
        if self._use_async:
            grad_vars = []
            for pb in request.gradients:
                grad = Tensor.from_tensor_pb(pb)
                self._parameters.check_grad(grad)
                name = grad.name
                var = self._parameters.get_non_embedding_param(name)
                grad = grad.to_tf_tensor()
                # A missing variable means an embedding table; the optimizer
                # wrapper is given the table name instead of a tf.Variable.
                if var is None:
                    grad_vars.append((grad, name))
                else:
                    grad_vars.append((grad, var))

            if self._lr_scheduler:
                self._lr_scheduler.set_model_version(self._parameters.version)
            self._optimizer.apply_gradients(grad_vars)
            with self._version_lock:
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
            self._report_version_if_needed(version)

            res.accepted = True
            # Bug fix: report the version captured under the lock instead of
            # re-reading self._parameters.version, which could already have
            # been bumped by a concurrent push.
            res.model_version = version
            return res
        else:
            # Reject pushes computed against a model that is too stale.
            if (request.model_version <
                    self._parameters.version - self._sync_version_tolerance):
                res.accepted = False
                res.model_version = self._parameters.version
                return res

            with self._lock:
                for pb in request.gradients:
                    grad = Tensor.from_tensor_pb(pb)
                    self._parameters.check_grad(grad)
                    if grad.name in self._grads_buffer:
                        self._grads_buffer[grad.name] = (
                            self._grads_buffer[grad.name] + grad)
                    else:
                        self._grads_buffer[grad.name] = grad

                self._grads_n += 1
                res.accepted = True

                updated_version = False
                version = self._parameters.version
                if self._grads_n == self._grads_to_wait:
                    grad_vars = []
                    for name, grad in self._grads_buffer.items():
                        # Dense gradients are averaged,
                        # while sparse gradients are summed
                        if not grad.is_indexed_slices():
                            grad.values = grad.values / self._grads_to_wait
                        var = self._parameters.get_non_embedding_param(name)
                        grad = grad.to_tf_tensor()
                        if var is None:
                            grad_vars.append((grad, name))
                        else:
                            grad_vars.append((grad, var))

                    if self._lr_scheduler:
                        self._lr_scheduler.set_model_version(
                            self._parameters.version)
                    self._optimizer.apply_gradients(grad_vars)
                    self._grads_n = 0
                    self._grads_buffer.clear()
                    self._parameters.version += 1
                    self._save_params_to_checkpoint_if_needed()
                    version = self._parameters.version
                    updated_version = True

            # Report outside the lock to keep the critical section short.
            if updated_version:
                self._report_version_if_needed(version)
            res.model_version = version
            return res
Beispiel #8
0
    def ReportGradient(self, request, _):
        """Validate and apply a worker's reported gradients.

        Drops the report when its model version is stale (sync mode only),
        sanity-checks every gradient against the model's shapes, then hands
        the sorted gradients to `_process_gradients`.

        Args:
            request: A ReportGradientRequest with `gradient` tensor protos
                and `model_version`.
            _: Unused gRPC servicer context.

        Returns:
            A ReportGradientResponse with `accepted` and `model_version` set.
        """
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version)

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        non_embedding_gradients = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for v in request.gradient:
            tensor = Tensor.from_tensor_pb(v)
            name = tensor.name
            if name not in self._model:
                if tensor.is_indexed_slices():
                    # grads of ElasticDL Embedding layer
                    # TODO: check arr.shape[1] = embedding_dim of this
                    # EdlEmbedding layer
                    edl_embedding_gradients[name] = tensor.to_tf_tensor()
                    continue
                else:
                    # Bug fix: use %-formatting; passing `name` as a second
                    # ValueError argument left the message unformatted.
                    raise ValueError(
                        "Gradient key: %s is not part of model" % name)

            if tensor.is_indexed_slices():
                if (tensor.values.shape[1] !=
                        self._model[name].numpy().shape[1]):
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d" % (
                            name,
                            tensor.values.shape[1],
                            self._model[name].numpy().shape[1],
                        ))

                max_index = tf.math.reduce_max(tensor.indices).numpy()
                if max_index >= self._model[name].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d" % (
                            name,
                            max_index,
                            self._model[name].numpy().shape[0] - 1,
                        ))
                indexed_grads[name] = tensor.to_tf_tensor()
            else:
                if tensor.values.shape != self._model[name].numpy().shape:
                    # Bug fix: %-format the message (was an args tuple).
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % name)
                non_embedding_gradients[name] = tensor.to_tf_tensor()

        # Bug fix: release the lock even if _process_gradients raises,
        # otherwise a failure would deadlock every later sync-mode report.
        if not self._use_async:
            self._lock.acquire()
        try:
            self._process_gradients(
                edl_embedding_gradients,
                indexed_grads,
                non_embedding_gradients,
                request.model_version,
            )
        finally:
            if not self._use_async:
                self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res