def test_check_grad(self):
    """check_grad must reject gradients that do not match the params."""
    self._clear_params()
    self.params.init_from_model_pb(self.model_pb)

    # Gradient for a name that is not a parameter at all.
    unknown_grad = Tensor(name="z")
    with self.assertRaisesRegex(ValueError, "Name error"):
        self.params.check_grad(unknown_grad)

    # Dense gradient whose shape disagrees with dense param "x".
    bad_dense_grad = Tensor(name="x", values=np.random.uniform(size=(3, 5)))
    with self.assertRaisesRegex(ValueError, "Non embedding param error"):
        self.params.check_grad(bad_dense_grad)

    # Sparse gradient whose vector width disagrees with the embedding dim.
    bad_embedding_grad = Tensor(
        name="embedding_1",
        values=np.random.uniform(size=(3, 11)),
        indices=np.array([1, 2, 3]),
    )
    with self.assertRaisesRegex(
        ValueError, "ElasticDL embedding param error"
    ):
        self.params.check_grad(bad_embedding_grad)

    # Sparse gradient against the dense (Keras embedding) param "x"
    # with an incompatible shape.
    bad_keras_grad = Tensor(
        name="x",
        values=np.random.uniform(size=(4, 4)),
        indices=np.array([1, 2, 3, 4]),
    )
    with self.assertRaisesRegex(ValueError, "Keras embedding param error"):
        self.params.check_grad(bad_keras_grad)
def setUp(self):
    """Build a Model protobuf with two dense params and one embedding table."""
    self.params = Parameters()
    self.model_pb = Model()
    self.tensors_pb = self.model_pb.param
    self.embeddings_pb = self.model_pb.embedding_table_info

    # Two dense parameters named "x" and "y".
    dense_x = np.random.uniform(size=(3, 4))
    pb_x = Tensor(dense_x, name="x").to_tensor_pb()
    dense_y = np.random.uniform(size=(4, 5))
    pb_y = Tensor(dense_y, name="y").to_tensor_pb()
    self.tensors_pb.extend([pb_x, pb_y])

    # One embedding table, plus a sparse tensor carrying two of its rows.
    self.embedding_table_name = "embedding_1"
    self.embedding_dim = 10
    embedding_pb = EmbeddingTableInfo()
    embedding_pb.name = self.embedding_table_name
    embedding_pb.dim = self.embedding_dim
    embedding_pb.initializer = "uniform"

    row_values = np.random.uniform(size=(2, 10))
    row_ids = np.array([0, 8])
    sparse_tensor = Tensor(
        row_values,
        indices=row_ids,
        name=self.embedding_table_name,
    )
    self.tensors_pb.append(sparse_tensor.to_tensor_pb())
    self.embeddings_pb.append(embedding_pb)
def verify(values, name=None, indices=None):
    """Round-trip *values* (and optional *indices*/*name*) through the
    Tensor protobuf and assert everything survives.

    Serializes a Tensor built from the arguments, deserializes into a
    fresh Tensor, and compares values, indices and name.
    """
    tensor = Tensor(values, indices, name)
    pb = elasticdl_pb2.Tensor()
    serialize_tensor(tensor, pb)
    tensor_new = Tensor()
    deserialize_tensor_pb(pb, tensor_new)
    np.testing.assert_array_equal(values, tensor_new.values)
    if indices is not None:
        np.testing.assert_array_equal(indices, tensor_new.indices)
    if name:
        # Bug fix: assert on the DESERIALIZED tensor's name. The original
        # checked `tensor.name`, which was set directly from the argument
        # and therefore could never fail.
        self.assertEqual(name, tensor_new.name)
def testEvaluationJob(self):
    """Exercise EvaluationJob task counting, job gating, and metrics."""
    model_version = 1
    total_tasks = 5
    latest_chkp_version = 2
    job = EvaluationJob(_eval_metrics_fn(), model_version, total_tasks)
    self.assertEqual(0, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # Complete four of the five tasks: still not finished.
    for _ in range(4):
        job.complete_task()
    self.assertEqual(4, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # The fifth task finishes the job.
    job.complete_task()
    self.assertEqual(5, job._completed_tasks)
    self.assertTrue(job.finished())
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # A new job requires a newer checkpoint than the job's model version.
    latest_chkp_version = job.model_version
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))
    latest_chkp_version = job.model_version + 1
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # Feed two batches of predictions/labels and check the summary.
    first_outputs = [
        Tensor(
            np.array([[1], [6], [3]], np.float32),
            name=MetricsDictKey.MODEL_OUTPUT,
        ).to_tensor_pb()
    ]
    first_labels = Tensor(np.array([[1], [0], [3]], np.float32)).to_tensor_pb()
    job.report_evaluation_metrics(first_outputs, first_labels)

    second_outputs = [
        Tensor(
            np.array([[4], [5], [6], [7], [8]], np.float32),
            name=MetricsDictKey.MODEL_OUTPUT,
        ).to_tensor_pb()
    ]
    second_labels = Tensor(
        np.array([[7], [8], [9], [10], [11]], np.float32)
    ).to_tensor_pb()
    job.report_evaluation_metrics(second_outputs, second_labels)

    expected_acc = 0.25
    evaluation_metrics = job.get_evaluation_summary()
    self.assertAlmostEqual(
        expected_acc, evaluation_metrics.get("acc").numpy()
    )
    self.assertAlmostEqual(
        expected_acc, evaluation_metrics.get("acc_fn").numpy()
    )
    self.assertAlmostEqual(10.125, evaluation_metrics.get("mse").numpy())
def get_model(self):
    """Pull the current model variables from every PS pod.

    Issues one asynchronous pull per PS shard we own variables on; if a
    PS reports it is uninitialized, pushes our variables to it first and
    retries. Copies the returned dense params into local variables and
    records the newest model version observed across pods.
    """
    model_version = -1
    pending_pulls = []
    req = empty_pb2.Empty()
    if self._use_multi_ps:
        self.init_ps_var_partition()
    for ps_id, stub in enumerate(self._ps_stubs):
        if ps_id not in self._ps_vars:
            continue
        # Asynchronous gRPC call; results collected in the loop below.
        pending_pulls.append((stub.pull_variable.future(req), ps_id))
    for pull_future, ps_id in pending_pulls:
        res = pull_future.result()
        if not res.model_init_status:
            # The PS has no model yet: push our variables, then re-pull.
            self.report_variable_to_ps(ps_id)
            res = self._ps_stubs[ps_id].pull_variable(req)
            if not res.model_init_status:
                # TODO: support PS fault-tolerance
                raise RuntimeError(
                    "PS pod %d cannot be initialized" % ps_id
                )
        for tensor_pb in res.model.param:
            tensor = Tensor.from_tensor_pb(tensor_pb)
            self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())
        model_version = max(model_version, res.model.version)
    self._model_version = model_version
def get_model(self):
    """Pull model variables from every PS pod, tracking per-pod versions.

    Like the simple pull, but sends each pod the model version we last
    saw from it, initializes any pod that reports an empty model, and
    updates `_model_versions_from_ps` per pod. The worker's model version
    becomes the maximum across pods. Timed under the "get_model" key.
    """
    self._timing.start_record_time("get_model")
    pending_pulls = []
    if self._use_multi_ps:
        self.init_ps_var_partition()
    for ps_id, stub in enumerate(self._ps_stubs):
        if ps_id not in self._ps_vars:
            continue
        # Asynchronous gRPC call carrying the version we already have.
        req = elasticdl_pb2.PullVariableRequest()
        req.current_model_version = self._model_versions_from_ps[ps_id]
        pending_pulls.append((stub.pull_variable.future(req), ps_id))
    for pull_future, ps_id in pending_pulls:
        res = pull_future.result()
        if not res.model_init_status:
            # The PS has no model yet: push our variables, then re-pull.
            self.report_variable_to_ps(ps_id)
            req = elasticdl_pb2.PullVariableRequest()
            req.current_model_version = self._model_versions_from_ps[ps_id]
            res = self._ps_stubs[ps_id].pull_variable(req)
            if not res.model_init_status:
                # TODO: support PS fault-tolerance
                raise RuntimeError(
                    "PS pod %d cannot be initialized" % ps_id
                )
        for tensor_pb in res.model.param:
            tensor = Tensor.from_tensor_pb(tensor_pb)
            self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())
        self._model_versions_from_ps[ps_id] = res.model.version
    self._model_version = max(self._model_versions_from_ps)
    self._timing.end_record_time("get_model")
def get_params_shard_from_pb(model_pb, shard_index, shard_num):
    """Extract this shard's variables and embedding rows from a Model pb.

    Args:
        model_pb: A Model protobuf instance.
        shard_index: Model shard index.
        shard_num: The total number of model shards.

    Return:
        non_embedding_vars: dict mapping a variable name to a
            `tf.Variable` holding its values.
        embedding_table_values: dict mapping an embedding table name to
            a tuple `(indices, vectors)` of parallel lists.
    """
    non_embedding_vars = {}
    embedding_table_values = {}
    for tensor_pb in model_pb.param:
        tensor = Tensor.from_tensor_pb(tensor_pb)
        if tensor.indices is None:
            # Dense variable: keep it only if its name hashes to this shard.
            if string_to_id(tensor.name, shard_num) == shard_index:
                non_embedding_vars[tensor.name] = tf.Variable(
                    initial_value=tensor.values, trainable=True
                )
            continue
        # Sparse embedding rows: keep the rows whose id hashes here.
        ids, vectors = embedding_table_values.setdefault(
            tensor.name, ([], [])
        )
        for embedding_id, vector in zip(tensor.indices, tensor.values):
            if int_to_id(embedding_id, shard_num) == shard_index:
                ids.append(embedding_id)
                vectors.append(vector)
    return non_embedding_vars, embedding_table_values
def pull_embedding_vector(self, request, _):
    """Serve a pull of embedding vectors for the requested ids.

    Returns an empty Tensor protobuf when no ids were requested.
    """
    response = elasticdl_pb2.Tensor()
    if not request.ids:
        return response
    vectors = self._parameters.get_embedding_param(
        request.name, request.ids
    )
    serialize_tensor(Tensor(values=vectors), response)
    return response
def to_tensor(self):
    """Convert the embedding table to an elasticDL Tensor."""
    # Walk the table once, collecting row ids and vectors in lockstep.
    # (Renamed the loop variable so it no longer shadows builtin `id`.)
    row_ids = []
    vectors = []
    for row_id, vector in self.embedding_vectors.items():
        row_ids.append(row_id)
        vectors.append(vector)
    return Tensor(
        values=np.array(vectors),
        indices=np.array(row_ids),
        name=self.name,
    )
def _get_non_embedding_variables(self, version, method):
    """Fetch the dense model variables from the master.

    Returns a dict mapping each parameter name to an ndarray of its
    values, as reported for the given version/method.
    """
    req = elasticdl_pb2.GetModelRequest()
    req.version = version
    req.method = method
    model = self._stub.GetModel(req, None)
    tensors = (Tensor.from_tensor_pb(pb) for pb in model.param)
    return {tensor.name: tensor.to_ndarray() for tensor in tensors}
def test_deserialize_tensor_pb(self):
    """deserialize_tensor_pb: error cases, empty arrays, and valid shapes."""
    pb = elasticdl_pb2.Tensor()
    tensor = Tensor()

    # No dim defined, should raise.
    self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

    # Empty array, should be ok.
    pb.dim.append(0)
    pb.content = b""
    pb.dtype = tensor_dtype_pb2.DT_FLOAT32
    deserialize_tensor_pb(pb, tensor)
    np.testing.assert_array_equal(
        np.array([], dtype=np.float32), tensor.values
    )

    # Wrong type, should raise.
    del pb.dim[:]
    pb.dim.append(0)
    pb.content = b""
    pb.dtype = tensor_dtype_pb2.DT_INVALID
    self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

    # Pathological case, one of the dimensions is 0.
    del pb.dim[:]
    pb.dim.extend([2, 0, 1, 9])
    pb.content = b""
    pb.dtype = tensor_dtype_pb2.DT_FLOAT32
    deserialize_tensor_pb(pb, tensor)
    np.testing.assert_array_equal(
        np.ndarray(shape=[2, 0, 1, 9], dtype=np.float32), tensor.values
    )

    # Wrong content size, should raise.
    del pb.dim[:]
    pb.dim.append(11)
    pb.content = b"\0" * (4 * 12)
    pb.dtype = tensor_dtype_pb2.DT_FLOAT32
    self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

    # Compatible dimensions, should be ok (with and without indices).
    for rows in (1, 2, 3, 4, 6, 12):
        for with_indices in (True, False):
            del pb.dim[:]
            pb.content = b"\0" * (4 * 12)
            pb.dim.extend([rows, 12 // rows])
            if with_indices:
                pb.indices.extend([0] * rows)
            pb.dtype = tensor_dtype_pb2.DT_FLOAT32
            deserialize_tensor_pb(pb, tensor)
            self.assertEqual((rows, 12 // rows), tensor.values.shape)
            self.assertTrue(isinstance(tensor.values, np.ndarray))
            if tensor.indices is not None:
                self.assertTrue(isinstance(tensor.indices, np.ndarray))
def get_model_from_master(self, version, method):
    """Get model from master, and update model_version."""
    req = elasticdl_pb2.GetModelRequest()
    req.version = version
    req.method = method
    model = self._stub.GetModel(req)
    # Assumes all trainable variables exist in model.param.
    for tensor_pb in model.param:
        tensor = Tensor.from_tensor_pb(tensor_pb)
        self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())
    self._model_version = model.version
def report_evaluation_metrics(self, model_outputs, labels):
    """Report evaluation metrics to the PS.

    Concatenates each output's batch list (and the labels) before
    serializing them into the request.
    """
    req = elasticdl_pb2.ReportEvaluationMetricsRequest()
    for name, batches in model_outputs.items():
        merged = np.concatenate(batches)
        emplace_tensor_pb_from_ndarray(req.model_outputs, merged, name=name)
    merged_labels = np.concatenate(labels)
    serialize_tensor(Tensor(values=merged_labels), req.labels)
    self._stub.report_evaluation_metrics(req)
def test_emplace_tensor_pb_from_ndarray(self):
    """The emplaced pb must equal Tensor(...).to_tensor_pb() field by field."""
    values = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], np.float32)
    indices = np.array([0, 2])
    name = "test"
    model = elasticdl_pb2.Model()
    emplace_tensor_pb_from_ndarray(model.param, values, indices, name)
    pb = model.param[-1]
    # Removed leftover debug output: `print("pb", pb)` polluted test logs.
    expected_pb = Tensor(values, indices, name).to_tensor_pb()
    self.assertEqual(pb.name, expected_pb.name)
    self.assertEqual(pb.dim, expected_pb.dim)
    self.assertEqual(pb.content, expected_pb.content)
    self.assertEqual(pb.indices, expected_pb.indices)
    self.assertEqual(pb.dtype, expected_pb.dtype)
def _restore_params_from_pb(self, tensors_pb):
    """Restore dense and embedding parameters from Tensor protobufs."""
    for pb in tensors_pb:
        param_name = pb.name
        if pb.indices:
            # Only pb of embedding parameters has indices.
            restored = Tensor()
            deserialize_tensor_pb(pb, restored)
            self.embedding_params[param_name].set(
                restored.indices, restored.values
            )
        else:
            # Please note that `tf.Variable` will do something with magic.
            # If you pass a name "somename" to a `tf.Variable`, the final
            # variable name will be "somename:0". So the `tf.Variable.name`
            # is meaningless, we must avoid use it in PS side.
            dense_values = tensor_pb_to_ndarray(pb)
            self.non_embedding_params[param_name] = tf.Variable(
                initial_value=dense_values, trainable=True
            )
def report_evaluation_metrics(self, model_outputs, labels):
    """Report evaluation metrics to the PS.

    Returns:
        (accepted, model_version) from the RPC response.
    """
    req = elasticdl_pb2.ReportEvaluationMetricsRequest()
    for name, batches in model_outputs.items():
        merged = np.concatenate(batches)
        emplace_tensor_pb_from_ndarray(req.model_outputs, merged, name=name)
    merged_labels = np.concatenate(labels)
    serialize_tensor(Tensor(values=merged_labels), req.labels)
    # -1 signals "no version" when a single PS is in use.
    req.model_version = self._model_version if self._use_multi_ps else -1
    res = self._stub.ReportEvaluationMetrics(req)
    return res.accepted, res.model_version
def ReportGradient(self, request, _):
    """Validate and accumulate a worker's reported gradients.

    Rejects stale model versions in sync mode, sanity-checks every
    gradient tensor against the model (shape, embedding width, index
    range), then applies dense, indexed-slice, and ElasticDL-embedding
    gradients under the lock when running synchronously.

    Returns:
        ReportGradientResponse with `accepted` and the current `model_version`.
    """
    model_version_valid = self._use_async or self._validate_model_version(
        request.model_version)
    res = elasticdl_pb2.ReportGradientResponse()
    if not model_version_valid:
        logger.warning(
            "Task result for outdated version %d dropped",
            request.model_version,
        )
        res.accepted = False
        res.model_version = self._version
        return res
    non_embedding_gradients = {}
    indexed_grads = {}
    edl_embedding_gradients = {}
    # Do sanity check before accumulating gradients.
    for v in request.gradient:
        tensor = Tensor.from_tensor_pb(v)
        name = tensor.name
        if name not in self._model:
            if tensor.is_indexed_slices():
                # grads of ElasticDL Embedding layer
                # TODO: check arr.shape[1] = embedding_dim of this
                # EdlEmbedding layer
                edl_embedding_gradients[name] = tensor.to_tf_tensor()
                continue
            else:
                # Bug fix: the original passed `name` as a second
                # ValueError argument instead of formatting the message.
                raise ValueError(
                    "Gradient key: %s is not part of model" % name)
        if tensor.is_indexed_slices():
            if (tensor.values.shape[1]
                    != self._model[name].numpy().shape[1]):
                raise ValueError(
                    "Gradient key: %s has incompatible "
                    "indexed slice dimension %d, expected %d" % (
                        name,
                        tensor.values.shape[1],
                        self._model[name].numpy().shape[1],
                    ))
            max_index = tf.math.reduce_max(tensor.indices).numpy()
            if max_index >= self._model[name].numpy().shape[0]:
                raise ValueError(
                    "Gradient key: %s has wrong indices %d, "
                    "out of range %d" % (
                        name,
                        max_index,
                        self._model[name].numpy().shape[0] - 1,
                    ))
            indexed_grads[name] = tensor.to_tf_tensor()
        else:
            if tensor.values.shape != self._model[name].numpy().shape:
                # Bug fix: same %-formatting issue as above.
                raise ValueError(
                    "Gradient key: %s has incompatible dimension" % name)
            non_embedding_gradients[name] = tensor.to_tf_tensor()
    if not self._use_async:
        self._lock.acquire()
    self._process_gradients(
        edl_embedding_gradients,
        indexed_grads,
        non_embedding_gradients,
        request.model_version,
    )
    if not self._use_async:
        self._lock.release()
    res.accepted = True
    res.model_version = self._version
    return res
def push_gradient(self, request, _):
    # Apply or buffer a worker's pushed gradients.
    #
    # Async mode: apply the gradients immediately, bump the parameter
    # version under the version lock, and checkpoint if needed.
    #
    # Sync mode: reject pushes that are too stale; otherwise accumulate
    # gradients in a buffer until `_grads_to_wait` workers have reported,
    # then average dense grads, apply everything, and bump the version.
    res = elasticdl_pb2.PushGradientResponse()
    if self._use_async:
        grad_vars = []
        for pb in request.gradients:
            grad = Tensor.from_tensor_pb(pb)
            self._parameters.check_grad(grad)
            name = grad.name
            var = self._parameters.get_non_embedding_param(name)
            grad = grad.to_tf_tensor()
            if var is None:
                # No dense variable of this name: pass the name so the
                # optimizer wrapper can resolve the embedding parameter.
                grad_vars.append((grad, name))
            else:
                grad_vars.append((grad, var))
        if self._lr_scheduler:
            self._lr_scheduler.set_model_version(self._parameters.version)
        self._optimizer.apply_gradients(grad_vars)
        with self._version_lock:
            self._parameters.version += 1
            self._save_params_to_checkpoint_if_needed()
            version = self._parameters.version
        self._report_version_if_needed(version)
        res.accepted = True
        res.model_version = self._parameters.version
        return res
    else:
        # Sync mode: drop pushes computed against a model that is older
        # than the tolerated staleness window.
        if (request.model_version < self._parameters.version
                - self._sync_version_tolerance):
            res.accepted = False
            res.model_version = self._parameters.version
            return res
        with self._lock:
            for pb in request.gradients:
                grad = Tensor.from_tensor_pb(pb)
                self._parameters.check_grad(grad)
                if grad.name in self._grads_buffer:
                    self._grads_buffer[grad.name] = (
                        self._grads_buffer[grad.name] + grad)
                else:
                    self._grads_buffer[grad.name] = grad
            # One increment per worker push, not per tensor.
            self._grads_n += 1
            res.accepted = True
            updated_version = False
            version = self._parameters.version
            if self._grads_n == self._grads_to_wait:
                grad_vars = []
                for name, grad in self._grads_buffer.items():
                    # Dense gradients are averaged,
                    # while sparse gradients are summed
                    if not grad.is_indexed_slices():
                        grad.values = grad.values / self._grads_to_wait
                    var = self._parameters.get_non_embedding_param(name)
                    grad = grad.to_tf_tensor()
                    if var is None:
                        grad_vars.append((grad, name))
                    else:
                        grad_vars.append((grad, var))
                if self._lr_scheduler:
                    self._lr_scheduler.set_model_version(
                        self._parameters.version)
                self._optimizer.apply_gradients(grad_vars)
                self._grads_n = 0
                self._grads_buffer.clear()
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
                updated_version = True
        # Report outside the lock to avoid holding it during the RPC.
        if updated_version:
            self._report_version_if_needed(version)
        res.model_version = version
        return res
def test_tensor_data_structure(self):
    """Tensor construction, indexed-slices behavior, and pb round trip."""
    # Test tensor values, without indices
    arr = np.ndarray(shape=[3, 1, 2, 4], dtype=np.int32)
    tensor = Tensor(arr)
    self.assertTrue(np.array_equal(arr, tensor.values))
    self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor()))
    self.assertFalse(tensor.is_indexed_slices())

    # Test tensor values, with indices
    indices = np.array([2, 0, 1])
    tensor = Tensor(arr, indices)
    self.assertTrue(np.array_equal(arr, tensor.values))
    self.assertTrue(np.array_equal(indices, tensor.indices))
    self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor().values))
    self.assertTrue(
        np.array_equal(indices, tensor.to_tf_tensor().indices))
    self.assertTrue(tensor.is_indexed_slices())

    # Test round trip
    # tensor to tensor PB
    tensor = Tensor(arr, indices, name="test")
    pb = tensor.to_tensor_pb()
    self.assertEqual(pb.name, "test")
    self.assertEqual(pb.dim, [3, 1, 2, 4])
    self.assertEqual(pb.dtype, tensor_dtype_pb2.DT_INT32)
    np.testing.assert_array_equal(pb.indices, indices)

    # tensor PB to tensor
    tensor_new = Tensor.from_tensor_pb(pb)
    # Bug fix: assert on the DESERIALIZED tensor's name; the original
    # checked `tensor.name`, which could never fail.
    self.assertEqual(tensor_new.name, "test")
    np.testing.assert_array_equal(tensor_new.values, arr)
    np.testing.assert_array_equal(tensor_new.indices, indices)

    # Test Tensor().to_ndarray()
    values = np.array([[1.0, 2.0], [3.0, 4.0]])
    indices = np.array([0, 2])
    name = "test"
    tensor = Tensor(values, indices, name)
    # to_ndarray is undefined for indexed slices.
    self.assertRaises(NotImplementedError, tensor.to_ndarray)
    tensor = Tensor(values, name=name)
    self.assertTrue(np.allclose(values, tensor.to_ndarray()))
def testEvaluationService(self):
    """End-to-end EvaluationService flow: reject, create, run, finish."""
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        task_d = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )

        # Evaluation metrics will not be accepted if no evaluation ongoing
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            10,
            20,
            0,
            False,
            _eval_metrics_fn,
        )
        model_outputs = [
            Tensor(
                np.array([1, 6, 3], np.float32),
                name=MetricsDictKey.MODEL_OUTPUT,
            ).to_tensor_pb()
        ]
        labels = Tensor(np.array([1, 0, 3], np.float32)).to_tensor_pb()
        self.assertFalse(
            evaluation_service.report_evaluation_metrics(
                1, model_outputs, labels
            )
        )

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        # Add an evaluation task and we can start evaluation
        self.assertEqual(8, len(task_d._todo))
        evaluation_service.add_evaluation_task(False)
        self.assertEqual(8, len(task_d._eval_todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        # Completing all eval tasks retires the job.
        for _ in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        self.assertTrue(evaluation_service._eval_job is None)
        self.assertFalse(evaluation_service.try_to_create_new_job())
def _ndarray_to_tensor_pb(values, name=None, indices=None):
    """Wrap the given ndarray in a Tensor and serialize to a Tensor pb."""
    tensor = Tensor(values, indices, name)
    return tensor.to_tensor_pb()