def test_ndarray_to_tensor(self):
    # Wrong type, should raise
    arr = np.array([1, 2, 3, 4])
    self.assertRaises(ValueError, ndarray_to_tensor, arr)

    # Empty array
    arr = np.array([], dtype=np.float32)
    t = ndarray_to_tensor(arr)
    self.assertEqual([0], t.dim)
    self.assertEqual(0, len(t.content))

    # Pathological case, one of the dimensions is 0.
    arr = np.ndarray(shape=[2, 0, 1, 9], dtype=np.float32)
    t = ndarray_to_tensor(arr)
    self.assertEqual([2, 0, 1, 9], t.dim)
    self.assertEqual(0, len(t.content))

    # 1-D array
    arr = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
    t = ndarray_to_tensor(arr)
    self.assertEqual([4], t.dim)
    self.assertEqual(4 * 4, len(t.content))

    # 4-D array (values uninitialized)
    arr = np.ndarray(shape=[2, 1, 3, 4], dtype=np.float32)
    t = ndarray_to_tensor(arr)
    self.assertEqual([2, 1, 3, 4], t.dim)
    self.assertEqual(4 * 2 * 1 * 3 * 4, len(t.content))
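# A minimal sanity check along the lines the assertions above imply: the
# Tensor proto stores the shape in `dim` and the raw float32 bytes in
# `content`, so len(content) should be 4 * arr.size. Illustrative sketch
# only; the shape below is made up, and `np` is the numpy import assumed
# throughout these snippets.
def test_ndarray_to_tensor_sizes(self):
    arr = np.arange(6, dtype=np.float32).reshape(2, 3)
    t = ndarray_to_tensor(arr)
    self.assertEqual([2, 3], t.dim)
    self.assertEqual(4 * arr.size, len(t.content))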
def report_gradient(self, grads):
    """
    Report gradients to the PS; return (accepted, model_version)
    from the RPC call.
    """
    req = elasticdl_pb2.ReportGradientRequest()
    origin_vars = self._model.trainable_variables
    origin_var_n = len(origin_vars)
    # should keep the same order as self.get_trainable_items()
    for g, v in zip(grads[:origin_var_n], origin_vars):
        if isinstance(g, tf.IndexedSlices):
            req.gradient[v.name].CopyFrom(
                ndarray_to_tensor(
                    g.values.numpy(), tuple(g.indices.numpy())
                )
            )
        else:
            req.gradient[v.name].CopyFrom(ndarray_to_tensor(g.numpy()))

    # deal with gradients of ElasticDL embedding layer
    # should keep the same order as self.get_trainable_items()
    if self._embedding_layers:
        grads_edlembedding = grads[origin_var_n:]

        bet_number = 0
        for layer in self._embedding_layers:
            bet_number += len(layer.bet_ids_pair)
        if len(grads_edlembedding) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensors %d."
                % (len(grads_edlembedding), bet_number)
            )

        it = 0
        for layer in self._embedding_layers:
            g_values = None
            g_indices = None
            for bet, ids in layer.bet_ids_pair:
                grad = grads_edlembedding[it]
                it += 1
                # ElasticDL embedding layer with sparse gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids

            req.gradient[layer.name].CopyFrom(
                ndarray_to_tensor(
                    g_values.numpy(), tuple(g_indices.numpy())
                )
            )

    req.model_version = self._model_version
    res = self._stub.ReportGradient(req)
    return res.accepted, res.model_version
def testEvaluationJob(self):
    model_version = 1
    total_tasks = 5
    latest_chkp_version = 2
    job = _EvaluationJob(model_version, total_tasks)
    self.assertEqual(0, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # Now make 4 tasks finished
    for i in range(4):
        job.complete_task()
    self.assertEqual(4, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # One more task finishes
    job.complete_task()
    self.assertEqual(5, job._completed_tasks)
    self.assertTrue(job.finished())
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # No new model checkpoint
    latest_chkp_version = job.model_version
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))
    latest_chkp_version = job.model_version + 1
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # At the beginning, no metrics
    self.assertFalse(job._evaluation_metrics)

    # Start to report metrics
    evaluation_version = job.model_version + 1
    evaluation_metrics = {
        "mse": ndarray_to_tensor(np.array([100, 200], dtype=np.float32))
    }
    self.assertFalse(
        job.report_evaluation_metrics(
            evaluation_version, evaluation_metrics
        )
    )
    self.assertFalse(job._evaluation_metrics)

    evaluation_version = job.model_version
    self.assertTrue(
        job.report_evaluation_metrics(
            evaluation_version, evaluation_metrics
        )
    )
    # One more
    evaluation_metrics = {
        "mse": ndarray_to_tensor(np.array([300, 400], dtype=np.float32))
    }
    job.report_evaluation_metrics(evaluation_version, evaluation_metrics)
    self.assertTrue(
        np.array_equal(
            np.array([200, 300], dtype=np.float32),
            job.get_evaluation_summary().get("mse"),
        )
    )
def report_gradient(self, grads):
    """
    Report gradients to the PS; return (accepted, model_version)
    from the RPC call.
    """
    req = elasticdl_pb2.ReportGradientRequest()

    non_embed_vars_n = len(self._non_embed_vars)
    # The first `non_embed_vars_n` items in `grads` are gradients for
    # `self._non_embed_vars`
    for g, v in zip(grads[:non_embed_vars_n], self._non_embed_vars):
        if isinstance(g, tf.IndexedSlices):
            req.gradient[v.name].CopyFrom(
                ndarray_to_tensor(
                    g.values.numpy(), tuple(g.indices.numpy())
                )
            )
        else:
            req.gradient[v.name].CopyFrom(ndarray_to_tensor(g.numpy()))

    # Accumulate gradients of ElasticDL embedding layer
    if self._embedding_layers:
        # The `edl_embedding_grads` are gradients for bets in
        # `self._embedding_layers`
        edl_embedding_grads = grads[non_embed_vars_n:]

        # Check that the number of bets equals the number of gradients.
        # Please note that every embedding layer may have more than one
        # `embedding_and_ids` pair.
        bet_number = 0
        for layer in self._embedding_layers:
            bet_number += len(layer.embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensors %d."
                % (len(edl_embedding_grads), bet_number)
            )

        grad_accum_iter = 0
        for layer in self._embedding_layers:
            g_values = None
            g_indices = None
            for _, ids in layer.embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with sparse gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids

            req.gradient[layer.name].CopyFrom(
                ndarray_to_tensor(
                    g_values.numpy(), tuple(g_indices.numpy())
                )
            )

    req.model_version = self._model_version
    res = self._stub.ReportGradient(req)
    return res.accepted, res.model_version
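# A hedged illustration of why a gradient may arrive as tf.IndexedSlices in
# the branches above: differentiating through a sparse lookup such as
# tf.gather yields a sparse gradient. The variable shape and indices below
# are made up for demonstration, and `tf` is the tensorflow import assumed
# throughout these snippets.
def _indexed_slices_demo():
    params = tf.Variable(tf.ones([10, 4]))
    with tf.GradientTape() as tape:
        out = tf.gather(params, [1, 3, 3])  # sparse lookup
        loss = tf.reduce_sum(out)
    g = tape.gradient(loss, params)
    # The gradient touches only the gathered rows, so TensorFlow
    # represents it sparsely.
    assert isinstance(g, tf.IndexedSlices)
    # g.values has shape [3, 4]; g.indices is [1, 3, 3]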
def makeGrad():
    """Make a ReportGradientRequest compatible with the model."""
    req = elasticdl_pb2.ReportGradientRequest()
    req.gradient["x"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    req.gradient["y"].CopyFrom(
        ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
    )
    req.model_version = 1
    return req
def report_evaluation_metrics(self, model_outputs, labels):
    """
    Report evaluation metrics to the PS; return (accepted, model_version)
    from the RPC call.
    """
    req = elasticdl_pb2.ReportEvaluationMetricsRequest()
    if not isinstance(model_outputs, dict):
        model_outputs = {MetricsDictKey.MODEL_OUTPUT: model_outputs}
    for name, output in model_outputs.items():
        req.model_outputs[name].CopyFrom(ndarray_to_tensor(output.numpy()))
    req.labels.CopyFrom(ndarray_to_tensor(labels.numpy()))
    req.model_version = self._model_version
    res = self._stub.ReportEvaluationMetrics(req)
    return res.accepted, res.model_version
def report_variable(self):
    """
    Report variables to the PS.
    """
    req = elasticdl_pb2.ReportVariableRequest()
    for v in self._model.trainable_variables:
        req.variable[v.name].CopyFrom(ndarray_to_tensor(v.numpy()))
    self._stub.ReportVariable(req)
def testEvaluationService(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        task_d = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )

        # Evaluation metrics will not be accepted if no evaluation ongoing
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 10, 20, 0, False
        )
        evaluation_metrics = {
            "mse": ndarray_to_tensor(
                np.array([100, 200], dtype=np.float32)
            )
        }
        self.assertFalse(
            evaluation_service.report_evaluation_metrics(
                1, evaluation_metrics
            )
        )

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        # Add an evaluation task and we can start evaluation
        self.assertEqual(8, len(task_d._todo))
        evaluation_service.add_evaluation_task(0)
        self.assertEqual(16, len(task_d._todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        for i in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        self.assertTrue(evaluation_service._eval_job is None)
        self.assertFalse(evaluation_service.try_to_create_new_job())
def report_evaluation_metrics(self, evaluation_metrics):
    """
    Report evaluation metrics to the PS; return (accepted, model_version)
    from the RPC call.
    """
    req = elasticdl_pb2.ReportEvaluationMetricsRequest()
    for k, v in evaluation_metrics.items():
        v_np = v.numpy()
        # If scalar, convert to a 1-D numpy array of size 1
        if not v_np.shape:
            v_np = v_np.reshape(1)
        req.evaluation_metrics[k].CopyFrom(ndarray_to_tensor(v_np))
    req.model_version = self._model_version
    res = self._stub.ReportEvaluationMetrics(req)
    return res.accepted, res.model_version
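# For reference, a minimal sketch of the scalar case handled above,
# assuming the metrics are tf.keras metric results: something like
# tf.keras.metrics.Mean returns a scalar tensor whose .numpy() has
# shape (), and reshape(1) turns it into a 1-D array of size 1.
# Illustrative only.
def _scalar_metric_demo():
    m = tf.keras.metrics.Mean()
    m.update_state([1.0, 2.0])
    v_np = m.result().numpy()
    assert not v_np.shape              # scalar: shape ()
    assert v_np.reshape(1).shape == (1,)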
def _get_model_no_lock(self):
    pb_model = elasticdl_pb2.Model()
    pb_model.version = self._version
    for k, v in self._model.items():
        pb_model.param[k].CopyFrom(ndarray_to_tensor(v.numpy()))
    return pb_model
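# A hedged sketch of the inverse on the receiving side, assuming the
# tensor_to_ndarray helper used elsewhere in these snippets; the function
# name `pb_model_to_ndarrays` is hypothetical, not from the repo.
def pb_model_to_ndarrays(pb_model):
    # Convert each serialized Tensor proto back to a numpy array.
    return {k: tensor_to_ndarray(v) for k, v in pb_model.param.items()}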
def testReportGradient(self):
    def makeGrad():
        """Make a ReportGradientRequest compatible with the model."""
        req = elasticdl_pb2.ReportGradientRequest()
        req.gradient["x"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
        )
        req.model_version = 1
        return req

    master = MasterServicer(
        3,
        3,
        tf.optimizers.SGD(0.1),
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master._version = 1
    master.set_model_var("x", np.array([2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

    # Report a future version, should raise exception
    req = makeGrad()
    req.model_version = 2
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an old version, should not be accepted
    req = makeGrad()
    req.model_version = 0
    res = master.ReportGradient(req, None)
    self.assertFalse(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report an unknown gradient, should raise.
    req = makeGrad()
    req.gradient["z"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an incompatible gradient, should raise.
    req = makeGrad()
    req.gradient["y"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report a current version, should be accepted.
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report a current version with part of gradients, should be accepted.
    req = makeGrad()
    del req.gradient["y"]
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Gradient should be accumulated.
    np.testing.assert_array_equal(
        np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
    )
    np.testing.assert_array_equal(
        np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
    )
    self.assertEqual(2, master._grad_n)

    # Report a current version, should be accepted, and a new version
    # created
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(2, res.model_version)
    self.assertFalse(master._gradient_sum)
    self.assertEqual(0, master._grad_n)
    np.testing.assert_array_equal(
        # [2] - 0.1 * [0.1]
        np.array([1.99], dtype=np.float32),
        master._model["x"].numpy(),
    )
    np.testing.assert_array_equal(
        # [12, 13] - 0.1 * [0.02, 0.04]
        np.array([11.998, 12.996], dtype=np.float32),
        master._model["y"].numpy(),
    )
def verify(a):
    b = tensor_to_ndarray(ndarray_to_tensor(a))
    np.testing.assert_array_equal(a, b)
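# A minimal sketch of how `verify` might be driven, assuming it lives
# inside a unittest method; the shapes below are illustrative, not from
# the repo (the zero-sized shape mirrors the pathological case tested
# for ndarray_to_tensor above).
def test_round_trip(self):
    for shape in [(4,), (2, 3), (2, 0, 1, 9)]:
        verify(np.random.rand(*shape).astype(np.float32))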
def testEvaluationJob(self):
    model_version = 1
    total_tasks = 5
    latest_chkp_version = 2
    job = _EvaluationJob(_eval_metrics_fn(), model_version, total_tasks)
    self.assertEqual(0, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # Now make 4 tasks finished
    for i in range(4):
        job.complete_task()
    self.assertEqual(4, job._completed_tasks)
    self.assertFalse(job.finished())
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

    # One more task finishes
    job.complete_task()
    self.assertEqual(5, job._completed_tasks)
    self.assertTrue(job.finished())
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # No new model checkpoint
    latest_chkp_version = job.model_version
    self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))
    latest_chkp_version = job.model_version + 1
    self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

    # Start to report metrics
    evaluation_version = job.model_version + 1
    model_outputs = {
        MetricsDictKey.MODEL_OUTPUT: ndarray_to_tensor(
            np.array([[1], [6], [3]], dtype=np.float32)
        )
    }
    labels = ndarray_to_tensor(np.array([[1], [0], [3]], dtype=np.float32))
    self.assertFalse(
        job.report_evaluation_metrics(
            evaluation_version, model_outputs, labels
        )
    )
    evaluation_version = job.model_version
    self.assertTrue(
        job.report_evaluation_metrics(
            evaluation_version, model_outputs, labels
        )
    )
    # One more
    self.assertTrue(
        job.report_evaluation_metrics(
            evaluation_version,
            {
                MetricsDictKey.MODEL_OUTPUT: ndarray_to_tensor(
                    np.array([[4], [5], [6], [7], [8]], dtype=np.float32)
                )
            },
            ndarray_to_tensor(
                np.array([[7], [8], [9], [10], [11]], dtype=np.float32)
            ),
        )
    )
    # 2 of the 8 reported predictions match their labels: acc = 2 / 8
    expected_acc = 0.25
    evaluation_metrics = job.get_evaluation_summary()
    self.assertAlmostEqual(
        expected_acc, evaluation_metrics.get("acc").numpy()
    )
    self.assertAlmostEqual(
        expected_acc, evaluation_metrics.get("acc_fn").numpy()
    )
    # Squared errors sum to 81 over 8 examples: mse = 81 / 8 = 10.125
    self.assertAlmostEqual(10.125, evaluation_metrics.get("mse").numpy())