def test_check_grad(self):
    self.params.reset()
    self.params.init_from_model_pb(self.model_pb)

    grad0 = Tensor("z", None, None)
    with self.assertRaisesRegex(ValueError, "Name error"):
        self.params.check_grad(grad0)

    grad1 = Tensor("x", np.random.uniform(size=(3, 5)), None)
    with self.assertRaisesRegex(ValueError, "Non embedding param error"):
        self.params.check_grad(grad1)

    grad2 = Tensor(
        name="embedding_1",
        values=np.random.uniform(size=(3, 11)),
        indices=np.array([1, 2, 3]),
    )
    with self.assertRaisesRegex(
        ValueError, "ElasticDL embedding param error"
    ):
        self.params.check_grad(grad2)

    grad3 = Tensor(
        name="x",
        values=np.random.uniform(size=(4, 4)),
        indices=np.array([1, 2, 3, 4]),
    )
    with self.assertRaisesRegex(ValueError, "Keras embedding param error"):
        self.params.check_grad(grad3)
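# `check_grad` itself is not part of this excerpt. Below is a minimal
# sketch of the validation the test above implies; the exact shape checks
# and message formats are assumptions inferred from the assertions, not
# the library's actual implementation.
import numpy as np

def check_grad(self, grad):
    name = grad.name
    if name in self.non_embedding_params:
        param_shape = tuple(
            self.non_embedding_params[name].get_shape().as_list()
        )
        if grad.indices is not None:
            # Keras embedding case: dense parameter with an
            # indexed-slices gradient; row width and index range
            # must fit the parameter.
            max_index = int(np.max(grad.indices))
            if (
                max_index >= param_shape[0]
                or grad.values.shape[1] != param_shape[1]
            ):
                raise ValueError("Keras embedding param error: %s" % name)
        elif tuple(grad.values.shape) != param_shape:
            raise ValueError("Non embedding param error: %s" % name)
    elif name in self.embedding_params:
        # ElasticDL embedding case: gradient rows must match the
        # embedding dimension declared in the table info.
        if grad.values.shape[1] != self.embedding_params[name].dim:
            raise ValueError("ElasticDL embedding param error: %s" % name)
    else:
        raise ValueError("Name error: %s" % name)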
def setUp(self):
    self.params = Parameters()
    self.model_pb = Model()
    self.infos_pb = self.model_pb.embedding_table_infos
    self.tensors_pb = self.model_pb.dense_parameters
    self.embedding_tables_pb = self.model_pb.embedding_tables

    self.embedding_table_name = "embedding_1"
    self.embedding_dim = 10
    embedding_pb = self.infos_pb.add()
    embedding_pb.name = self.embedding_table_name
    embedding_pb.dim = self.embedding_dim
    embedding_pb.initializer = "uniform"

    arr1 = np.random.uniform(size=(3, 4))
    serialize_ndarray(arr1, self.tensors_pb["x"])
    arr2 = np.random.uniform(size=(4, 5))
    serialize_ndarray(arr2, self.tensors_pb["y"])

    embedding_vectors = np.random.uniform(size=(2, 10))
    embedding_indices = np.array([0, 8])
    serialize_indexed_slices(
        Tensor(None, embedding_vectors, embedding_indices),
        self.embedding_tables_pb[self.embedding_table_name],
    )
def _get_model(self):
    self._timing.start_record_time("get_model")
    # 1. The worker tries to pull dense parameters from the PS; one
    # or more PS instances may still be uninitialized.
    dense_params, uninit_ps = self._ps_client.pull_dense_parameters(
        [i for i in range(self._ps_client.ps_num)],
        self._model_versions_from_ps,
    )

    # 2. The worker pushes its local dense parameters to those PS
    # instances to initialize their partition of the parameters.
    if len(uninit_ps) > 0:
        for ps_id in uninit_ps:
            # Push variables to the PS for initialization.
            parameters = [
                Tensor(name, self._non_embed_vars[name].numpy(), None)
                for name in self._ps_client.ps_to_parameter[ps_id]
            ]
            self._ps_client.push_dense_parameters(
                parameters, ps_id, self._model_versions_from_ps[ps_id]
            )

        ps_params, uninit = self._ps_client.pull_dense_parameters(
            uninit_ps, self._model_versions_from_ps
        )
        if len(uninit) > 0:
            # TODO: support PS fault-tolerance
            raise RuntimeError("PS initialization failed")
        dense_params.update(ps_params)

    # 3. Assign the pulled parameters to the local model.
    for k, v in dense_params.items():
        self._non_embed_vars[k].assign(v)

    self._model_version = max(self._model_versions_from_ps)
    self._timing.end_record_time("get_model")
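# The partition scheme behind `ps_to_parameter` / `parameter_to_ps` is not
# shown in this excerpt. A minimal sketch, assuming each dense parameter is
# assigned to a PS instance by a stable hash of its name (the helper name
# `build_ps_partition` is hypothetical):
import hashlib

def build_ps_partition(var_names, ps_num):
    parameter_to_ps = {}
    ps_to_parameter = {ps_id: [] for ps_id in range(ps_num)}
    for name in var_names:
        # Use a stable hash (not Python's salted hash()) so every worker
        # and PS computes the same mapping independently.
        ps_id = int(hashlib.md5(name.encode("utf-8")).hexdigest(), 16) % ps_num
        parameter_to_ps[name] = ps_id
        ps_to_parameter[ps_id].append(name)
    return parameter_to_ps, ps_to_parameter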
def report_gradient_to_ps(self, gradients):
    self._timing.start_record_time("report_gradient")
    grads = []
    for i, v in enumerate(self._non_embed_vars.values()):
        if isinstance(gradients[i], tf.IndexedSlices):
            grad = Tensor(
                v.name,
                gradients[i].values.numpy(),
                gradients[i].indices.numpy(),
            )
        else:
            grad = Tensor(v.name, gradients[i].numpy(), None)
        grads.append(grad)

    edl_grads = []
    edl_embedding_name_values = self._collect_edl_embedding_name_values()
    if edl_embedding_name_values:
        non_embed_vars_n = len(self._non_embed_vars)
        edl_embedding_grads = gradients[non_embed_vars_n:]
        bet_number = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            # One gradient per batch-embedding tensor of this layer;
            # `bet_number` is the offset of this layer's gradients.
            for i in range(len(embedding_and_ids)):
                grad = Tensor(
                    name,
                    edl_embedding_grads[i + bet_number].values.numpy(),
                    edl_embedding_grads[i + bet_number].indices.numpy(),
                )
                edl_grads.append(grad)
            bet_number += len(embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensor %d."
                % (len(edl_embedding_grads), bet_number)
            )

    learning_rate = K.get_value(self._model.optimizer.lr)
    accepted, max_version = self._ps_client.push_gradients(
        grads, edl_grads, learning_rate, self._model_versions_from_ps,
    )
    self._timing.end_record_time("report_gradient")
    return accepted, max_version
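# `_collect_edl_embedding_name_values` is defined elsewhere. A sketch of the
# result shape assumed above, supposing each ElasticDL embedding layer
# records the (batch_embedding, ids) pairs from its forward pass in an
# `embedding_and_ids` attribute (both the attribute and the
# `_embedding_layers` field are assumptions for illustration):
def _collect_edl_embedding_name_values(self):
    name_values = []
    for layer in self._embedding_layers:
        # One entry per layer:
        # (layer name, [(embedding_output_tensor, lookup_ids), ...])
        name_values.append((layer.name, layer.embedding_and_ids))
    return name_values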
def push_gradient_test_setup(self):
    self.var_names = ["test_1", "test_2"]
    self.var_values = [
        np.array([10.0, 20.0, 30.0], np.float32),
        np.array([20.0, 40.0, 60.0], np.float32),
    ]
    self.grad_values0 = [
        np.array([1.0, 2.0, 3.0], np.float32),
        np.array([2.0, 4.0, 6.0], np.float32),
    ]
    self.grad_values1 = [
        np.array([0.0, 0.0, 7.0], np.float32),
        np.array([9.0, 9.0, 6.0], np.float32),
    ]

    dim = self._embedding_info.dim
    self.embedding_table = (
        np.random.rand(4 * dim).reshape((4, dim)).astype(np.float32)
    )
    self.embedding_grads0 = Tensor(
        None,
        np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
        np.asarray([3, 1, 3]),
    )
    self.embedding_grads1 = Tensor(
        None,
        np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
        np.asarray([2, 2, 3]),
    )

    push_model_req = elasticdl_pb2.Model()
    push_model_req.version = self._parameters.version
    for name, value in zip(self.var_names, self.var_values):
        serialize_ndarray(value, push_model_req.dense_parameters[name])
    push_model_req.embedding_table_infos.append(self._embedding_info)
    self._stub.push_model(push_model_req)

    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    self._parameters.embedding_params[self._embedding_info.name].set(
        range(len(self.embedding_table)), self.embedding_table
    )
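# A sketch of the embedding-table contract the `.set(...)` call above
# assumes: a per-table store of one vector per id, addressable by id for
# later pulls and sparse updates. Class and field names here are
# assumptions, not the library's actual types.
import numpy as np

class EmbeddingTableSketch:
    def __init__(self, dim):
        self.dim = dim
        self.vectors = {}  # id -> 1-D np.ndarray of length `dim`

    def set(self, indices, values):
        # Store row i of `values` under the i-th id in `indices`.
        for i, idx in enumerate(indices):
            self.vectors[int(idx)] = np.asarray(values[i])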
def report_gradient_to_ps(self, grads):
    self._timing.start_record_time("report_gradient")
    reqs = [
        elasticdl_pb2.PushGradientsRequest() for i in range(self._ps_num)
    ]
    ps_grads = {}
    non_embed_vars_n = len(self._non_embed_vars)
    for g, v in zip(
        grads[:non_embed_vars_n], self._non_embed_vars.values()
    ):
        ps_id = self._var_to_ps[v.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = {v.name: g}
        else:
            if v.name not in ps_grads[ps_id]:
                ps_grads[ps_id][v.name] = g
            else:
                if isinstance(g, tf.IndexedSlices):
                    ps_grads[ps_id][v.name] = merge_indexed_slices(
                        ps_grads[ps_id][v.name], g
                    )
                else:
                    ps_grads[ps_id][v.name] += g

    for ps_id, pair in ps_grads.items():
        for name, g in pair.items():
            if isinstance(g, tf.IndexedSlices):
                v, i = deduplicate_indexed_slices(g.values, g.indices)
                ps_grads[ps_id][name] = tf.IndexedSlices(v, i)

    for ps_id in ps_grads:
        req = reqs[ps_id]
        for name, g in ps_grads[ps_id].items():
            # Keras embedding layer has a dense parameter,
            # but an indexed slices type gradient
            if isinstance(g, tf.IndexedSlices):
                serialize_indexed_slices(
                    Tensor(None, g.values.numpy(), g.indices.numpy()),
                    req.gradients.embedding_tables[name],
                )
            else:
                serialize_ndarray(
                    g.numpy(), req.gradients.dense_parameters[name]
                )

    edl_embedding_name_values = self._collect_edl_embedding_name_values()
    if edl_embedding_name_values:
        edl_embedding_grads = grads[non_embed_vars_n:]
        bet_number = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            bet_number += len(embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensor %d."
                % (len(edl_embedding_grads), bet_number)
            )

        grad_accum_iter = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            g_values = None
            g_indices = None
            for _, ids in embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with sparse gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids

            # Sum up the values of the duplicated indices in the
            # gradients. It can reduce the gradient payload of the
            # dense embedding.
            g_values, g_indices = deduplicate_indexed_slices(
                values=g_values, indices=g_indices
            )
            results = scatter_embedding_vector(
                g_values.numpy(), g_indices.numpy(), self._ps_num
            )
            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                serialize_indexed_slices(
                    Tensor(None, gv, gi),
                    req.gradients.embedding_tables[name],
                )

    report_futures = []
    for ps_id in range(self._ps_num):
        req = reqs[ps_id]
        req.gradients.version = self._model_versions_from_ps[ps_id]
        req.learning_rate = K.get_value(self._model.optimizer.lr)
        report_future = self._ps_stubs[ps_id].push_gradients.future(req)
        report_futures.append(report_future)

    accepted = False
    max_version = -1
    for report_future in report_futures:
        res = report_future.result()
        if res.accepted:
            accepted = True
        if res.version > max_version:
            max_version = res.version
    self._timing.end_record_time("report_gradient")
    return accepted, max_version
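# `merge_indexed_slices` and `deduplicate_indexed_slices` are used above
# but defined elsewhere. Minimal sketches of the assumed behavior; the
# dedup helper mirrors TensorFlow's internal `_deduplicate_indexed_slices`
# used by its optimizers.
import tensorflow as tf

def merge_indexed_slices(a, b):
    # Concatenate two sparse gradients; duplicated indices are kept here
    # and summed later by deduplicate_indexed_slices. Works for any pair
    # of objects exposing `values` and `indices`.
    return tf.IndexedSlices(
        tf.concat([a.values, b.values], axis=0),
        tf.concat([a.indices, b.indices], axis=0),
    )

def deduplicate_indexed_slices(values, indices):
    # Sum the rows of `values` that share an index so each index appears
    # exactly once in the result.
    unique_indices, positions = tf.unique(indices)
    summed_values = tf.math.unsorted_segment_sum(
        values, positions, tf.size(unique_indices)
    )
    return summed_values, unique_indices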
def push_gradients(self, request, _):
    res = elasticdl_pb2.PushGradientsResponse()
    if self._use_async:
        grad_vars = []
        for name, pb in request.gradients.dense_parameters.items():
            grad = pb_to_ndarray(pb)
            self._parameters.check_grad(Tensor(name, grad, None))
            grad = tf.constant(grad)
            var = self._parameters.get_non_embedding_param(name)
            grad_vars.append((grad, var))

        for name, pb in request.gradients.embedding_tables.items():
            grad = pb_to_indexed_slices(pb)
            self._parameters.check_grad(
                Tensor(name, grad.values, grad.indices)
            )
            if name in self._parameters.non_embedding_params:
                var = self._parameters.get_non_embedding_param(name)
                grad_vars.append((grad, var))
            else:
                grad_vars.append((grad, name))

        learning_rate = request.learning_rate
        # TODO: if request.learning_rate == 0.0, modulate learning_rate
        # in self._optimizer with staleness
        if self._lr_staleness_modulation and learning_rate > 0.0:
            staleness = max(
                1, self._parameters.version - request.gradients.version
            )
            # Modulate the learning rate by staleness
            learning_rate /= staleness

        self._set_optimizer_learning_rate(learning_rate)
        self._optimizer.apply_gradients(grad_vars)
        with self._version_lock:
            self._parameters.version += 1
            self._save_params_to_checkpoint_if_needed()
            version = self._parameters.version
        self._report_version_if_needed(version)

        res.accepted = True
        res.version = self._parameters.version
        return res
    else:
        if (
            request.gradients.version
            < self._parameters.version - self._sync_version_tolerance
        ):
            res.accepted = False
            res.version = self._parameters.version
            return res

        with self._lock:
            for name, pb in request.gradients.dense_parameters.items():
                grad = pb_to_ndarray(pb)
                self._parameters.check_grad(Tensor(name, grad, None))
                if name in self._grads_buffer:
                    self._grads_buffer[name] = (
                        self._grads_buffer[name] + grad
                    )
                else:
                    self._grads_buffer[name] = grad

            for name, pb in request.gradients.embedding_tables.items():
                grad = pb_to_indexed_slices(pb)
                self._parameters.check_grad(
                    Tensor(name, grad.values, grad.indices)
                )
                if name in self._grads_buffer:
                    self._grads_buffer[name] = merge_indexed_slices(
                        self._grads_buffer[name], grad
                    )
                else:
                    self._grads_buffer[name] = grad

            self._grads_n += 1
            res.accepted = True

            updated_version = False
            version = self._parameters.version
            if self._grads_n == self._grads_to_wait:
                grad_vars = []
                for name, grad in self._grads_buffer.items():
                    # Dense gradients are averaged,
                    # while sparse gradients are summed
                    if not isinstance(grad, tf.IndexedSlices):
                        grad = grad / self._grads_to_wait
                        grad = tf.constant(grad)
                    var = self._parameters.get_non_embedding_param(name)
                    if var is None:
                        grad_vars.append((grad, name))
                    else:
                        grad_vars.append((grad, var))

                self._set_optimizer_learning_rate(request.learning_rate)
                self._optimizer.apply_gradients(grad_vars)
                self._grads_n = 0
                self._grads_buffer.clear()

                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
                updated_version = True

        if updated_version:
            self._report_version_if_needed(version)
        res.version = version
        return res
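# The staleness modulation above divides the learning rate by how many
# versions the gradient lags behind the PS. A standalone sketch of that
# arithmetic (the helper name `modulate_learning_rate` is hypothetical):
def modulate_learning_rate(learning_rate, ps_version, grad_version):
    # A gradient computed at version 100 and applied at version 110 has
    # staleness 10, so the learning rate is scaled by 1/10; fresh
    # gradients keep staleness 1 and are applied unchanged.
    staleness = max(1, ps_version - grad_version)
    return learning_rate / staleness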
def push_gradients(
    self, grads, edl_grads, learning_rate, model_versions,
):
    """Push gradients to the PS. There are two kinds of gradients:
    - gradients of normal layers
    - sparse gradients of ElasticDL embedding layers
    """
    reqs = [
        elasticdl_pb2.PushGradientsRequest() for i in range(self.ps_num)
    ]
    ps_grads = {}

    # 1. Handle gradients of normal layers.
    for grad in grads:
        ps_id = self.parameter_to_ps[grad.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = {grad.name: grad}
        else:
            if grad.name not in ps_grads[ps_id]:
                ps_grads[ps_id][grad.name] = grad
            else:
                if grad.indices is not None:
                    ps_grads[ps_id][grad.name] = merge_indexed_slices(
                        ps_grads[ps_id][grad.name], grad
                    )
                else:
                    ps_grads[ps_id][grad.name].values += grad.values

    for ps_id, pair in ps_grads.items():
        for name, grad in pair.items():
            if grad.indices is not None:
                v, i = deduplicate_indexed_slices(
                    grad.values, grad.indices
                )
                ps_grads[ps_id][name] = Tensor(None, v, i)

    for ps_id in ps_grads:
        req = reqs[ps_id]
        for name, grad in ps_grads[ps_id].items():
            # Keras embedding layer has a dense parameter,
            # but an indexed slices type gradient
            if grad.indices is not None:
                serialize_indexed_slices(
                    Tensor(None, grad.values, grad.indices),
                    req.gradients.embedding_tables[name],
                )
            else:
                serialize_ndarray(
                    grad.values, req.gradients.dense_parameters[name]
                )

    # 2. Handle sparse gradients of ElasticDL embedding layers.
    groups = {}
    for grad in edl_grads:
        if grad.name not in groups:
            groups[grad.name] = grad
        else:
            groups[grad.name] = merge_indexed_slices(
                groups[grad.name], grad
            )

    # Sum up the values of the duplicated indices in the gradients.
    # It can reduce the gradient payload of the dense embedding.
    for name, grad in groups.items():
        v, i = deduplicate_indexed_slices(grad.values, grad.indices)
        groups[name] = Tensor(None, v, i)

        results = scatter_embedding_vector(
            groups[name].values, groups[name].indices, self.ps_num
        )
        for ps_id in results:
            req = reqs[ps_id]
            gv, gi = results[ps_id]
            serialize_indexed_slices(
                Tensor(None, gv, gi),
                req.gradients.embedding_tables[name],
            )

    # 3. Push the gradients to each PS.
    report_futures = []
    for ps_id in range(self.ps_num):
        req = reqs[ps_id]
        req.gradients.version = model_versions[ps_id]
        req.learning_rate = learning_rate
        report_future = self.ps_stubs[ps_id].push_gradients.future(req)
        report_futures.append(report_future)

    accepted = False
    max_version = -1
    for report_future in report_futures:
        res = report_future.result()
        if res.accepted:
            accepted = True
        if res.version > max_version:
            max_version = res.version
    return accepted, max_version
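# `scatter_embedding_vector` is also defined outside this excerpt. A
# minimal numpy sketch, assuming embedding rows are owned by the PS
# instance with id `index % ps_num` (the partition rule and numpy inputs
# are assumptions):
import numpy as np

def scatter_embedding_vector(values, indices, ps_num):
    # Split the gradient rows into one (values, indices) pair per PS so
    # each PS receives only the embedding rows in its own partition.
    results = {}
    for ps_id in range(ps_num):
        mask = indices % ps_num == ps_id
        if np.any(mask):
            results[ps_id] = (values[mask], indices[mask])
    return results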