def rebuild_graph(self, path, model_name, full_assign=False, train_data=None):
    """Rebuild the graph and load saved variable values into it.

    The model and its train ops are rebuilt from ``train_data``, then the
    arrays stored in ``<path>/<model_name>_variables.npz`` are written back
    into the matching TensorFlow variables in a single ``sess.run``.

    Args:
        path: Directory containing the saved ``.npz`` variable archive.
        model_name: Prefix of the archive file name.
        full_assign: If true, non-trainable/global variables (everything not
            listed in the "manual" variable names) are restored as well.
        train_data: Data passed to ``sparse_tensor_interaction``; required.

    Raises:
        ValueError: If ``train_data`` is None.
    """
    if train_data is None:
        raise ValueError("SVDpp model must provide train_data "
                         "when rebuilding graph")

    sparse_implicit_interaction = sparse_tensor_interaction(train_data,
                                                            recent_num=10)
    self._build_model(sparse_implicit_interaction)
    self._build_train_ops()

    variable_path = os.path.join(path, f"{model_name}_variables.npz")
    # An NpzFile keeps the archive open until closed; copy every array into a
    # plain dict and release the file handle right away.
    with np.load(variable_path) as archive:
        variables = dict(archive.items())

    def _row_update(var):
        # Overwrite the first len(saved) rows of `var` with the saved rows.
        saved = variables[var.name]
        return var.scatter_update(
            tf.IndexedSlices(saved, tf.range(len(saved))))

    (user_variables, item_variables, sparse_variables, dense_variables,
     manual_variables) = modify_variable_names(self, trainable=True)

    update_ops = []
    for v in tf.trainable_variables():
        # The two checks are independent on purpose (no elif), as before.
        if user_variables is not None and v.name in user_variables:
            # no need to remove oov values
            update_ops.append(_row_update(v))
        if item_variables is not None and v.name in item_variables:
            update_ops.append(_row_update(v))

    if full_assign:
        (optimizer_user_variables, optimizer_item_variables,
         optimizer_sparse_variables, optimizer_dense_variables,
         _) = modify_variable_names(self, trainable=False)

        other_variables = [
            v for v in tf.global_variables()
            if v.name not in manual_variables
        ]
        for v in other_variables:
            if (optimizer_user_variables is not None
                    and v.name in optimizer_user_variables):
                update_ops.append(_row_update(v))
            elif (optimizer_item_variables is not None
                    and v.name in optimizer_item_variables):
                update_ops.append(_row_update(v))
            else:
                # Everything else is restored with a plain dense assign.
                update_ops.append(v.assign(variables[v.name]))

    self.sess.run(update_ops)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulate scaled gradients; apply them every `self._grad_steps` steps.

    Each gradient is divided by `self._grad_steps` and added into the
    'grad_accum' slot of its variable. When
    `global_step % self._grad_steps == self._grad_steps - 1` the accumulated
    values are handed to the wrapped optimizer `self._opt`, the accumulators
    are zeroed and `self._counter` is incremented; otherwise only the
    accumulation runs. `global_step` is incremented on every call.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs. It is iterated
            twice, so it must not be a one-shot iterator.
        global_step: Step variable. Although it defaults to None, it is passed
            to `tf.mod` and `tf.assign_add`, so a real variable is required.
        name: Unused in this implementation.

    Returns:
        A grouped op that performs the accumulate/apply step and the
        `global_step` increment.
    """
    with tf.init_scope():
        self._create_slots([v for (_, v) in grads_and_vars])
    # `accums` holds the results of the assign_add calls (not the slot
    # variables themselves); `variables` holds the matching variables.
    accums = []
    variables = []
    for g, v in grads_and_vars:
        accum = self.get_slot(v, 'grad_accum')
        variables.append(v)
        if isinstance(g, tf.IndexedSlices):
            # Scale only the values; indices and dense_shape are kept.
            scaled_grad = tf.IndexedSlices(
                g.values / self._grad_steps, g.indices,
                dense_shape=g.dense_shape)
            accums.append(accum.assign_add(scaled_grad))  # pytype: disable=attribute-error
        else:
            accums.append(accum.assign_add(g / self._grad_steps))  # pytype: disable=attribute-error

    def _apply_and_zero():
        # Apply the accumulated gradients, then reset the accumulators.
        apply_op = self._opt.apply_gradients(list(zip(accums, variables)))
        with tf.control_dependencies([apply_op]):
            # NOTE(review): `accum` here iterates over the assign_add results
            # stored in `accums`; presumably these alias the slot variables --
            # confirm for the variable type in use.
            zero_op = [tf.assign(accum, tf.zeros_like(accum))
                       for accum in accums]
        return tf.group(zero_op, tf.assign_add(self._counter, 1))

    def _accum():
        # Accumulation only: no optimizer update on this step.
        return tf.group(accums)

    accum_step = tf.cond(
        tf.equal(tf.mod(global_step, self._grad_steps),
                 self._grad_steps - 1), _apply_and_zero, _accum)

    # Bump global_step only after the accumulate/apply branch has run.
    with tf.control_dependencies([accum_step]):
        global_step = tf.assign_add(global_step, 1)
        return tf.group(global_step)
def _get_model_spec(self, features, labels, mode):
    """Create the SparseFLModel for `mode` and run the user model_fn on it.

    Works on a shallow copy of `features`. In PREDICT mode the three
    'fids_*' entries are popped, combined into a `tf.IndexedSlices` and
    replaced by the output of `self._preprocess_fids`.

    Returns:
        A `(spec, model)` tuple: whatever `self._model_fn` returned and the
        model instance that was passed to it.
    """
    features = features.copy()

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Pop order matters only for which missing key raises first.
        fid_indices = features.pop('fids_indices')
        fid_values = features.pop('fids_values')
        fid_dense_shape = features.pop('fids_dense_shape')
        fid_slices = tf.IndexedSlices(indices=fid_indices,
                                      values=fid_values,
                                      dense_shape=fid_dense_shape)
        preprocessed = self._preprocess_fids(fid_slices, self._slot_configs)
        features.update(preprocessed)

    bias_embedding = embedding.Embedding(self._bias_slot_configs,
                                         devices=self._embedding_devices)
    bias_tensor = bias_embedding.lookup(features)

    # The vector embedding is optional; both handles stay None without it.
    vec_embedding = None
    vec_tensor = None
    if self._vec_slot_configs is not None:
        vec_embedding = embedding.Embedding(self._vec_slot_configs,
                                            devices=self._embedding_devices)
        vec_tensor = vec_embedding.lookup(features)

    example_id = features.get('example_id')
    model = SparseFLModel(self._role,
                          self._bridge,
                          example_id,
                          config_run=False,
                          bias_tensor=bias_tensor,
                          bias_embedding=bias_embedding,
                          vec_tensor=vec_tensor,
                          vec_embedding=vec_embedding,
                          feature_columns=self._feature_columns)

    spec = self._model_fn(model, features, labels, mode)
    assert model._frozen, "Please finalize model in model_fn"
    return spec, model
def matmul_sparse_dense(A, B, name=None, transpose_a=False, transpose_b=False):  # pylint: disable=invalid-name
    """Multiply a sparse (IndexedSlices) matrix A by a dense matrix B.

    The product is computed on `A.values` only; the indices of `A` are
    reused unchanged for the result.

    Args:
      A: tf.IndexedSlices with dense shape [m, n].
      B: tf.Tensor with shape [n, k].
      name: str. Name of op.
      transpose_a: Bool. Forwarded to `tf.matmul` for `A.values`.
        (Default: False)
      transpose_b: Bool. Forwarded to `tf.matmul` for `B`. (Default: False)

    Returns:
      tf.IndexedSlices holding the rows of matmul(A, B) at `A.indices`.

    Raises:
      ValueError: If A doesn't represent a matrix.
      ValueError: If B is not rank-2.
    """
    with tf.name_scope(name, "matmul_sparse_dense", [A, B]):
        a_is_matrix = (A.indices.shape.ndims == 1
                       and A.values.shape.ndims == 2)
        if not a_is_matrix:
            raise ValueError("A must represent a matrix. Found: %s." % A)
        if B.shape.ndims != 2:
            raise ValueError("B must be a matrix.")

        product_rows = tf.matmul(A.values,
                                 B,
                                 transpose_a=transpose_a,
                                 transpose_b=transpose_b)
        # Row count comes from A, column count from the product itself.
        result_shape = tf.stack([A.dense_shape[0], product_rows.shape[1]])
        return tf.IndexedSlices(product_rows,
                                A.indices,
                                dense_shape=result_shape)
def gradients_assign_add(ref, value):
    """Return `ref + value`, keeping `ref` sparse when it is IndexedSlices.

    For a `tf.IndexedSlices` `ref`, only the `.values` of both operands are
    added; the indices and dense_shape of `ref` are reused as-is.
    NOTE(review): this presumably relies on `value` carrying the same
    indices as `ref` -- confirm against callers.
    """
    if not isinstance(ref, tf.IndexedSlices):
        return ref + value

    summed_values = ref.values + value.values
    return tf.IndexedSlices(summed_values, ref.indices, ref.dense_shape)
def gradients(self, objective, parameters):
    """Compute gradients of the objective and add Gaussian noise to them.

    Noise is drawn with `tf.random_normal` and scaled by `self.noise_stdev`.

    Args:
      objective: The objective op (e.g. output of self.objective())
      parameters: A list of tensors (the parameters to optimize)

    Returns:
      A list with one entry per parameter, in the same order as the given
      list. Entries are noisy gradients (`tf.Tensor`, or `tf.IndexedSlices`
      for sparse gradients); a parameter for which `tf.gradients` returns
      None yields None.
    """
    grads = tf.gradients(objective, list(parameters))

    noisy_grads = []
    for grad in grads:
        if grad is None:
            # tf.gradients yields None for parameters the objective does not
            # depend on; pass it through instead of crashing on `None + ...`.
            noisy_grads.append(None)
            continue

        if isinstance(grad, tf.IndexedSlices):
            noise = self.noise_stdev * tf.random_normal(
                tf.shape(grad.values))
            # Keep dense_shape so the result is still convertible to dense.
            new_grad = tf.IndexedSlices(grad.values + noise,
                                        grad.indices,
                                        dense_shape=grad.dense_shape)
        else:
            # Use the dynamic shape: the static shape may have unknown dims.
            noise = self.noise_stdev * tf.random_normal(tf.shape(grad))
            new_grad = grad + noise
        noisy_grads.append(new_grad)
    return noisy_grads
def _preprocess_fids(self, fids, configs):
    """Run the multi-device fid preprocessing once per slot config.

    If `fids.indices` is rank 2, only its first column is kept as indices.
    The dictionaries returned for each config are merged into one; later
    configs overwrite earlier ones on key collisions.
    """
    if fids.indices.shape.rank == 2:
        first_column = fids.indices[:, 0]
        fids = tf.IndexedSlices(indices=first_column,
                                values=fids.values,
                                dense_shape=fids.dense_shape)

    merged = {}
    for slot_config in configs:
        per_config = operator._multidevice_preprocess_fids(
            fids, slot_config, num_shards=self._num_shards)
        merged.update(per_config)
    return merged
def average_gradient(tower_grads):
    """Merge per-tower sparse gradients into one gradient per variable.

    `tower_grads` holds one list of (gradient, variable) pairs per tower.
    For each variable, the towers' `.values` are divided by `num_gpus` and
    concatenated, and their `.indices` are concatenated. The variable is
    taken from the first tower.

    Returns:
        A list of (tf.IndexedSlices, variable) pairs.
    """

    def _merge(per_tower):
        # Build order: scaled values, then indices, then the slices.
        scaled_values = tf.concat(
            [g.values / num_gpus for g, _ in per_tower], 0)
        all_indices = tf.concat([g.indices for g, _ in per_tower], 0)
        merged_grad = tf.IndexedSlices(scaled_values, all_indices)
        shared_var = per_tower[0][1]
        return (merged_grad, shared_var)

    return [_merge(per_tower) for per_tower in zip(*tower_grads)]
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
    """Apply a sparse gradient by densifying it first (logs a warning)."""
    tf.logging.warning("MultistepAdamOptimizer does not support sparse updates")

    # Converting IndexedSlices to a dense Tensor sums rows that share an
    # index, so duplicate `indices` are handled correctly. A real sparse
    # implementation would rather override _resource_apply_sparse, which
    # receives de-duplicated indices.
    sparse_grad = tf.IndexedSlices(values=grad,
                                   indices=indices,
                                   dense_shape=tf.shape(var))
    densified = tf.convert_to_tensor(sparse_grad)
    return self._apply_cond(self._resource_apply_dense_in_action,
                            densified, var)
def average_sparse(grad_and_vars):
    """Average sparse gradients of one variable across towers.

    With a single (gradient, variable) pair the gradient is returned
    untouched. Otherwise indices and values of all gradients are
    concatenated, the values are divided by the number of pairs, and the
    dense_shape of the first gradient is reused.
    """
    tower_count = len(grad_and_vars)
    first_grad = grad_and_vars[0][0] if tower_count == 1 else None
    if tower_count == 1:
        return first_grad

    index_parts = [g.indices for g, _ in grad_and_vars]
    value_parts = [g.values for g, _ in grad_and_vars]

    merged_indices = tf.concat(index_parts, 0)
    averaged_values = tf.concat(value_parts, 0) / tower_count
    return tf.IndexedSlices(averaged_values, merged_indices,
                            grad_and_vars[0][0].dense_shape)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulate gradients over `self._steps` calls, then apply them.

    Each call increments a "counter" variable and adds `grad / self._steps`
    into a per-parameter "<name>/accum" variable. When the incremented
    counter is a multiple of `self._steps`, the accumulated values are
    applied through the wrapped optimizer `self._opt`, and the accumulators
    and the counter are reset to zero.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs; pairs where
            either element is None are skipped.
        global_step: Forwarded to `self._opt.apply_gradients`.
        name: Forwarded to `self._opt.apply_gradients`.

    Returns:
        The `tf.cond` op selecting between "apply and zero" and "accumulate".
    """
    # No dtype/trainable is given here, so tf.get_variable defaults apply.
    counter = tf.get_variable(shape=[],
                              initializer=tf.zeros_initializer,
                              name="counter")
    accums = []
    update_op = []
    variables = []
    for (grad, param) in grads_and_vars:
        if grad is None or param is None:
            continue
        if self._grad_clipping is not None:
            # The bound is scaled by the number of steps and applied to the
            # raw gradient, i.e. before the division by self._steps below.
            grad_clipping = self._steps * self._grad_clipping
            grad = tf.clip_by_value(grad, -grad_clipping, grad_clipping)
        variables.append(param)
        param_name = self._get_variable_name(param.name)
        # One non-trainable float32 accumulator per parameter.
        accum = tf.get_variable(name=param_name + "/accum",
                                shape=param.shape.as_list(),
                                dtype=tf.float32,
                                trainable=False,
                                initializer=tf.zeros_initializer())
        accums.append(accum)
        if isinstance(grad, tf.IndexedSlices):
            scaled_grad = tf.IndexedSlices(grad.values / self._steps,
                                           grad.indices,
                                           dense_shape=grad.dense_shape)
            update_op.append(accum.assign_add(scaled_grad))
        else:
            update_op.append(accum.assign_add(grad / self._steps))

    def _apply_and_zero():
        # Finish this step's accumulation before applying.
        with tf.control_dependencies(update_op):
            apply_op = self._opt.apply_gradients(
                list(zip(accums, variables)), global_step, name)
        # Reset accumulators and the counter only after the apply has run.
        with tf.control_dependencies([apply_op]):
            zero_op = [
                tf.assign(accum, tf.zeros_like(accum))
                for accum in accums + [counter]
            ]
        return tf.group(zero_op)

    def _accum():
        return tf.group(update_op)

    # Control that the counter has been incremented already
    with tf.control_dependencies([counter.assign_add(1)]):
        return tf.cond(tf.equal(tf.mod(counter, self._steps), 0),
                       _apply_and_zero, _accum)
def splice(obj, input_map, control_inputs=None):
    """Dispatch splicing of `obj` according to its exact type.

    Operations go to `splice_op`, Tensors to `splice_tensor` (with the
    tensor's op looked up in `input_map`), and IndexedSlices are rebuilt
    with each component looked up in `input_map`. The checks use exact type
    identity, so subclasses are rejected.

    Raises:
        AssertionError: If `obj` is none of the supported types.
    """
    kind = type(obj)

    if kind is tf.Operation:
        return splice_op(obj, input_map, control_inputs=control_inputs)

    if kind is tf.Tensor:
        mapped_op = input_map.get(obj.op, obj.op)
        return splice_tensor(obj, mapped_op)

    if kind is tf.IndexedSlices:
        # Unmapped components fall back to the original tensors.
        mapped_values = input_map.get(obj.values, obj.values)
        mapped_indices = input_map.get(obj.indices, obj.indices)
        mapped_shape = input_map.get(obj.dense_shape, obj.dense_shape)
        return tf.IndexedSlices(values=mapped_values,
                                indices=mapped_indices,
                                dense_shape=mapped_shape)

    raise AssertionError(
        f'Could not get deps from{repr(type(obj))} {repr(obj)}')
def testSparseUpdates(self):
    """Test that checks sparse updates.

    Applies one SM3 step with an IndexedSlices gradient touching rows 1 and
    3 of a 4x2 variable, then compares the variable and the 'accumulator_0'
    slot against values recomputed with NumPy.
    """
    with self.cached_session() as sess:
        var = tf.Variable([[0.5, 0.05], [0.05, 1.0], [0.15, 3.0], [0.35, 2.0]])
        # A sparse gradient that updates index 1, and 3.
        grad_np = [[0.1, 0.05], [0.01, 1.5]]
        indices_np = [1, 3]
        shape = [2, 2]
        # NOTE(review): `shape` is the shape of the gradient values and is
        # also passed as dense_shape, although the variable is 4x2 -- confirm
        # this is intended.
        grad = tf.IndexedSlices(
            tf.constant(grad_np, shape=shape),
            tf.constant(indices_np),  # indices
            tf.constant(shape))  # shape
        opt = sm3.SM3Optimizer(learning_rate=self._learning_rate,
                               momentum=self._momentum)
        step = opt.apply_gradients([(grad, var)])
        sess.run(tf.global_variables_initializer())
        # Check that variable and momentum are as expected before starting
        # training.
        var_np = sess.run(var)
        self.assertAllClose(
            [[0.5, 0.05], [0.05, 1.0], [0.15, 3.0], [0.35, 2.0]], var_np)
        # Run one step of training.
        step.run()
        # Expected accumulator: squared gradient on the touched rows only.
        accumulator = numpy.zeros_like(var_np)
        accumulator[indices_np, :] += numpy.square(grad_np)
        # Expected row accumulator: per-row maximum of the accumulator.
        row_accumulator = numpy.amax(accumulator, axis=1, keepdims=True)
        # Expected preconditioned gradient and variable after one step.
        exp_p_grad = grad_np / numpy.sqrt(accumulator[indices_np, :])
        # `exp_var_np` aliases `var_np` (the pre-step values) and is updated
        # in place; `var_np` is rebound to the post-step values right after.
        exp_var_np = var_np
        exp_var_np[indices_np, :] = var_np[
            indices_np, :] - self._learning_rate * exp_p_grad
        var_np = sess.run(var)
        self.assertAllClose(exp_var_np, var_np)
        row_accumulator_var = numpy.reshape(
            sess.run(opt.get_slot(var, 'accumulator_0')), [4, 1])
        self.assertAllClose(row_accumulator_var, row_accumulator)
def _embedding_pooling_gradient(op, grad):
    """Gradient of the embedding pooling op with respect to its weights.

    Helper tensors are recovered from the op's control inputs by name and
    the incoming gradient is un-pooled into one sparse gradient
    (`tf.IndexedSlices`) per weight. The first eight op inputs receive no
    gradient; weights start at input index 8.
    """
    num_weights = op.get_attr("num_weights")
    control_inputs = op.control_inputs

    def _get_control_input_by_name(name):
        # Exactly one control input may contain `name` in its own name.
        matches = [ci for ci in control_inputs if name in ci.name]
        assert len(matches) == 1
        return matches[0].outputs[0]

    num_unique_fids_per_partition = _get_control_input_by_name(
        'num_unique_fids_per_partition')
    fid_to_unique_index = _get_control_input_by_name('fid_to_unique_index')
    unique_fid_hash = []
    for weight_idx in range(num_weights):
        unique_fid_hash.append(
            _get_control_input_by_name('unique_fid_hash_%d' % weight_idx))
    assert len(unique_fid_hash) == num_weights

    values = lagrange_lite_ops.lagrange_embedding_unpooling(
        num_weights=num_weights,
        weight_sizes=op.get_attr('weight_sizes'),
        use_fid_v2=op.get_attr('use_fid_v2'),
        output_grad=grad,
        instance_ids=op.inputs[1],
        fids=op.inputs[2],
        fid_to_unique_index=fid_to_unique_index,
        num_unique_fids_per_partition=num_unique_fids_per_partition,
        slot_size=op.inputs[3],
        slot_weight_index=op.inputs[4],
        slot_output_offset=op.inputs[5])

    weight_grads = []
    for offset, (fid_hash, grad_values) in enumerate(
            zip(unique_fid_hash, values)):
        weight = op.inputs[8 + offset]
        weight_shape = tf.shape(weight, out_type=tf.int64)
        sparse_grad = tf.IndexedSlices(indices=fid_hash,
                                       values=grad_values,
                                       dense_shape=weight_shape)
        weight_grads.append(sparse_grad)

    return [None] * 8 + weight_grads
def matmul_diag_sparse(A_diag, B, name=None):  # pylint: disable=invalid-name
    """Multiply a diagonal matrix A (given by its diagonal) by a sparse B.

    Every stored row of `B` is scaled by the diagonal entry selected by that
    row's index.

    Args:
      A_diag: diagonal entries of matrix A of shape [m, m].
      B: tf.IndexedSlices. Represents matrix of shape [m, n].
      name: str. Name of op.

    Returns:
      tf.IndexedSlices resulting from matmul(A, B), with the indices and
      dense_shape of `B`.

    Raises:
      ValueError: If A_diag is not rank-1.
      ValueError: If B doesn't represent a matrix.
    """
    with tf.name_scope(name, "matmul_diag_sparse", [A_diag, B]):
        A_diag = tf.convert_to_tensor(A_diag)
        if A_diag.shape.ndims != 1:
            raise ValueError("A_diag must be a rank-1 Tensor.")

        b_is_matrix = (B.indices.shape.ndims == 1
                       and B.values.shape.ndims == 2)
        if not b_is_matrix:
            raise ValueError("B must represent a matrix. Found: %s." % B)

        row_scale = tf.gather(A_diag, B.indices)
        # Append singleton axes so the scale broadcasts over B's columns.
        trailing_ones = [1] * (B.values.shape.ndims - 1)
        row_scale = tf.reshape(row_scale,
                               list(row_scale.shape) + trailing_ones)
        scaled_rows = row_scale * B.values
        return tf.IndexedSlices(scaled_rows,
                                B.indices,
                                dense_shape=B.dense_shape)
def tgn_memory(
        n_nodes: int,
        memory_size: int,
        time_embedding_size: int,
        node_ids: tf.Tensor,
        write_idx: tf.Tensor,
        write_mask: tf.Tensor,
        write_features: tf.Tensor,
        write_times: tf.Tensor,
) -> TgnMemory:
    """Create TGN memory read & update operations.

    A trainable memory for nodes in a temporal interaction graph. The memory
    state is computed using the latest interaction event that touched a node.
    The update is a GRU cell, taking as input the previous memory of both
    source and destination nodes for that edge, the edge feature vector and
    time difference from interaction to current time.

    Note that the GRU cell is computed lazily when the memory is read, rather
    than when it is stored, to support a single step of truncated
    backpropagation through time and obtain a gradient for GRU variables.

    Please see "Temporal Graph Network" (https://arxiv.org/abs/2006.10637)
    for full details.

    Arguments:

      n_nodes -- total number of slots in the memory

      memory_size -- size of stored state in the memory / GRU cell output size

      time_embedding_size -- size of the time encoding activation provided to
                             the GRU cell

      node_ids -- shape (n_read), (-1 <= ID < n_nodes), the memory locations
                  to be read

      write_idx -- shape (2, n_write), (0 <= idx < n_read), the (src, dst)
                   indices of edges, selecting nodes that should be written
                   with their updated memory state

      write_mask -- shape (2, n_write), boolean tensor for elements in
                    write_idx that should be written (true) or skipped
                    (false), such that each memory location is written at
                    most once

      write_features -- shape (n_write, feature_size), input features to be
                        stored and used to compute the memory when it is next
                        accessed

      write_times -- shape (n_write), edge event times to be stored and used
                     to compute the memory when it is next accessed

    Returns:

      TgnMemory(
        output -- tensor of shape (n_read, memory_size), current memory for
                  node_ids
        last_update -- tensor of shape (n_read), last update of output
        updates -- tuple of operations to run to update the memory
      )
    """
    # Validate shapes and recover n_write / feature_size from them.
    assert_shape(node_ids, (None, ))
    _, n_write = assert_shape(write_idx, (2, None))
    assert_shape(write_mask, (2, n_write))
    _, feature_size = assert_shape(write_features, (n_write, None))
    assert_shape(write_times, (n_write, ))
    dtype = write_features.dtype

    # Declare memory
    # As an optimisation, we concatenate the 6 fields required by the memory
    # into 2 tensors, one consisting of ints, the other of floats.
    # This requires some extra code to slice and concat, but means we can use
    # 2 (dynamic) gather operations instead of 6.

    # Each row: [last_update, dt, neighbour]
    v_ints = tf.get_variable(
        "ints",
        shape=(1 + n_nodes, 3),
        dtype=tf.int32,
        trainable=False,
        initializer=tf.zeros_initializer(),
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, TGN_MEMORY_VARIABLES_KEY],
    )
    # Each row: [memory, features, direction]
    v_floats = tf.get_variable(
        "floats",
        shape=(1 + n_nodes, memory_size + feature_size + 2),
        dtype=dtype,
        trainable=False,
        initializer=tf.zeros_initializer(),
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, TGN_MEMORY_VARIABLES_KEY],
    )

    # Memory[0] is used for padding (node_ids == -1)
    safe_node_ids = 1 + node_ids

    # Read memory for node_ids
    node_ints = tf.gather(v_ints, safe_node_ids)
    node_last_update, node_dt, node_neighbour_idx = tf.unstack(node_ints,
                                                               axis=1)
    # The stored neighbour index already addresses rows of the memory
    # (it is written below from `indices`, which include the +1 offset).
    node_neighbour = tf.gather(v_floats[:, :memory_size], node_neighbour_idx)
    node_time_encoding = time_encoder(tf.cast(node_dt, tf.float32),
                                      time_embedding_size, dtype)

    node_floats = tf.gather(v_floats, safe_node_ids)
    node_self = node_floats[:, :memory_size]
    node_features = node_floats[:, memory_size:memory_size + feature_size]
    node_direction = node_floats[:, memory_size + feature_size:]

    # Lazily compute the current memory. The two direction columns select
    # whether self/neighbour go to the first or the second input position.
    node_memory = gru_cell(
        node_self,
        tf.concat(
            [
                node_direction[:, 0, tf.newaxis] * node_self +
                node_direction[:, 1, tf.newaxis] * node_neighbour,
                node_direction[:, 1, tf.newaxis] * node_self +
                node_direction[:, 0, tf.newaxis] * node_neighbour,
                node_features,
                node_time_encoding,
            ],
            axis=1,
        ),
    )

    # Write memory according to (write_idx, write_mask)
    flat_write_idx = tf.reshape(write_idx, (-1, ))
    indices = tf.gather(safe_node_ids, flat_write_idx)
    # Masked-out writes get index 0, i.e. they land in the padding row.
    masked_indices = indices * tf.cast(tf.reshape(write_mask, (-1, )),
                                       indices.dtype)
    # Duplicate the per-edge values so there is one entry for src and dst.
    p_last_update = tf.reshape(tf.tile(write_times[tf.newaxis], (2, 1)),
                               (-1, ))
    p_dt = p_last_update - tf.gather(node_last_update, flat_write_idx)
    # Swap src and dst indices to get the neighbour index for each node
    p_neighbour = tf.roll(indices, n_write, 0)
    p_memory = tf.gather(node_memory, flat_write_idx)
    p_features = tf.tile(write_features, (2, 1))
    p_direction = tf.repeat(tf.eye(2, dtype=dtype), n_write,
                            0)  # src=[1, 0], dst=[0, 1]

    # There is already a data dependency, but just to be sure...
    with tf.control_dependencies([node_last_update, node_memory]):
        update_ints = v_ints.scatter_update(
            tf.IndexedSlices(
                tf.stack([p_last_update, p_dt, p_neighbour], axis=1),
                masked_indices))
        update_floats = v_floats.scatter_update(
            tf.IndexedSlices(
                tf.concat([p_memory, p_features, p_direction], axis=1),
                masked_indices))

    return TgnMemory(
        output=node_memory,
        last_update=node_last_update,
        updates=(update_ints, update_floats),
    )
def freelb(model_fn,
           inputs,
           batch_size,
           max_length,
           optimizer=None,
           layer_name='word_embeddings',
           epsilon=0.3,
           n_loop=3):
    """Build FreeLB-style adversarial training ops on an embedding layer.

    The model is run once normally, then `n_loop + 1` more times after
    perturbations have been added to the embedding variable found via
    `layer_name`. Gradients from the perturbed passes are collected and
    averaged with `utils.average_grads_and_vars`; the accumulated
    perturbation is subtracted from the embeddings again before averaging.

    Args:
        model_fn: Callable invoked as `model_fn(inputs, True)`; its result
            must support `['loss']`.
        inputs: Passed through to `model_fn`.
        batch_size: With `max_length`, sets the number of rows of the initial
            perturbation variable (`batch_size * max_length`).
        max_length: See `batch_size`.
        optimizer: Forwarded to `utils.compute_gradients`.
        layer_name: Forwarded to `utils.find_grad_and_var` to select the
            embedding gradient/variable.
        epsilon: Bound of the uniform initialiser and cap on the norm of the
            accumulated perturbation.
        n_loop: Number of iterative perturbation steps.

    Returns:
        `AdversarialOutput(model_outputs, grads_and_vars)` with the outputs of
        the clean pass and the averaged gradients.
    """
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        model_outputs = model_fn(inputs, True)
        grads_and_vars = utils.compute_gradients(model_outputs['loss'],
                                                 optimizer)
    # Gradient of the loss with respect to the embeddings.
    # NOTE(review): the code below reads `.indices` / `.dense_shape`, so this
    # gradient is presumably a tf.IndexedSlices -- confirm.
    embedding_gradients, embeddings = utils.find_grad_and_var(
        grads_and_vars, layer_name)

    init_r = tf.get_variable(
        'init_r',
        shape=[batch_size * max_length,
               embeddings.shape.as_list()[-1]],
        initializer=tf.random_uniform_initializer(minval=-epsilon,
                                                  maxval=epsilon),
        trainable=False)
    # Re-run the initialiser as part of the graph before `init_r` is used.
    init_op = tf.variables_initializer([init_r])
    with tf.control_dependencies([init_op]):  # fix perturbation
        # Scale the randomly initialised perturbation by its infinity norm.
        # NOTE(review): the original comment said this keeps the norm of `r`
        # below epsilon; the division shown here bounds entries by 1 -- confirm.
        r = tf.divide(init_r, tf.norm(init_r, np.inf))
        r = tf.IndexedSlices(values=r,
                             indices=embedding_gradients.indices,
                             dense_shape=embedding_gradients.dense_shape)
        attack_op = embeddings.assign(embeddings + r)
    # attack
    acc_r = r
    all_grads_and_vars = []
    for k in range(n_loop):
        # Each pass runs only after the previous perturbation was applied.
        with tf.variable_scope(tf.get_variable_scope(),
                               reuse=tf.AUTO_REUSE), tf.control_dependencies(
                                   [attack_op]):
            adv_outputs = model_fn(inputs, True)
            attack_grad_and_vars = utils.compute_gradients(
                adv_outputs['loss'], optimizer)
            all_grads_and_vars.append(attack_grad_and_vars)
            gradients, _ = utils.find_grad_and_var(attack_grad_and_vars,
                                                   layer_name)
            # Normalised gradient step, scaled by 1 / n_loop.
            tmp_r = tf.multiply(1 / n_loop,
                                gradients / (tf.norm(gradients) + 1e-9))

            # In order not to shuffle the distribution of gradient-
            # induced perturbation, we use norm to scale instead of
            # simply clip the values.
            norm = tf.norm(acc_r + tmp_r)
            cur_r = tf.cond(norm > epsilon, lambda:
                            (acc_r + tmp_r) * tf.divide(epsilon, norm), lambda:
                            (acc_r + tmp_r))
            r = cur_r - acc_r  # calculate current step
            attack_op = embeddings.assign(embeddings + r)
            acc_r = cur_r

    # restore
    with tf.variable_scope(tf.get_variable_scope(),
                           reuse=tf.AUTO_REUSE), tf.control_dependencies(
                               [attack_op]):
        # Final perturbed pass, then undo the accumulated perturbation.
        attack_outputs = model_fn(inputs, True)
        attack_grad_and_vars = utils.compute_gradients(attack_outputs['loss'],
                                                       optimizer)
        all_grads_and_vars.append(attack_grad_and_vars)
        restore_op = embeddings.assign(embeddings - acc_r)

    # sum up
    with tf.control_dependencies([restore_op]):
        grads_and_vars = utils.average_grads_and_vars(all_grads_and_vars)

    return AdversarialOutput(model_outputs, grads_and_vars)
def _clip_by_global_norm(t_list, clip_norm, use_norm, name=None): """Clips values of multiple tensors by the ratio of the sum of their norms. Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`, this operation returns a list of clipped tensors `list_clipped` and the global norm (`global_norm`) of all tensors in `t_list`. The global norm is expected to be pre-computed and passed as use_norm. To perform the clipping, the values `t_list[i]` are set to: t_list[i] * clip_norm / max(global_norm, clip_norm) where: global_norm = sqrt(sum([l2norm(t)**2 for t in t_list])) If `clip_norm > global_norm` then the entries in `t_list` remain as they are, otherwise they're all shrunk by the global ratio. Any of the entries of `t_list` that are of type `None` are ignored. This is the correct way to perform gradient clipping (for example, see [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063) ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). However, it is slower than `clip_by_norm()` because all the parameters must be ready before the clipping operation can be performed. Args: t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None. clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio. use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global norm to use. If not provided, `global_norm()` is used to compute the norm. name: A name for the operation (optional). Returns: list_clipped: A list of `Tensors` of the same type as `list_t`. global_norm: A 0-D (scalar) `Tensor` representing the global norm. Raises: TypeError: If `t_list` is not a sequence. 
""" if not isinstance(t_list, collections.Sequence) or isinstance( t_list, six.string_types): raise TypeError('t_list should be a sequence') t_list = list(t_list) # Removed as use_norm should always be passed # if use_norm is None: # use_norm = global_norm(t_list, name) with tf.name_scope(name, 'clip_by_global_norm', t_list + [clip_norm]) as name: # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm scale = clip_norm * tf.minimum( 1.0 / use_norm, tf.ones([1], dtype=use_norm.dtype) / clip_norm) values = [ tf.cast( tf.convert_to_tensor( t.values if isinstance(t, tf.IndexedSlices) else t, name='t_%d' % i, ), dtype=tf.float32, ) if t is not None else t for i, t in enumerate(t_list) ] values_clipped = [] for i, v in enumerate(values): if v is None: values_clipped.append(None) else: with tf.colocate_with(v): values_clipped.append( tf.identity(v * scale, name='%s_%d' % (name, i))) list_clipped = [ tf.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance( t, tf.IndexedSlices) else c_v for (c_v, t) in zip(values_clipped, t_list) ] return list_clipped, use_norm
def get_train_ops(loss,
                  tf_variables,
                  train_step,
                  clip_mode=None,
                  grad_bound=None,
                  l2_reg=1e-4,
                  lr_warmup_val=None,
                  lr_warmup_steps=100,
                  lr_init=0.1,
                  lr_dec_start=0,
                  lr_dec_every=10000,
                  lr_dec_rate=0.1,
                  lr_dec_min=None,
                  lr_cosine=False,
                  lr_max=None,
                  lr_min=None,
                  lr_T_0=None,
                  lr_T_mul=None,
                  num_train_batches=None,
                  optim_algo=None,
                  sync_replicas=False,
                  num_aggregate=None,
                  num_replicas=None,
                  get_grad_norms=False,
                  moving_average=None):
    """Build the gradient, learning-rate schedule, optimizer and train op.

    Args:
      loss: Scalar loss; an L2 penalty over `tf_variables` is added when
        `l2_reg > 0`.
      tf_variables: Variables to differentiate and update.
      train_step: Global step tensor, also passed to `apply_gradients`.
      clip_mode: "global", "norm", or None.
      grad_bound: Clipping bound; required when `clip_mode` is set.
      l2_reg: Weight of the L2 penalty.
      lr_warmup_val, lr_warmup_steps: If `lr_warmup_val` is given it is used
        as the learning rate while `train_step < lr_warmup_steps`.
      lr_init, lr_dec_start, lr_dec_every, lr_dec_rate, lr_dec_min:
        Parameters of the staircase exponential decay used when `lr_cosine`
        is false.
      lr_cosine, lr_max, lr_min, lr_T_0, lr_T_mul, num_train_batches:
        Parameters of the cosine schedule with restarts.
      optim_algo: "momentum", "sgd" or "adam".
      sync_replicas, num_aggregate, num_replicas: Wrap the optimizer in
        `tf.train.SyncReplicasOptimizer` when `sync_replicas` is true.
      get_grad_norms: Also return the per-variable gradient norms.
      moving_average: store the moving average of parameters

    Returns:
      `(train_op, learning_rate, grad_norm, opt)`, plus a dict of
      per-variable gradient norms as a fifth element when `get_grad_norms`
      is true.

    Raises:
      NotImplementedError: For an unknown `clip_mode`.
      ValueError: For an unknown `optim_algo`.
    """
    if l2_reg > 0:
        l2_losses = []
        for var in tf_variables:
            l2_losses.append(tf.reduce_sum(var**2))
        l2_loss = tf.add_n(l2_losses)
        loss += l2_reg * l2_loss

    grads = tf.gradients(loss, tf_variables)
    grad_norm = tf.global_norm(grads)

    # Per-variable norms are computed on the unclipped gradients.
    grad_norms = {}
    for v, g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        if isinstance(g, tf.IndexedSlices):
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values**2))
        else:
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g**2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if g is None:
                    # Variables without a gradient stay None for the optimizer.
                    c_g = None
                elif isinstance(g, tf.IndexedSlices):
                    # IndexedSlices takes (values, indices, dense_shape).
                    c_g = tf.IndexedSlices(
                        tf.clip_by_norm(g.values, grad_bound), g.indices,
                        g.dense_shape)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # Append the clipped gradient, not the original one.
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError(
                "Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, ("Need num_train_batches to use"
                                               " lr_cosine")

        # train_step counts batches; convert it to an epoch index.
        curr_epoch = train_step // num_train_batches

        last_reset = tf.Variable(0,
                                 dtype=tf.int32,
                                 trainable=False,
                                 name="last_reset")
        T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i")
        T_curr = curr_epoch - last_reset

        def _update():
            # Restart: record the reset epoch and lengthen the next period.
            update_last_reset = tf.assign(last_reset,
                                          curr_epoch,
                                          use_locking=True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
                lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        def _no_update():
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        learning_rate = tf.cond(tf.greater_equal(T_curr, T_i), _update,
                                _no_update)
    else:
        learning_rate = tf.train.exponential_decay(
            lr_init,
            tf.maximum(train_step - lr_dec_start, 0),
            lr_dec_every,
            lr_dec_rate,
            staircase=True)
        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps),
                                lambda: lr_warmup_val, lambda: learning_rate)

    if optim_algo == "momentum":
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         0.9,
                                         use_locking=True,
                                         use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate,
                                                use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=0.0,
                                     epsilon=1e-3,
                                     use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync."
        assert num_replicas is not None, "Need num_replicas to sync."

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_aggregate,
            total_num_replicas=num_replicas,
            use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(
            opt, average_decay=moving_average)

    train_op = opt.apply_gradients(zip(grads, tf_variables),
                                   global_step=train_step)

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    else:
        return train_op, learning_rate, grad_norm, opt
def compute_gradients(total_loss):
    """Separate the function of gradient computation.

    Builds the learning-rate schedule and the optimizer from `FLAGS`, then
    computes (optionally clipped and layer-wise scaled) gradients of
    `total_loss` with respect to all trainable variables.

    Args:
      total_loss: Loss tensor to differentiate.

    Returns:
      A tuple `(optimizer, grad_and_vars, global_step, monitor_dict)` where
      `grad_and_vars` is a list of (gradient, variable) pairs and
      `monitor_dict` holds "local_gnorm" and "learning_rate".

    Raises:
      ValueError: For an unknown `FLAGS.decay_method`, or when weight decay
        is requested for multi-core non-TPU training.
    """
    monitor_dict = {}

    print(FLAGS.weight_decay, "==weight_decay==")
    print(FLAGS.lr_layer_decay_rate, "==lr_layer_decay_rate==")
    print(FLAGS.use_wd_exclusion, "==use_wd_exclusion==")
    print(FLAGS.adam_correction, "==adam_correction==")

    ##### Configure optimizer
    global_step = tf.train.get_or_create_global_step()

    # Warmup the learning rate linearly
    if FLAGS.warmup_steps > 0:
        progress = (tf.cast(global_step, tf.float32) /
                    tf.cast(FLAGS.warmup_steps, tf.float32))
    else:
        progress = 1.0
    # Interpolates from min_lr_ratio (progress 0) to 1 (progress 1).
    curr_ratio = progress + (1.0 - progress) * FLAGS.min_lr_ratio
    warmup_lr = curr_ratio * FLAGS.learning_rate

    # Decay the learning rate
    if FLAGS.decay_method == "poly":
        decay_lr = tf.train.polynomial_decay(
            FLAGS.learning_rate,
            global_step=global_step - FLAGS.warmup_steps,
            decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
            end_learning_rate=FLAGS.learning_rate * FLAGS.min_lr_ratio)
    elif FLAGS.decay_method == "cos":
        decay_lr = tf.train.cosine_decay(
            FLAGS.learning_rate,
            global_step=global_step - FLAGS.warmup_steps,
            decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
            alpha=FLAGS.min_lr_ratio)
    else:
        raise ValueError(FLAGS.decay_method)

    # Warmup schedule before warmup_steps, decay schedule afterwards.
    learning_rate = tf.where(global_step < FLAGS.warmup_steps, warmup_lr,
                             decay_lr)

    if (FLAGS.weight_decay > 0 and not FLAGS.use_tpu
            and FLAGS.num_core_per_host > 1):
        raise ValueError("Do not support `weight_decay > 0` with multi-gpu "
                         "training so far.")

    if FLAGS.use_wd_exclusion:
        exclude_from_weight_decay = ["LayerNorm", "layer_norm", "bias"]
    else:
        exclude_from_weight_decay = []

    print(exclude_from_weight_decay, "==exclude_from_weight_decay==")

    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        beta_1=FLAGS.adam_beta1,
        beta_2=FLAGS.adam_beta2,
        epsilon=FLAGS.adam_epsilon,
        bias_correction=FLAGS.adam_correction,
        exclude_from_weight_decay=exclude_from_weight_decay,
        weight_decay_rate=FLAGS.weight_decay)

    if FLAGS.use_tpu:
        # With per-core clipping the clip happens locally below, so `clip`
        # is not passed to the cross-shard optimizer.
        if FLAGS.per_core_clip:
            optimizer = tpu_optimizer.CrossShardOptimizer(
                optimizer, skip_nan_grad=FLAGS.skip_nan_grad)
        else:
            optimizer = tpu_optimizer.CrossShardOptimizer(
                optimizer, skip_nan_grad=FLAGS.skip_nan_grad, clip=FLAGS.clip)

    ##### Compute gradient
    variables = tf.trainable_variables()
    gradients = tf.gradients(total_loss, variables)

    if FLAGS.clip > 0 and FLAGS.per_core_clip:
        tf.logging.info("Clip local gradient with norm %.3f.", FLAGS.clip)
        clipped, local_gnorm = tf.clip_by_global_norm(gradients, FLAGS.clip)
    else:
        tf.logging.info("Do not clip local gradient.")
        clipped = list(gradients)
        local_gnorm = tf.linalg.global_norm(gradients)

    # layer-wise learning rate decay
    if FLAGS.lr_layer_decay_rate != 1.0:

        def _get_layer_id(name):
            # Input variables are layer 0; encoder/decoder layer k is k + 1;
            # anything else has no layer id.
            if "model/input" in name:
                return 0
            m = re.search(r"model/(encoder|decoder)/layer_(\d+?)/", name)
            if not m:
                return None
            return int(m.group(2)) + 1

        # First pass: find the number of layers (largest layer id + 1).
        n_layer = 0
        for i in range(len(clipped)):
            layer_id = _get_layer_id(variables[i].name)
            if layer_id is None:
                continue
            n_layer = max(n_layer, layer_id + 1)

        # Second pass: scale each gradient by rate ** (distance from the top).
        for i in range(len(clipped)):
            layer_id = _get_layer_id(variables[i].name)
            if layer_id is not None:
                abs_rate = FLAGS.lr_layer_decay_rate**(n_layer - 1 - layer_id)
                tf.logging.info("Apply mult %.4f to the grad of %s", abs_rate,
                                variables[i].name)
                if isinstance(clipped[i], tf.IndexedSlices):
                    clipped[i] = tf.IndexedSlices(clipped[i].values * abs_rate,
                                                  clipped[i].indices,
                                                  clipped[i].dense_shape)
                else:
                    # NOTE(review): a None gradient would fail here -- confirm
                    # every layered variable receives a gradient.
                    clipped[i] *= abs_rate
            else:
                tf.logging.info("Grad of %s is not decayed.",
                                variables[i].name)

    grad_and_vars = list(zip(clipped, variables))

    monitor_dict["local_gnorm"] = local_gnorm
    monitor_dict["learning_rate"] = learning_rate

    return optimizer, grad_and_vars, global_step, monitor_dict
def _resource_apply_sparse(self, grad, handle, indices):
    """Apply a sparse gradient by converting it to a dense tensor first.

    The (grad, indices) pair is wrapped in `tf.IndexedSlices` with the shape
    of `handle`, densified, and passed to `_resource_apply_dense`.
    """
    sparse_grad = tf.IndexedSlices(grad, indices, tf.shape(handle))
    dense_grad = tf.convert_to_tensor(sparse_grad)
    return self._resource_apply_dense(dense_grad, handle)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulate gradients and apply them every `self._grad_steps` calls.

    Every call adds `grad / self._grad_steps` to the 'grad_accum' slot of
    each variable. On TPU (`self._use_tpu`), a `tf.cond` on `self._counter`
    decides whether the accumulated gradients are applied via `self._opt`
    and the accumulators zeroed; `global_step` is incremented in both
    branches and `self._counter` once per call. Otherwise the decision is
    delegated to `self._maybe_apply_grads_and_zero` through `merge_call`.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs.
        global_step: Step variable; it is passed to `tf.assign_add` on the
            TPU path, so a real variable is required there despite the None
            default.
        name: Unused in this implementation.

    Returns:
        On TPU, a grouped op of the conditional step and the counter
        increment; otherwise the value returned by `merge_call`.
    """
    grad_list = []
    var_list = []
    for g, v in grads_and_vars:
        grad_list.append(g)
        var_list.append(v)

    with tf.init_scope():
        self._create_slots(var_list)

    # accumulate gradients
    # `accums` collects the results of `accum.assign(...)`, not the slots.
    accums = []
    for g, v in zip(grad_list, var_list):
        accum = self.get_slot(v, 'grad_accum')
        # pytype: disable=attribute-error
        if isinstance(g, tf.IndexedSlices):
            scaled_grad = tf.IndexedSlices(g.values / self._grad_steps,
                                           g.indices,
                                           dense_shape=g.dense_shape)
            accums.append(
                accum.assign(
                    self._sharding(accum.read_value()) + scaled_grad))
        else:
            accums.append(
                accum.assign(
                    self._sharding(accum.read_value()) +
                    g / self._grad_steps))
        # pytype: enable=attribute-error

    if self._use_tpu:

        def _apply_and_zero_tpu2():
            normalized_accums = accums
            if self._apply_crs_to_grad:
                # Sum the accumulated values across replicas before applying.
                # NOTE(review): `read_value()` is called on the assign
                # results; presumably these behave like the slot variables
                # -- confirm.
                normalized_accums = [
                    tf.tpu.cross_replica_sum(accum.read_value())
                    for accum in accums
                ]
            apply_op = self._opt.apply_gradients(
                list(zip(normalized_accums, var_list)))
            # Zero the accumulators only after the update has been applied.
            with tf.control_dependencies([apply_op]):
                zero_op = [
                    tf.assign(accum, tf.zeros_like(accum)) for accum in accums
                ]
            return tf.group(zero_op, tf.assign_add(global_step, 1))

        def _accum_tpu2():
            # Nothing to apply on this call; only advance global_step.
            return tf.group(tf.no_op(), tf.assign_add(global_step, 1))

        accum_step = tf.cond(
            tf.equal(tf.mod(self._counter, self._grad_steps),
                     self._grad_steps - 1), _apply_and_zero_tpu2, _accum_tpu2)

        with tf.control_dependencies([tf.group(accums)]):
            return tf.group(accum_step, tf.assign_add(self._counter, 1))

    # for GPUs, use merge_call outside tf.cond to avoid issues
    with tf.control_dependencies([tf.group(accums)]):
        merge_return = tf.distribute.get_replica_context().merge_call(
            self._maybe_apply_grads_and_zero,
            args=(global_step, accums, var_list))
    return merge_return