def _get_coordinatewise_learning_rate(self, grad, var):
    """Compute the learning rate for `var` from running gradient moments.

    Reads the 'first_moment' and 'second_moment' slots of `var`, updates them
    from `grad`, and uses the clipped second-moment estimate (scaled by the
    batch size) as a preconditioner for the diagonal of BB^T.

    Args:
      grad: Gradient for `var`; either a `tf.Tensor` or a `tf.IndexedSlices`.
      var: Variable whose moment slots are read and updated.

    Returns:
      `2 * batch_size / (total_num_examples * preconditioner)`. For
      `tf.IndexedSlices` gradients the preconditioner only covers the entries
      selected by `grad.indices`. When `self._use_single_learning_rate` is
      set, the preconditioner is first reduced to its mean over all elements.

    Raises:
      tf.errors.InvalidArgumentError: If `grad` is neither a `tf.Tensor` nor
        a `tf.IndexedSlices`.
    """
    avg_first = self.get_slot(var, 'first_moment')
    avg_second = self.get_slot(var, 'second_moment')
    decay_tensor = tf.cast(self._decay_tensor, var.dtype)
    batch_size = tf.cast(self._batch_size_tensor, var.dtype)

    # Create an estimator for the moving average of gradient mean and variance
    # via Welford's algorithm.
    if isinstance(grad, tf.Tensor):
        delta = grad - avg_first
        # While `iterations < 1` the whole delta is added; afterwards the
        # delta is scaled by (1 - decay).
        first_moment_update = avg_first.assign_add(
            delta * tf1.where(
                self.iterations < 1,
                tf.cast(1, var.dtype),
                1. - decay_tensor))

        # NOTE(review): the cast below is 1 only while `iterations < 1` and 0
        # afterwards, so as written this increment is zero on every later
        # step -- confirm that this gating is intended.
        with tf.control_dependencies([first_moment_update]):
            second_moment_update = avg_second.assign_add(
                tf.cast(self.iterations < 1, var.dtype)
                * -(1. - decay_tensor)
                * (avg_second - decay_tensor * tf.square(delta)))
        # Only read the (clipped) second moment once its update has run.
        diag_preconditioner = distribution_util.with_dependencies(
            [second_moment_update],
            tf.clip_by_value(avg_second, 1e-12, 1e12))
    elif isinstance(grad, tf.IndexedSlices):
        delta = grad.values - tf.gather_nd(avg_first, grad.indices)
        first_moment_update = tf1.scatter_add(
            avg_first, grad.indices,
            delta * tf1.where(
                self.iterations < 1,
                tf.cast(1., var.dtype),
                1. - decay_tensor))

        # NOTE(review): same `iterations < 1` gating as in the dense branch.
        with tf.control_dependencies([first_moment_update]):
            avg_second = tf1.scatter_add(
                avg_second, grad.indices,
                tf.cast(self.iterations < 1, var.dtype)
                * -(1. - decay_tensor)
                * (tf.gather_nd(avg_second, grad.indices)
                   - decay_tensor * tf.square(delta)))
            # Keep only the entries addressed by this sparse gradient.
            avg_second = tf.gather_nd(avg_second, grad.indices)
            # TODO(b/70783772): Needs dtype specific clipping.
            diag_preconditioner = tf.clip_by_value(avg_second, 1e-12, 1e12)
    else:
        raise tf.errors.InvalidArgumentError(
            None, None, 'grad must be of type Tensor or IndexedSlices')

    diag_preconditioner *= batch_size

    if self._use_single_learning_rate:
        diag_preconditioner = tf.reduce_mean(
            input_tensor=diag_preconditioner)

    # From Theorem 2 Corollary 1 of Mandt et al. 2017
    return 2. * batch_size / (
        tf.cast(self._total_num_examples, var.dtype.base_dtype)
        * diag_preconditioner)
def _apply_sparse(self, grad, var): return self._apply_sparse_shared( grad.values, var, grad.indices, lambda x, i, v: tf.scatter_add( # pylint: disable=g-long-lambda x, i, v, use_locking=self._use_locking))
def _sparse_moving_average(self, x_tm1, idxs, a_t_, name, beta=.9):
    """Update a moving average on the entries of an accumulator picked by `idxs`.

    Two accumulators attached to `x_tm1` are used: the average itself (under
    `name`) and a step counter (under '<name>/tm1') whose shape is the
    variable's leading dimension followed by ones. The counter entries at
    `idxs` are incremented by one, and the average entries at `idxs` become
    `w * old + (1 - w) * a_t_`, where

      * `w = beta * (1 - beta**(t - 1)) / (1 - beta**t)` if `beta < 1`,
      * `w = (t - 1) / t` otherwise,

    with `t` the incremented counter value for that entry.

    Args:
      x_tm1: Variable whose accumulators are looked up.
      idxs: Indices of the entries to update.
      a_t_: New values for the selected entries.
      name: Accumulator name.
      beta: Decay rate (a Python number compared against 1).

    Returns:
      A tuple `(b_t, t)` holding the results of the final scatter op on the
      average accumulator and of the scatter op on the counter accumulator.
    """
    average = self.get_accumulator(x_tm1, '%s' % name)
    old_rows = tf.gather(average, idxs)

    var_shape = self.get_variable_shape(x_tm1)
    counter_shape = [var_shape[0]] + [1] * (len(var_shape) - 1)
    counter = self.get_accumulator(
        x_tm1, '%s/tm1' % name, shape=counter_shape)
    prev_steps = tf.gather(counter, idxs)
    new_counter = tf.scatter_add(counter, idxs, tf.ones_like(prev_steps))
    steps = tf.gather(new_counter, idxs)

    if beta < 1:
        decay = tf.convert_to_tensor(beta, name='%s/decay' % name)
        keep_weight = decay * (1 - decay**prev_steps) / (1 - decay**steps)
    else:
        keep_weight = prev_steps / steps

    # First overwrite the selected entries with the down-weighted old values,
    # then add the weighted new values on top.
    new_average = tf.scatter_update(average, idxs, keep_weight * old_rows)
    new_average = tf.scatter_add(
        new_average, idxs, (1 - keep_weight) * a_t_)
    return new_average, new_counter
def update_contextual_features(contextual_features, indices, updates,
                               flattened_idx_offset):
    """Scatter-add `updates` into `contextual_features` at flattened indices.

    `indices` is split into two equal parts along axis 1; the parts are added
    element-wise, all size-1 dimensions are squeezed away, and
    `flattened_idx_offset` is added to obtain the target indices.

    Args:
      contextual_features: Target passed to `tf.scatter_add`.
      indices: Index tensor whose axis 1 is split in two
        (presumably shape (N, 2) -- TODO confirm against callers).
      updates: Values to add at the computed indices.
      flattened_idx_offset: Offset added to the combined indices.

    Returns:
      The result of `tf.scatter_add`.
    """
    first_part, second_part = tf.split(
        indices, num_or_size_splits=2, axis=1)
    flat_indices = tf.squeeze(first_part + second_part) + flattened_idx_offset
    return tf.scatter_add(
        contextual_features, flat_indices, updates, use_locking=None)
def force_ext_ellipsoid_idx_multi(links, idx):
    """Accumulate pairwise link forces based on a metric-derived distance.

    Side effects: stores the per-pair force tensor in `links.fmat0` and its
    sum over axis 0 in `links.Force_LL_Ell`.

    Args:
      links: Object providing `points`, `amplitude`, `net`, `get_metrics`
        and `get_dr2`.
      idx: Indices of the points to process.

    Returns:
      The result of scatter-adding `links.Force_LL_Ell` into
      `links.net.f_link` at `idx`.
    """
    # Only the first value returned by get_metrics is used here.
    metric, _ = links.get_metrics(idx)
    positions = tf.gather(links.points, idx)
    sq_dist = links.get_dr2(positions, metric)
    amplitude = links.amplitude

    # Pairwise separation vectors and their unit directions; the small
    # epsilon avoids dividing by zero for coincident points.
    separations = positions - positions[:, newaxis]
    unit_dirs = separations / (
        tf.norm(separations, axis=-1, keepdims=True) + 1e-15)

    half_pow = links.net.POW / 2.0
    radial = sq_dist**(half_pow - 1) * tf.exp(-(sq_dist**half_pow))
    links.fmat0 = amplitude * unit_dirs * radial

    links.Force_LL_Ell = tf.reduce_sum(links.fmat0, 0)
    return tf.scatter_add(links.net.f_link, idx, links.Force_LL_Ell)
def force_node_repel_idx(nodes, idx):
    """Accumulate pairwise node-node repulsion for the nodes picked by `idx`.

    Try out different forces, long- and short-range. With `th` the pairwise
    sum of the selected `nodes.r0` values and `s = |r| / th`, each pair
    contributes
    `th**POW_SN * amplitude * r * s**(POWn - 2) * exp(-s**POWn)`,
    where `r` is the pairwise separation vector.

    Side effects: stores the forces summed over axis 0 in `nodes.Force_NN`.

    Args:
      nodes: Object providing `r0`, `points`, `amplitude` and `net`.
      idx: Indices of the nodes to process.

    Returns:
      The result of scatter-adding `nodes.Force_NN` into `nodes.net.f_node`
      at `idx`.
    """
    radii = tf.gather(nodes.r0, idx)
    pair_thresh = radii + radii[:, newaxis]
    amplitude = nodes.amplitude

    positions = tf.gather(nodes.points, idx)
    separations = positions - positions[:, newaxis, :]
    # vec_len is defined elsewhere; presumably the norm over the last axis.
    distances = vec_len(separations)

    scaled_dist = distances / pair_thresh
    radial = tf.expand_dims(
        scaled_dist**(nodes.net.POWn - 2)
        * tf.exp(-(scaled_dist**nodes.net.POWn)),
        2,
    )
    pair_forces = (pair_thresh[:, :, newaxis]**nodes.net.POW_SN
                   * amplitude * separations * radial)

    nodes.Force_NN = tf.reduce_sum(pair_forces, 0)
    return tf.scatter_add(nodes.net.f_node, idx, nodes.Force_NN)
def _apply_sparse_shared(self, grad_values, grad_indices, var):
    """Apply a sparse gradient to `var` using the SM3 row accumulator.

    Args:
      grad_values: Gradient values for the entries selected by
        `grad_indices`.
      grad_indices: Indices along the first dimension of `var`.
      var: Variable to update.

    Returns:
      The `tf.scatter_sub` op that subtracts the scaled gradient from `var`,
      ordered after the accumulator update.
    """
    shape = np.array(var.get_shape())
    var_rank = len(shape)
    # For sparse case, we only update the accumulator representing the sparse
    # dimension. In this case SM3 is similar to isotropic adagrad but with
    # better bound (due to the max operator).
    #
    # We do not use the column accumulator because it will be updated for
    # every gradient step and will significantly overestimate the gradient
    # square. While, the row accumulator can take advantage of the sparsity
    # in the gradients. Even if one implements the column accumulator - it
    # will result in a no-op because the row accumulators will have lower
    # values.
    #
    # Note that: We do not run this code path for our experiments in our
    # paper as on TPU all the sparse gradients are densified.
    if var_rank > 1:
        accumulator_var = self.get_slot(var, "accumulator_" + str(0))
        accumulator = tf.gather(accumulator_var, grad_indices)
        # Reshape the gathered accumulator to [k, 1, ..., 1] so it
        # broadcasts against grad_values.
        shape_for_broadcasting = tf.concat(
            [[tf.shape(accumulator)[0]], [1] * (var_rank - 1)], 0)
        accumulator = tf.reshape(accumulator, shape_for_broadcasting)
        accumulator += grad_values * grad_values
    else:
        accumulator_var = self.get_slot(var, "accumulator")
        updated_accumulator = tf.scatter_add(
            accumulator_var, grad_indices, grad_values * grad_values)
        # scatter_add yields the whole accumulator; pick out the entries
        # addressed by grad_indices so that they line up element-wise with
        # grad_values below.
        accumulator = tf.gather(updated_accumulator, grad_indices)

    # The tiny epsilon keeps rsqrt finite when the accumulator is zero.
    accumulator_inv_sqrt = tf.rsqrt(accumulator + 1e-30)
    scaled_g = grad_values * accumulator_inv_sqrt

    updates = []
    with tf.control_dependencies([scaled_g]):
        if var_rank > 1:
            # Collapse every non-leading axis with a max and write the
            # result back to the accumulator entries.
            axes = list(range(1, var_rank))
            new_accumulator = tf.reduce_max(accumulator, axis=axes)
            updates = [
                tf.scatter_update(
                    accumulator_var, grad_indices, new_accumulator)
            ]
    with tf.control_dependencies(updates):
        return tf.scatter_sub(
            var, grad_indices, self._learning_rate_tensor * scaled_g)
def forces_ext_brute_idx_multi(links, idx):
    """Accumulate brute-force pairwise link-link forces for points `idx`.

    All pairs among the selected points are considered. With `th` the
    pairwise sum of the selected `links.thickness` values and
    `s = |r| / th`, each pair contributes
    `amplitude * r * s**(POW - 2) / th * exp(-s**POW) * mask`,
    where `r` is the pairwise separation vector and `mask` comes from
    `links.link_self_mask_multi(idx)`.

    Side effects: stores the separation vectors in `links.r` and the forces
    summed over axis 0 in `links.Force_LL`.

    Args:
      links: Object providing `points`, `thickness`, `amplitude`, `net` and
        `link_self_mask_multi`.
      idx: Indices of the points to process.

    Returns:
      The result of scatter-adding `links.Force_LL` into `links.net.f_link`
      at `idx`.
    """
    positions = tf.gather(links.points, idx)  # all possible seg pairs
    thickness = tf.gather(links.thickness, idx)
    pair_thickness = thickness + thickness[:, newaxis]
    amplitude = links.amplitude

    links.r = positions - positions[:, newaxis]
    # vec_len is defined elsewhere; presumably the norm over the last axis.
    distances = vec_len(links.r)

    scaled_dist = distances / pair_thickness
    # !!! must exclude pairs on same edge, otherwise edge won't contract;
    # that is what the mask factor is for.
    radial = (scaled_dist**(links.net.POW - 2) / pair_thickness
              * tf.exp(-(scaled_dist**links.net.POW))
              * links.link_self_mask_multi(idx))
    pair_forces = amplitude * links.r * radial[:, :, newaxis]

    links.Force_LL = tf.reduce_sum(pair_forces, 0)
    return tf.scatter_add(links.net.f_link, idx, links.Force_LL)
def _resource_scatter_add(self, x, i, v):
    """Scatter-add `v` into `x` at indices `i` and return the value of `x`.

    Args:
      x: Variable exposing `.handle` and `.value()`.
      i: Indices to update.
      v: Values to add.

    Returns:
      `x.value()`, read under a control dependency on the scatter-add op.
    """
    # NOTE(review): the scatter op is given `x.handle` rather than `x`
    # itself -- confirm the TensorFlow version in use accepts a handle here.
    scatter_op = tf.scatter_add(x.handle, i, v)
    with tf.control_dependencies([scatter_op]):
        updated_value = x.value()
    return updated_value