import numpy as np
import tensorflow as tf

EPS = 1e-6  # small constant to avoid log(0) / division by zero; value assumed


def curvature_range(self):
    """Track h_min and h_max over a sliding window of gradient-norm stats."""
    # Set up the curvature window as a circular buffer.
    self._curv_win = tf.Variable(
        np.zeros([self._curv_win_width, ]),
        dtype=tf.float32,
        name="curv_win",
        trainable=False)
    # We can use log smoothing for the curvature range to follow the trend faster:
    # self._curv_win = tf.scatter_update(
    #     self._curv_win, self._global_step % self._curv_win_width,
    #     tf.log(self._grad_norm_squared + EPS))
    self._curv_win = tf.scatter_update(
        self._curv_win, self._global_step % self._curv_win_width,
        self._grad_norm_squared + EPS)
    # Note: iterations start from iteration 0, so at step t the first
    # min(window_width, t + 1) slots hold valid measurements.
    valid_window = tf.slice(
        self._curv_win,
        tf.constant([0, ]),
        tf.expand_dims(
            tf.minimum(tf.constant(self._curv_win_width),
                       self._global_step + 1),
            dim=0))

    if self._h_min_log_smooth:
        self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
    else:
        self._h_min_t = tf.reduce_min(valid_window)
    if self._h_max_log_smooth:
        self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
    else:
        self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t]):
        avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
        with tf.control_dependencies([avg_op]):
            # If the extrema were smoothed in log space, map the moving
            # averages back with exp.
            if self._h_min_log_smooth:
                self._h_min = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_min_t)))
            else:
                self._h_min = tf.identity(
                    self._moving_averager.average(self._h_min_t))
            if self._h_max_log_smooth:
                self._h_max = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_max_t)))
            else:
                self._h_max = tf.identity(
                    self._moving_averager.average(self._h_max_t))
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
    curv_range_ops.append(avg_op)
    return curv_range_ops
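# A minimal NumPy sketch of the circular curvature window above, assuming the
# same step % window_width indexing; the names below are illustrative and not
# part of the optimizer class. It shows how h_min / h_max are read from only
# the valid prefix of the buffer during the first window_width steps.
def curvature_window_demo(grad_norms_squared, curv_win_width=20):
    curv_win = np.zeros(curv_win_width)
    h_min = h_max = None
    for step, g2 in enumerate(grad_norms_squared):
        curv_win[step % curv_win_width] = g2  # overwrite the oldest slot
        # Only the first min(width, step + 1) slots hold real measurements.
        valid = curv_win[:min(curv_win_width, step + 1)]
        h_min, h_max = valid.min(), valid.max()
    return h_min, h_max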
def masked_softmax(logits, mask, axis):
    """Compute softmax with an input mask."""
    e_logits = tf.exp(logits)
    masked_e = tf.multiply(e_logits, mask)
    sum_masked_e = tf.reduce_sum(masked_e, axis, keep_dims=True)
    ones = tf.ones_like(sum_masked_e)
    # If the mask is all zeros along `axis`, the sum is zero; substitute 1 as
    # the denominator in that case to avoid division by zero.
    sum_masked_e_safe = tf.where(tf.equal(sum_masked_e, 0), ones, sum_masked_e)
    return masked_e / sum_masked_e_safe
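# Hedged usage sketch for masked_softmax, assuming a TF 1.x session-based
# runtime (keep_dims above is TF 1.x API); the demo values are illustrative.
logits_demo = tf.constant([[2.0, 1.0, 0.1]])
mask_demo = tf.constant([[1.0, 1.0, 0.0]])  # third position is padding
probs_demo = masked_softmax(logits_demo, mask_demo, axis=-1)
with tf.Session() as sess:
    print(sess.run(probs_demo))  # masked position gets probability 0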
def call(self, tensors):
    """Attention layer."""
    left, right = tensors
    len_left = left.shape[1]
    len_right = right.shape[1]
    # Pair every left position with every right position by tiling both
    # tensors to shape [batch, len_left, len_right, dim].
    tensor_left = tf.expand_dims(left, axis=2)
    tensor_right = tf.expand_dims(right, axis=1)
    tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
    tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
    tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
    middle_output = self.middle_layer(tensor_merged)
    attn_scores = self.attn(middle_output)
    attn_scores = tf.squeeze(attn_scores, axis=3)
    # Numerically stable softmax over the right-hand positions: subtract the
    # row max before exponentiating.
    exp_attn_scores = tf.exp(
        attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
    exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
    attention_weights = exp_attn_scores / exp_sum
    return tf.matmul(attention_weights, right)
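# Minimal sketch of the max-subtraction trick used in `call` above: softmax is
# invariant to shifting all logits by a constant, so subtracting the row max
# prevents overflow in tf.exp. The values below are illustrative only.
big_scores = tf.constant([[1000.0, 1001.0, 999.0]])  # naive tf.exp overflows
shifted = big_scores - tf.reduce_max(big_scores, axis=-1, keepdims=True)
stable_weights = tf.exp(shifted) / tf.reduce_sum(
    tf.exp(shifted), axis=-1, keepdims=True)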