def call(self, inputs, training=None, mask=None):
    batch_size = tf.shape(inputs)[0]
    W_3d = tf.tile(tf.expand_dims(self.W, axis=0),
                   tf.stack([batch_size, 1, 1]))
    # [batch_size, steps, features]
    input_projection = tf.matmul(inputs, W_3d)
    if self.use_bias:
        input_projection += self.b
    input_projection = tf.tanh(input_projection)
    # [batch_size, steps, 1]
    similarities = tf.reduce_sum(
        tf.multiply(input_projection, self.attention_context_vector),
        axis=2,
        keep_dims=True)
    # [batch_size, steps, 1]
    if mask is not None:
        attention_weights = masked_softmax(similarities, mask, axis=1)
    else:
        attention_weights = tf.nn.softmax(similarities, axis=1)
    # [batch_size, features]
    attention_output = tf.reduce_sum(tf.multiply(inputs, attention_weights), axis=1)
    return attention_output
def grad_variance(self):
    grad_var_ops = []
    tensor_to_avg = []
    for t, g in zip(self._tvars, self._grads):
        if isinstance(g, ops.IndexedSlices):
            tensor_to_avg.append(
                tf.reshape(
                    tf.unsorted_segment_sum(g.values, g.indices, g.dense_shape[0]),
                    shape=t.get_shape()))
        else:
            tensor_to_avg.append(g)
    avg_op = self._moving_averager.apply(tensor_to_avg)
    grad_var_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
        self._grad_avg = [
            self._moving_averager.average(val) for val in tensor_to_avg
        ]
        self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
    self._grad_var = tf.maximum(
        tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
        self._grad_norm_squared_avg -
        tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
    if self._sparsity_debias:
        self._grad_var *= self._sparsity_avg
    return grad_var_ops
def focal_loss(logits, labels, alpha, gamma=2, name='focal_loss'):
    """
    Focal loss for multi-class classification.
    :param logits: A float32 tensor of shape [batch_size, num_class].
    :param labels: An int32 tensor of shape [batch_size, num_class] or [batch_size].
    :param alpha: A 1D float32 tensor of per-class weights (the focal loss alpha hyper-parameter).
    :param gamma: A scalar for the focal loss gamma hyper-parameter.
    Returns:
        A scalar loss tensor, summed over the batch.
    """
    if len(labels.shape) == 1:
        labels = tf.one_hot(labels, logits.shape[-1])
    labels = tf.to_float(labels)
    y_pred = tf.nn.softmax(logits, dim=-1)
    L = -labels * tf.log(y_pred)
    L *= alpha * ((1 - y_pred)**gamma)
    loss = tf.reduce_sum(L)
    if tf.executing_eagerly():
        tf.contrib.summary.scalar(name, loss)
    else:
        tf.summary.scalar(name, loss)
    return loss
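# A minimal usage sketch for focal_loss, assuming TensorFlow 1.x graph mode and
# that `alpha` carries one weight per class; the tensor values are illustrative.
def _focal_loss_example():
    logits = tf.constant([[2.0, 0.5, -1.0],
                          [0.1, 1.2, 0.3]])            # [batch_size, num_class]
    labels = tf.constant([0, 2], dtype=tf.int32)       # sparse labels, [batch_size]
    alpha = tf.constant([0.25, 0.5, 0.25])             # per-class weights
    loss = focal_loss(logits, labels, alpha, gamma=2)  # scalar, summed over the batch
    with tf.Session() as sess:
        print(sess.run(loss))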
def split_one_doc_to_true_len_sens(doc_t, split_token, padding_token, max_doc_len, max_sen_len):
    """
    Split a document into sentences with true sentence lengths.
    doc_t: [doc_word_len]
    out_t: [max_doc_len, max_sen_len]
    """
    if len(doc_t.get_shape()) == 1:
        split_token_index = tf.squeeze(tf.where(tf.equal(doc_t, split_token)), axis=1)
        split_token_index.set_shape([None])
        split_len_part_1 = split_token_index[:1] + 1
        split_len_part_2 = split_token_index[1:] - split_token_index[:-1]
        split_lens = tf.concat([split_len_part_1, split_len_part_2], axis=0)
        split_lens = cut_or_padding(split_lens, max_doc_len, padding_token=padding_token)
        new_doc_len = tf.reduce_sum(split_lens)
        split_sentences = tf.split(doc_t[:new_doc_len], split_lens)
        split_sentences = [cut_or_padding(s, max_sen_len) for s in split_sentences]
        out_t = tf.stack(split_sentences)
        padding_tokens = tf.multiply(tf.ones_like(out_t, dtype=tf.int32), padding_token)
        out_t = tf.where(tf.equal(out_t, split_token), padding_tokens, out_t)
        return out_t

    raise ValueError("doc_t should be a tensor with rank 1.")
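# A small sketch of splitting a flat word-id document on a sentence delimiter,
# assuming split_token=2, padding_token=0, and that `cut_or_padding` pads with
# zeros by default; the word ids are illustrative.
def _split_doc_example():
    doc = tf.constant([4, 5, 2, 6, 2, 7, 8, 9, 2])   # three sentences, each ending in id 2
    sens = split_one_doc_to_true_len_sens(
        doc, split_token=2, padding_token=0, max_doc_len=4, max_sen_len=5)
    # sens has shape [4, 5]; delimiter and padding positions are filled with 0
    return sens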
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T value - time size of the RNN layer
    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(
        name='W_omega',
        initializer=tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(
        name='b_omega',
        initializer=tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.get_variable(
        name='u_omega',
        initializer=tf.random_normal([attention_size, 1], stddev=0.1))

    # Applying a fully connected layer with non-linear activation to each of the B*T time steps;
    # the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)
    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')

    # For each of the time steps, its vector of size A from `v` is reduced with the `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)  # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with the attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
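# Sketch of wiring `attention` onto bidirectional RNN outputs, assuming
# TensorFlow 1.x graph mode; batch, time, and hidden sizes are illustrative.
def _attention_example():
    fw_out = tf.random_normal([8, 20, 64])   # (B, T, D) forward outputs
    bw_out = tf.random_normal([8, 20, 64])   # (B, T, D) backward outputs
    output, alphas = attention((fw_out, bw_out),
                               attention_size=128,
                               return_alphas=True)
    # output: (8, 128) pooled representation; alphas: (8, 20, 1) attention weights
    return output, alphas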
def compute_doc_lens(sen_lens):
    """
    Count how many sentences are in a document.
    inputs: [..., time_steps]
    doc_lens: [...]
    """
    x_binary = tf.cast(tf.cast(sen_lens, tf.bool), tf.int32)
    doc_lens = tf.reduce_sum(x_binary, axis=-1)
    return doc_lens
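# Sketch: the number of non-empty sentences per document, computed from
# per-sentence word counts; the values are illustrative.
def _doc_lens_example():
    sen_lens = tf.constant([[3, 1, 0, 0],
                            [2, 0, 0, 0]])   # word counts per sentence
    return compute_doc_lens(sen_lens)        # -> [2, 1]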
def masked_softmax(logits, mask, axis):
    """Compute softmax with an input mask."""
    e_logits = tf.exp(logits)
    masked_e = tf.multiply(e_logits, mask)
    sum_masked_e = tf.reduce_sum(masked_e, axis, keep_dims=True)
    ones = tf.ones_like(sum_masked_e)
    # if the mask is all zeros, set the denominator to 1 to avoid division by zero
    sum_masked_e_safe = tf.where(tf.equal(sum_masked_e, 0), ones, sum_masked_e)
    return masked_e / sum_masked_e_safe
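# Sketch: masked_softmax over time steps, with the padded step receiving zero
# weight; shapes follow the [batch_size, steps, 1] convention used above.
def _masked_softmax_example():
    logits = tf.constant([[[1.0], [2.0], [3.0]]])   # [1, 3, 1]
    mask = tf.constant([[[1.0], [1.0], [0.0]]])     # last step is padding
    return masked_softmax(logits, mask, axis=1)     # weights sum to 1 over real steps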
def compute_sen_lens(inputs, padding_token=0):
    """
    Count how many words are in a sentence.
    inputs: [..., time_steps]
    sen_lens: [...]
    """
    x_binary = tf.cast(tf.not_equal(inputs, padding_token), tf.int32)
    sen_lens = tf.reduce_sum(x_binary, axis=-1)
    ones = tf.ones_like(sen_lens)
    # a sentence whose length equals utils.PAD_IDX (i.e. an all-padding row when
    # PAD_IDX is 0) is clamped to length 1
    sen_lens = tf.where(tf.equal(sen_lens, utils.PAD_IDX), x=ones, y=sen_lens)
    return sen_lens
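# Sketch: word counts per sentence with padding_token=0, assuming
# utils.PAD_IDX == 0 so that an all-padding row is clamped to length 1.
def _sen_lens_example():
    sentences = tf.constant([[4, 7, 9, 0, 0],
                             [3, 0, 0, 0, 0],
                             [0, 0, 0, 0, 0]])
    return compute_sen_lens(sentences)   # -> [3, 1, 1]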
def compute_lens(inputs, max_len):
    """Count sequence length.
    input: [batch_size, max_len]
    lens: [batch_size]
    """
    # reverse the sequence and locate the first non-zero entry from the end;
    # its index gives the amount of trailing padding
    x_binary = tf.cast(tf.cast(tf.reverse(inputs, axis=[1]), tf.bool), tf.int32)
    lens = max_len - tf.argmax(x_binary, axis=1, output_type=tf.int32)
    # an all-zero row has no non-zero entry, so its length is forced to 0
    zeros = tf.zeros_like(lens, dtype=tf.int32)
    x_sum = tf.reduce_sum(inputs, axis=1)
    sen_lens = tf.where(tf.equal(x_sum, 0), zeros, lens)
    return sen_lens
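# Sketch: sequence lengths from right-padded id matrices, assuming padding id 0;
# the ids are illustrative.
def _compute_lens_example():
    inputs = tf.constant([[5, 3, 8, 0, 0],
                          [2, 0, 0, 0, 0]])
    return compute_lens(inputs, max_len=5)   # -> [3, 1]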
def call(self, tensors):
    """Attention layer."""
    left, right = tensors
    len_left = left.shape[1]
    len_right = right.shape[1]
    tensor_left = tf.expand_dims(left, axis=2)
    tensor_right = tf.expand_dims(right, axis=1)
    tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
    tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
    tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
    middle_output = self.middle_layer(tensor_merged)
    attn_scores = self.attn(middle_output)
    attn_scores = tf.squeeze(attn_scores, axis=3)
    exp_attn_scores = tf.exp(
        attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
    exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
    attention_weights = exp_attn_scores / exp_sum
    return tf.matmul(attention_weights, right)
def before_apply(self):
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0
    before_apply_ops = []

    # get per var g**2 and norm**2
    self._grad_squared = []
    self._grad_norm_squared = []
    for v, g in zip(self._tvars, self._grads):
        if g is None:
            continue
        with ops.colocate_with(v):
            self._grad_squared.append(tf.square(g))
    self._grad_norm_squared = [
        tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
    ]

    if self._sparsity_debias:
        avg_op_sparsity = self.grad_sparsity()
        before_apply_ops.append(avg_op_sparsity)

    # the following running average on squared norm of gradient is shared
    # by `grad_variance` and `dist_to_opt`
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
        self._grad_norm_squared_avg = [
            self._moving_averager.average(val) for val in self._grad_norm_squared
        ]
        self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
        self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    before_apply_ops.append(avg_op)

    with tf.control_dependencies([avg_op]):
        curv_range_ops = self.curvature_range()
        before_apply_ops += curv_range_ops
        grad_var_ops = self.grad_variance()
        before_apply_ops += grad_var_ops
        dist_to_opt_ops = self.dist_to_opt()
        before_apply_ops += dist_to_opt_ops
    return tf.group(*before_apply_ops)
def exclude_padding(self, batch):
    # number of non-padding tokens in the (rank-1) batch
    x_binary = tf.cast(tf.not_equal(batch, utils.PAD_IDX), tf.int32)
    sen_lens = tf.reduce_sum(x_binary, axis=-1)
    # keep only the non-padding prefix
    return batch[:sen_lens]