def MoS(x, hidden_size, vocab_size, n_experts=10):
    '''
    a Mixture-of-Softmax operator, as specified in arXiv:1711.03953
    relatively untested

    x           - tensor, dtype = tf.float32, shape = [..., hidden_size]
    hidden_size - the final dimension of x
    vocab_size  - the size of the output vocabulary over which the softmax is calculated
    n_experts   - the number of experts, int
    '''
    batch_size = tf.shape(x)[0]
    sequence_size = tf.shape(x)[1]
    with tf.variable_scope('latent'):
        latent = utils.dense(x, output_dim=n_experts * hidden_size, name='latent')
        latent = tf.nn.tanh(latent)
    with tf.variable_scope('decoder'):
        latent = tf.reshape(latent, [-1, hidden_size])
        logit = utils.dense(latent, output_dim=vocab_size, name='decoder')
    with tf.variable_scope('prior'):
        prior_logit = utils.dense(x, output_dim=n_experts, name='prior')
        prior_logit = tf.reshape(prior_logit, [-1, n_experts])
        prior = tf.nn.softmax(prior_logit, axis=-1)
    prob = tf.reshape(
        tf.nn.softmax(tf.reshape(logit, [-1, vocab_size]), axis=-1),
        [-1, n_experts, vocab_size])
    prior = tf.expand_dims(prior, axis=2)
    prior = tf.tile(prior, [1, 1, vocab_size])
    prob = prob * prior
    prob = tf.reduce_sum(prob, axis=1)
    prob = tf.reshape(prob, [batch_size, sequence_size, vocab_size])
    return prob
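# A minimal usage sketch for MoS (illustrative only: the 512-dim hidden size,
# the 32000-token vocabulary and the placeholder are assumptions; `tf` and
# `utils` are the same TensorFlow 1.x helpers used throughout this file).
# Note that MoS returns normalized probabilities rather than logits, so a
# training loss would use tf.log(prob) instead of a logits-based cross-entropy.
#
#   x = tf.placeholder(tf.float32, shape=[None, None, 512])   # [batch, seq, hidden]
#   prob = MoS(x, hidden_size=512, vocab_size=32000, n_experts=10)
#   # prob: [batch, seq, 32000], summing to 1 over the vocabulary axis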
def transformer_ffd(self, x):
    x = utils.dense(x,
                    output_dim=self.arg.filter_size,
                    use_bias=True,
                    name='ffd_1')
    x = self.dropout_fn(x)
    if self.arg.use_relu:
        x = tf.nn.relu(x)
    else:
        x = utils.gelu(x)
    return utils.dense(x,
                       output_dim=self.arg.hidden_size,
                       use_bias=True,
                       name='ffd_2')
def __call__(self, inputs, state, scope=None, *args, **kwargs):
    with tf.variable_scope('attention'):
        hidden_with_time_axis = tf.expand_dims(state, axis=1)
        score = utils.dense(
            tf.nn.tanh(
                utils.dense(self.encoder_output,
                            output_dim=self.state_size,
                            name='W1') +
                utils.dense(hidden_with_time_axis,
                            output_dim=self.state_size,
                            name='W2')),
            output_dim=1,
            name='V')
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * self.encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        inputs = tf.concat([inputs, context_vector], axis=1)
    return self.cell.__call__(inputs, state, scope=scope, *args, **kwargs)
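# Hypothetical usage sketch for the wrapper above (the class name
# `AttentionCellWrapper`, its constructor and the surrounding tensors are
# assumptions; only __call__ is shown in this file). At each decoder step the
# wrapper scores the encoder outputs against the current state (Bahdanau-style
# additive attention) and concatenates the resulting context vector onto the
# cell input before delegating to the wrapped cell.
#
#   cell = AttentionCellWrapper(tf.nn.rnn_cell.GRUCell(256),
#                               encoder_output=encoder_output)   # [batch, src_len, 256]
#   outputs, final_state = tf.nn.dynamic_rnn(cell, decoder_inputs,
#                                            dtype=tf.float32)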
def body(x, i, halting_probability, remainders, n_updates):
    with tf.variable_scope('decoder_layer'):
        state = x
        x += self.timing_position(x)
        pondering = utils.dense(x, output_dim=1, name='pondering')
        pondering = tf.squeeze(pondering, axis=-1)
        pondering = tf.nn.sigmoid(pondering)
        update_weights, halting_probability, remainders, n_updates = act(
            pondering, halt_threshold, halting_probability, remainders,
            n_updates)
        with tf.variable_scope('attention'):
            y = utils.layer_norm(x)
            y = utils.multihead_attention(
                query=y,
                memory=None,
                bias=decoder_self_attention_bias,
                total_key_depth=self.arg.head_size * self.arg.num_heads,
                total_value_depth=self.arg.head_size * self.arg.num_heads,
                output_depth=self.arg.hidden_size,
                num_heads=self.arg.num_heads,
                deparameterize=self.arg.deparameterize,
                dropout_keep_prob=self.keep_prob,
                dropout_type=self.arg.dropout_type,
                relative_attention=self.arg.relative_attention,
                max_relative_position=self.arg.max_relative_position)
            y = self.dropout_fn(y)
            x += y
        with tf.variable_scope('encoder_attention'):
            y = utils.layer_norm(x)
            y = utils.multihead_attention(
                query=y,
                memory=memory,
                bias=encoder_decoder_attention_bias,
                total_key_depth=self.arg.head_size * self.arg.num_heads,
                total_value_depth=self.arg.head_size * self.arg.num_heads,
                output_depth=self.arg.hidden_size,
                num_heads=self.arg.num_heads,
                dropout_keep_prob=self.keep_prob,
                dropout_type=self.arg.dropout_type,
                relative_attention=False,
                max_relative_position=self.arg.max_relative_position)
            y = self.dropout_fn(y)
            x += y
        with tf.variable_scope('ffd'):
            y = utils.layer_norm(x)
            y = self.ffd(y)
            y = self.dropout_fn(y)
            x += y
        x = (x * update_weights) + (state * (1 - update_weights))
        return x, i + 1, halting_probability, remainders, n_updates
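# The `act` helper called in body() is not shown in this file. The sketch below
# is a hypothetical stand-in (named act_sketch to avoid clashing with the real
# helper) that follows the standard Adaptive Computation Time update used by
# the Universal Transformer; it assumes pondering and the bookkeeping tensors
# are [batch, seq] while x is [batch, seq, hidden], and that TensorFlow 1.x is
# imported as tf as elsewhere in this file.
def act_sketch(pondering, halt_threshold, halting_probability, remainders,
               n_updates):
    # positions that have not yet accumulated probability mass of 1
    still_running = tf.cast(tf.less(halting_probability, 1.0), tf.float32)
    # positions that halt at this step
    new_halted = tf.cast(
        tf.greater(halting_probability + pondering * still_running,
                   halt_threshold), tf.float32) * still_running
    # positions that remain running after this step
    still_running = tf.cast(
        tf.less_equal(halting_probability + pondering * still_running,
                      halt_threshold), tf.float32) * still_running
    halting_probability += pondering * still_running
    # give newly halted positions their leftover probability mass
    remainders += new_halted * (1 - halting_probability)
    halting_probability += new_halted * remainders
    n_updates += still_running + new_halted
    # weight of the new state: pondering while running, the remainder when
    # halting this step, 0 once halted
    update_weights = tf.expand_dims(
        pondering * still_running + new_halted * remainders, axis=-1)
    return update_weights, halting_probability, remainders, n_updates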
def SRU(x,
        num_layers=2,
        activation=None,
        initial_state=None,
        name=None,
        reuse=None,
        reuse_layer=False):
    '''
    SRU, as introduced in arXiv:1709.02755
    code based on tensor2tensor

    x - tensor, dtype = tf.float32, shape = [batch_size, sequence_size, hidden_size]
    '''
    with tf.variable_scope(name, default_name='SRU', reuse=reuse):
        tf_x_shape = tf.shape(x)
        x_shape = x.shape.as_list()
        x = tf.transpose(x, perm=[1, 0, 2], name='input_transpose')
        if initial_state is None:
            initial_state = tf.zeros([tf.shape(x)[1], tf.shape(x)[2]],
                                     dtype=x.dtype)
        for i in range(num_layers):
            with tf.variable_scope('layer_{}'.format(i + 1),
                                   reuse=(i != 0 and reuse_layer)):
                x_orig = x
                x, f, r = tf.split(utils.dense(x,
                                               output_dim=3 * x_shape[-1],
                                               name='dense'),
                                   num_or_size_splits=3,
                                   axis=-1)
                f, r = tf.sigmoid(f), tf.sigmoid(r)
                x_times_one_minus_f = x * (1.0 - f)
                c_states = tf.scan(next_state, (x_times_one_minus_f, f),
                                   initializer=initial_state,
                                   parallel_iterations=2,
                                   name='scan_{}'.format(i))
                if activation is not None:
                    c_states = activation(c_states)
                h = c_states * r + (1.0 - r) * x_orig
                x = h
        x = tf.transpose(x, perm=[1, 0, 2])
        return tf.reshape(x, tf_x_shape)
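# The `next_state` step function passed to tf.scan above is not shown here. A
# hedged sketch (named next_state_sketch so as not to shadow the real helper):
# it implements the SRU cell recurrence c_t = f_t * c_{t-1} + (1 - f_t) * x_t,
# where the term (1 - f_t) * x_t is pre-multiplied outside the scan and passed
# in as the first element of the elems tuple.
def next_state_sketch(cur_state, args_tup):
    cur_x_times_one_minus_f, cur_f = args_tup
    return cur_f * cur_state + cur_x_times_one_minus_f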
def decoder(self, inputs, memory, decoder_self_attention_bias,
            encoder_decoder_attention_bias):
    x = inputs
    if self.arg.adaptive_mask:
        self.decoder_l0 = []
    for layer in range(1, self.arg.decoder_layers + 1):
        with tf.variable_scope('layer_{}'.format(layer)):
            with tf.variable_scope('16_head_self_attention'):
                y = utils.layer_norm(x)
                left_state = utils.multihead_attention(
                    query=y,
                    memory=None,
                    bias=self.decoder_self_attention_bias,
                    total_key_depth=self.arg.head_size *
                    max(min(self.arg.num_heads * 2, 16), self.arg.num_heads),
                    total_value_depth=self.arg.head_size *
                    max(min(self.arg.num_heads * 2, 16), self.arg.num_heads),
                    output_depth=self.arg.hidden_size,
                    num_heads=max(min(self.arg.num_heads * 2, 16),
                                  self.arg.num_heads),
                    dropout_keep_prob=self.keep_prob,
                    dropout_type=self.arg.dropout_type,
                    name='self_attention',
                    relative_attention=self.arg.relative_attention,
                    max_relative_position=self.arg.max_relative_position,
                    adaptive_mask=self.arg.adaptive_mask,
                    dynamic_attention_span=self.arg.dynamic_attention_span)
                if self.arg.adaptive_mask:
                    self.decoder_l0.append(left_state[1])
                    left_state = left_state[0]
                right_state = utils.multihead_attention(
                    query=y,
                    memory=memory,
                    bias=self.encoder_decoder_attention_bias,
                    total_key_depth=self.arg.head_size * self.arg.num_heads,
                    total_value_depth=self.arg.head_size * self.arg.num_heads,
                    output_depth=self.arg.hidden_size,
                    num_heads=self.arg.num_heads,
                    dropout_keep_prob=self.keep_prob,
                    dropout_type=self.arg.dropout_type,
                    name='encoder_attention',
                    relative_attention=False,
                    max_relative_position=self.arg.max_relative_position,
                    adaptive_mask=self.arg.adaptive_mask,
                    dynamic_attention_span=self.arg.dynamic_attention_span)
                if self.arg.adaptive_mask:
                    self.decoder_l0.append(right_state[1])
                    right_state = right_state[0]
                x += self.dropout_fn(left_state) + self.dropout_fn(right_state)
            with tf.variable_scope('conv_branches'):
                y = utils.layer_norm(x)
                if self.arg.unidirectional_decoder:
                    left_state = tf.concat([
                        tf.zeros([self.batch_size, 10, self.arg.hidden_size]),
                        y
                    ], axis=1)
                    padding = 'VALID'
                else:
                    padding = 'SAME'
                    left_state = y
                left_state = utils.separable_conv(left_state,
                                                  filters=self.arg.hidden_size * 2,
                                                  kernel_size=11,
                                                  padding=padding,
                                                  name='separable_11x1')
                if self.arg.use_relu:
                    left_state = tf.nn.relu(left_state)
                else:
                    left_state = utils.gelu(left_state)
                left_state = self.dropout_fn(left_state)
                if self.arg.unidirectional_decoder:
                    right_state = tf.concat([
                        tf.zeros([self.batch_size, 6, self.arg.hidden_size]),
                        y
                    ], axis=1)
                    padding = 'VALID'
                else:
                    padding = 'SAME'
                    right_state = y
                right_state = utils.separable_conv(
                    right_state,
                    filters=int(self.arg.hidden_size / 2),
                    kernel_size=7,
                    padding=padding,
                    name='separable_7x1')
                right_state = tf.pad(
                    right_state,
                    paddings=[[0, 0], [0, 0],
                              [0, int(self.arg.hidden_size * 1.5)]],
                    constant_values=0)
                y = left_state + right_state
                y = utils.layer_norm(y)
                if self.arg.unidirectional_decoder:
                    y = tf.concat([
                        tf.zeros([self.batch_size, 6, self.arg.hidden_size * 2]),
                        y
                    ], axis=1)
                    padding = 'VALID'
                else:
                    padding = 'SAME'
                y = utils.separable_conv(y,
                                         filters=self.arg.hidden_size,
                                         kernel_size=7,
                                         padding=padding,
                                         name='separable_7x1_2')
                x += self.dropout_fn(y)
            with tf.variable_scope('self_attention'):
                y = utils.layer_norm(x)
                y = utils.multihead_attention(
                    query=y,
                    memory=None,
                    bias=self.decoder_self_attention_bias,
                    total_key_depth=self.arg.head_size * self.arg.num_heads,
                    total_value_depth=self.arg.head_size * self.arg.num_heads,
                    output_depth=self.arg.hidden_size,
                    num_heads=self.arg.num_heads,
                    dropout_keep_prob=self.keep_prob,
                    dropout_type=self.arg.dropout_type,
                    relative_attention=self.arg.relative_attention,
                    max_relative_position=self.arg.max_relative_position,
                    adaptive_mask=self.arg.adaptive_mask,
                    dynamic_attention_span=self.arg.dynamic_attention_span)
                if self.arg.adaptive_mask:
                    self.decoder_l0.append(y[1])
                    y = y[0]
                x += self.dropout_fn(y)
            with tf.variable_scope('encoder_attention'):
                y = utils.layer_norm(x)
                y = utils.multihead_attention(
                    query=y,
                    memory=memory,
                    bias=self.encoder_decoder_attention_bias,
                    total_key_depth=self.arg.head_size * self.arg.num_heads,
                    total_value_depth=self.arg.head_size * self.arg.num_heads,
                    output_depth=self.arg.hidden_size,
                    num_heads=self.arg.num_heads,
                    dropout_keep_prob=self.keep_prob,
                    dropout_type=self.arg.dropout_type,
                    relative_attention=False,
                    max_relative_position=self.arg.max_relative_position,
                    adaptive_mask=self.arg.adaptive_mask,
                    dynamic_attention_span=self.arg.dynamic_attention_span)
                if self.arg.adaptive_mask:
                    self.decoder_l0.append(y[1])
                    y = y[0]
                x += self.dropout_fn(y)
            with tf.variable_scope('dense_layers'):
                y = utils.layer_norm(x)
                y = utils.dense(y,
                                output_dim=self.arg.hidden_size * 4,
                                name='dense_1')
                y = tf.nn.swish(y)
                y = utils.layer_norm(y)
                y = utils.dense(y,
                                output_dim=self.arg.hidden_size,
                                name='dense_2')
                x += self.dropout_fn(y)
    return utils.layer_norm(x)
def encoder(self, inputs, encoder_self_attention_bias):
    x = inputs
    if self.arg.adaptive_mask:
        self.encoder_l0 = []
    for layer in range(1, self.arg.encoder_layers + 1):
        with tf.variable_scope('layer_{}'.format(layer)):
            with tf.variable_scope('gated_linear_unit'):
                y = utils.layer_norm(x)
                y = utils.convolution_gating(y,
                                             kernel_size=1,
                                             input_dim=y.shape.as_list()[-1],
                                             output_dim=y.shape.as_list()[-1])
                y = self.dropout_fn(y)
                x += y
            with tf.variable_scope('conv_branches'):
                y = utils.layer_norm(x)
                if self.arg.use_relu:
                    left_state = tf.nn.relu(
                        utils.dense(y,
                                    output_dim=int(self.arg.hidden_size * 4),
                                    name='left_branch'))
                else:
                    left_state = utils.gelu(
                        utils.dense(y,
                                    output_dim=int(self.arg.hidden_size * 4),
                                    name='left_branch'))
                left_state = self.dropout_fn(left_state)
                with tf.variable_scope('right_branch'):
                    kernel = tf.get_variable(
                        'kernel',
                        shape=[3,
                               y.shape.as_list()[-1],
                               int(self.arg.hidden_size / 2)],
                        dtype=tf.float32)
                    '''
                    At this point the tensor has shape
                    [batch_size, sequence_size, hidden_size] and the kernel
                    size of the convolution is 3, so an unmoderated
                    convolution at time-step t would read time-steps
                    (t-1, t, t+1) to produce the output for time-step t.
                    If the analysis is unidirectional, such that time-step t
                    must not see 'ahead through time', that form of analysis
                    is invalid. To avoid it, a zeros tensor is concatenated to
                    the left of the input, so that at time-step t the
                    convolution reads time-steps (t-2, t-1, t), where the
                    tokens at positions -2 and -1 are 0.
                    If the analysis is bidirectional, reading time-steps
                    (t-1, t, t+1) is legal.
                    '''
                    if self.arg.unidirectional_encoder:
                        padding = 'VALID'
                        y = tf.concat([
                            tf.zeros([self.batch_size, 2, self.arg.hidden_size]),
                            y
                        ], axis=1)
                    else:
                        padding = 'SAME'
                    right_state = tf.nn.convolution(y,
                                                    kernel,
                                                    padding=padding,
                                                    name='convolution_conv_3x1')
                    if self.arg.use_relu:
                        right_state = tf.nn.relu(right_state)
                    else:
                        right_state = utils.gelu(right_state)
                    right_state = self.dropout_fn(right_state)
                    right_state = tf.pad(
                        right_state,
                        [[0, 0], [0, 0],
                         [0,
                          int(self.arg.hidden_size * 4) -
                          int(self.arg.hidden_size / 2)]],
                        constant_values=0)
                y = left_state + right_state
                y = utils.layer_norm(y)
                if self.arg.unidirectional_encoder:
                    padding = 'VALID'
                    y = tf.concat([
                        tf.zeros([self.batch_size, 8, self.arg.hidden_size * 4]),
                        y
                    ], axis=1)
                else:
                    padding = 'SAME'
                y = utils.separable_conv(y,
                                         filters=int(self.arg.hidden_size / 2),
                                         kernel_size=9,
                                         padding=padding,
                                         name='separable_9x1')
                y = tf.pad(y,
                           [[0, 0], [0, 0],
                            [0, int(self.arg.hidden_size / 2)]],
                           constant_values=0)
                x += self.dropout_fn(y)
            with tf.variable_scope('self_attention'):
                y = utils.layer_norm(x)
                y = utils.multihead_attention(
                    query=y,
                    memory=None,
                    bias=self.encoder_self_attention_bias,
                    total_key_depth=self.arg.head_size * self.arg.num_heads,
                    total_value_depth=self.arg.head_size * self.arg.num_heads,
                    output_depth=self.arg.hidden_size,
                    num_heads=self.arg.num_heads,
                    dropout_keep_prob=self.keep_prob,
                    dropout_type=self.arg.dropout_type,
                    relative_attention=self.arg.relative_attention,
                    max_relative_position=self.arg.max_relative_position,
                    adaptive_mask=self.arg.adaptive_mask,
                    dynamic_attention_span=self.arg.dynamic_attention_span)
                if self.arg.adaptive_mask:
                    self.encoder_l0.append(y[1])
                    y = y[0]
                x += self.dropout_fn(y)
            with tf.variable_scope('dense_layers'):
                y = utils.layer_norm(x)
                y = utils.dense(y,
                                output_dim=int(self.arg.hidden_size * 4),
                                name='dense_1')
                if self.arg.use_relu:
                    y = tf.nn.relu(y)
                else:
                    y = utils.gelu(y)
                y = self.dropout_fn(y)
                y = utils.dense(y,
                                output_dim=int(self.arg.hidden_size),
                                name='dense_2')
                x += self.dropout_fn(y)
    return utils.layer_norm(x)
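# A tiny numeric check of the causal-padding argument in the comment inside
# encoder() above (values and shapes are illustrative, not part of the model):
# with kernel_size = 3, two leading zero frames and 'VALID' padding, output
# step t only ever reads original steps t-2, t-1 and t.
#
#   seq = tf.reshape(tf.range(4, dtype=tf.float32), [1, 4, 1])  # [batch, time, channels]
#   padded = tf.concat([tf.zeros([1, 2, 1]), seq], axis=1)      # [1, 6, 1]
#   kernel = tf.ones([3, 1, 1])                                 # sums each 3-step window
#   out = tf.nn.convolution(padded, kernel, padding='VALID')    # [1, 4, 1]
#   # out[0, :, 0] == [0., 1., 3., 6.] -> each entry depends only on current and past steps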
def encoder(self, inputs, encoder_self_attention_bias):
    with tf.variable_scope('positional_embedding'):
        pos_seq = tf.range(
            tf.shape(self.encoder_self_attention_bias)[-1] - 1, -1, -1.0)
        inv_freq = 1 / (10000**(tf.range(0, self.arg.hidden_size, 2.0) /
                                self.arg.hidden_size))
        sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
        pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)],
                            axis=-1)
        pos_emb = tf.tile(pos_emb[None, :, :], [self.batch_size, 1, 1])
    if self.arg.tie_weights:
        r_w_bias = tf.get_variable(
            'r_w_bias',
            shape=[1, self.arg.num_heads, 1, self.arg.head_size],
            dtype=tf.float32)
        r_r_bias = tf.get_variable(
            'r_r_bias',
            shape=[1, self.arg.num_heads, 1, self.arg.head_size],
            dtype=tf.float32)
    else:
        r_w_bias = tf.get_variable('r_w_bias',
                                   shape=[
                                       self.arg.encoder_layers, 1,
                                       self.arg.num_heads, 1,
                                       self.arg.head_size
                                   ],
                                   dtype=tf.float32)
        r_r_bias = tf.get_variable('r_r_bias',
                                   shape=[
                                       self.arg.encoder_layers, 1,
                                       self.arg.num_heads, 1,
                                       self.arg.head_size
                                   ],
                                   dtype=tf.float32)
    x = inputs
    for layer in range(1, self.arg.encoder_layers + 1):
        with tf.variable_scope('layer_{}'.format(layer)):
            x = self.timing_position(x)
            with tf.variable_scope('attention'):
                self.new_mems.append(
                    self._cache_mem(x, self.memory[layer - 1]))
                memory = tf.concat([self.memory[layer - 1], x], axis=1)
                y = utils.layer_norm(x)
                memory = utils.layer_norm(memory)
                q, k, v = utils.compute_qkv(
                    query=y,
                    memory=memory,
                    total_key_depth=self.arg.head_size * self.arg.num_heads,
                    total_value_depth=self.arg.head_size * self.arg.num_heads,
                    deparameterize=self.arg.deparameterize)
                r = utils.dense(pos_emb,
                                output_dim=self.arg.head_size * self.arg.num_heads,
                                use_bias=False,
                                name='pos_emb')
                r = tf.reshape(r, [
                    self.batch_size, self.arg.num_heads, -1, self.arg.head_size
                ])
                q = utils.split_heads(q, self.arg.num_heads)
                k = utils.split_heads(k, self.arg.num_heads)
                v = utils.split_heads(v, self.arg.num_heads)
                # AD: content-based scores using the content bias r_w_bias;
                # BD: position-based scores using the position bias r_r_bias,
                # aligned to per-query relative offsets by rel_shift
                if self.arg.tie_weights:
                    AD = tf.matmul(q + r_w_bias, k, transpose_b=True)
                    BD = tf.matmul(q + r_r_bias, r, transpose_b=True)
                else:
                    AD = tf.matmul(q + r_w_bias[layer - 1], k, transpose_b=True)
                    BD = tf.matmul(q + r_r_bias[layer - 1], r, transpose_b=True)
                BD = self.rel_shift(BD)
                logits = AD + BD
                logits /= k.shape.as_list()[-1]
                logits += self.encoder_self_attention_bias
                weights = tf.nn.softmax(logits, name='attention_weights')
                y = tf.matmul(weights, v)
                y = utils.combine_heads(y)
                y.set_shape(y.shape.as_list()[:-1] +
                            [self.arg.head_size * self.arg.num_heads])
            with tf.variable_scope('output'):
                y = utils.dense(y,
                                output_dim=self.arg.hidden_size,
                                use_bias=False,
                                name='output_transform')
                y = self.dropout_fn(y)
                x += y
            with tf.variable_scope('ffd'):
                y = utils.layer_norm(x)
                y = self.ffd(y)
                y = self.dropout_fn(y)
                x += y
    with tf.variable_scope('output'):
        return utils.layer_norm(x)
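# The rel_shift helper used above is not shown in this file. The sketch below
# is a hypothetical stand-in (named rel_shift_sketch to avoid clashing with the
# real method) that applies the standard Transformer-XL relative-shift trick to
# a [batch, num_heads, query_len, key_len] score tensor: pad one zero column on
# the left, reshape so the padding pushes each query row over by one position,
# drop the padding and reshape back.
def rel_shift_sketch(x):
    x_shape = tf.shape(x)
    x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]])
    x = tf.reshape(x, [x_shape[0], x_shape[1], x_shape[3] + 1, x_shape[2]])
    x = x[:, :, 1:, :]
    return tf.reshape(x, x_shape)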
def __init__(self, arg, name=None):
    '''
    a Seq2Seq model based on the model described in arXiv:1804.00946
    the stop-feature mechanism, in particular, is taken from that paper
    '''
    if name:
        self.name = name
    else:
        self.name = 'Seq2Seq'
    batch_size = 32
    input_sequence_size = 10
    output_sequence_size = 12
    if __name__ != '__main__':
        batch_size = input_sequence_size = output_sequence_size = None
    self.arg = arg
    self.inputs = tf.placeholder(tf.int32,
                                 shape=[batch_size, input_sequence_size],
                                 name='inputs')
    self.targets = tf.placeholder(tf.int32,
                                  shape=[batch_size, output_sequence_size],
                                  name='targets')
    self.training = tf.placeholder(tf.bool, name='training')
    self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    self.input_stop_feature = tf.placeholder(
        tf.float32,
        shape=[batch_size, input_sequence_size, 1],
        name='input_stop_feature')
    self.target_stop_feature = tf.placeholder(
        tf.float32,
        shape=[batch_size, output_sequence_size, 1],
        name='target_stop_feature')
    self.batch_size = tf.shape(self.inputs)[0]
    self.input_sequence_size = tf.shape(self.inputs)[1]
    self.target_sequence_size = tf.shape(self.targets)[1]
    if self.arg.mask_loss:
        self.loss_mask = tf.placeholder(
            tf.float32,
            shape=[batch_size, output_sequence_size],
            name='loss_mask')
    else:
        self.loss_mask = None
    with tf.variable_scope('embedding'):
        embedded_inputs, embedded_targets = self.embedding()
        embedded_inputs = tf.concat(
            [embedded_inputs, self.input_stop_feature], axis=2)
        embedded_targets = tf.concat(
            [embedded_targets, self.target_stop_feature], axis=2)
    with tf.variable_scope('encode'):
        encoder_output, encoder_state = self.encode(embedded_inputs)
        encoder_output = self.dropout_fn(encoder_output)
    with tf.variable_scope('decode'):
        decoder_output, _ = self.decode(encoder_output, encoder_state,
                                        embedded_targets)
        decoder_output = self.dropout_fn(decoder_output)
    with tf.variable_scope('output'):
        self.logits = utils.dense(decoder_output,
                                  output_dim=self.arg.target_vocab_size,
                                  name='logits')
    with tf.variable_scope('loss'):
        self.loss_cl = loss.Loss(self.logits,
                                 self.targets,
                                 self.arg.loss,
                                 vocab_size=self.arg.target_vocab_size,
                                 label_smoothing=self.arg.label_smoothing)
        cost = self.loss_cl.loss
        if self.arg.mask_loss:
            self.cost = tf.reduce_mean(cost * self.loss_mask)
        else:
            self.cost = tf.reduce_mean(cost)
        if self.arg.weight_decay_regularization:
            l2_loss = self.loss_cl.l2_loss(tf.trainable_variables())
            l2_loss *= self.arg.weight_decay_hyperparameter
            self.cost += l2_loss
    self.optimizer = optimize.Optimizer(arg,
                                        loss=self.cost,
                                        learning_rate=self.learning_rate)
    self.optimizer.accuracy(self.logits, self.targets, mask=self.loss_mask)
    self.train_op = self.optimizer.train_op
    self.predict = self.optimizer.predict
    self.correct_prediction = self.optimizer.correct_prediction
    self.accuracy = self.optimizer.accuracy
    self.optimizer.sequential_accuracy(self.logits,
                                       self.targets,
                                       mask=self.loss_mask)
    self.sequential_accuracy = self.optimizer.sequential_accuracy
    self.fetches = [embedded_inputs, encoder_output, self.logits]
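# Hypothetical training-step sketch for the model above (the `arg` namespace,
# the numpy batches and the hyperparameter values are assumptions; the
# placeholders and ops are the ones defined in __init__, and model.loss_mask
# would also be fed when arg.mask_loss is set).
#
#   model = Seq2Seq(arg)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       _, cost = sess.run(
#           [model.train_op, model.cost],
#           feed_dict={model.inputs: batch_inputs,
#                      model.targets: batch_targets,
#                      model.input_stop_feature: input_stop,
#                      model.target_stop_feature: target_stop,
#                      model.training: True,
#                      model.learning_rate: 1e-3,
#                      model.keep_prob: 0.9})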