def setup_but_because(self):
    # For Erin: this is the MODEL!!!
    # seqA: but, seqB: because, this will learn to differentiate them
    seqA_w_matrix, seqA_c_vec = self.encoder.encode(
        self.seqA_inputs, self.seqA_mask, temp_max=FLAGS.temp_max)
    seqB_w_matrix, seqB_c_vec = self.encoder.encode(
        self.seqB_inputs, self.seqB_mask, reuse=True, temp_max=FLAGS.temp_max)

    self.seqA_rep = seqA_c_vec
    self.seqB_rep = seqB_c_vec

    # for now we just use context vector
    # we create additional perspectives
    # seqA_c_vec: (batch_size, hidden_size)
    persA_B_mul = seqA_c_vec * seqB_c_vec
    persA_B_sub = seqA_c_vec - seqB_c_vec
    persA_B_avg = (seqA_c_vec + seqB_c_vec) / 2.0

    # logits is [batch_size, label_size]
    if FLAGS.abs:
        persA_B_sub = tf.abs(seqA_c_vec - seqB_c_vec)
        self.logits = rnn_cell._linear(
            [seqA_c_vec, seqB_c_vec, persA_B_mul, persA_B_sub],
            self.label_size, bias=True)
    else:
        self.logits = rnn_cell._linear(
            [seqA_c_vec, seqB_c_vec, persA_B_mul, persA_B_sub, persA_B_avg],
            self.label_size, bias=True)
def build_model(self):
    target_v = tf.placeholder(tf.float32, [None])
    conversation_d = tf.placeholder(tf.int32, [None, None])
    conversation_g = tf.placeholder(tf.int32, [None, None])
    conversation_len = tf.placeholder(tf.int32, [None])
    last_description = tf.placeholder(tf.int32, [None])
    target_word = tf.placeholder(tf.int32, [None])
    i_know = tf.placeholder(tf.int32, [None])

    word_embeddings = tf.get_variable(
        'word_embed',
        initializer=tf.convert_to_tensor(self.data_loader.embeddings, dtype=tf.float32),
        trainable=False)
    conv_d = tf.nn.embedding_lookup(word_embeddings, conversation_d)
    conv_g = tf.nn.embedding_lookup(word_embeddings, conversation_g)
    know = tf.nn.embedding_lookup(word_embeddings, i_know)
    conv = tf.concat(2, [conv_g, conv_d])

    with tf.variable_scope('guesser'):
        last_des = tf.nn.embedding_lookup(word_embeddings, last_description)
        gue_cell = rnn_cell.GRUCell(self.hidden_units)
        _, gue_state = rnn.dynamic_rnn(gue_cell, conv, conversation_len, dtype=tf.float32)
        gue_repr = tf.tanh(rnn_cell._linear([gue_state, last_des], self.final_units, True))
        gue_core = tf.get_variable('gue_core', [self.final_units, self.embedding_size])
        gue_ready = tf.matmul(gue_repr, gue_core)
        guesser_value = tf.reduce_sum(tf.mul(gue_ready, know), 1)
        gue_pred = tf.matmul(gue_ready, word_embeddings, transpose_b=True)
        _guess_ = tf.nn.top_k(gue_pred, self.vocab_size)

    with tf.variable_scope('describer'):
        target = tf.nn.embedding_lookup(word_embeddings, target_word)
        des_cell = Contextual_GRUCell(self.hidden_units)
        _, des_state = contextual_rnn(des_cell, conv, target, conversation_len, dtype=tf.float32)
        des_repr = tf.tanh(rnn_cell._linear([des_state, target], self.final_units, True))
        des_core = tf.get_variable('des_core', [self.final_units, self.embedding_size])
        des_ready = tf.matmul(des_repr, des_core)
        describer_value = tf.reduce_sum(tf.mul(des_ready, know), 1)
        des_pred = tf.matmul(des_ready, word_embeddings, transpose_b=True)
        _description_ = tf.nn.top_k(des_pred, self.vocab_size)

    optimizer = tf.train.GradientDescentOptimizer(self.step_size)
    update_guesser_op = optimizer.minimize(tf.reduce_sum(tf.square(target_v - guesser_value)))
    update_describler_op = optimizer.minimize(tf.reduce_sum(tf.square(target_v - describer_value)))

    return (update_guesser_op, update_describler_op, guesser_value, describer_value,
            target_v, i_know, conversation_d, conversation_g, conversation_len,
            last_description, target_word, _guess_, _description_)
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with vs.variable_scope(scope or type(self).__name__): if self._dropMaskInput.get_shape()[1:] != inputs.get_shape()[1:]: print("error: "+str(self._dropMaskInput.get_shape()[1:])+" != "+str(inputs.get_shape()[1:])) assert(False) if self._dropMaskState.get_shape()[1:] != state.get_shape()[1:]: print("error: "+str(self._dropMaskState.get_shape()[1:])+" != "+str(state.get_shape()[1:])) assert(False) dropin = tf.mul(self._dropMaskInput, inputs) dropst = tf.mul(self._dropMaskState, state) with vs.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. concat = rnn_cell._linear([dropin, dropst], 2 * self._num_units, True, 1.0) r, u = tf.split(1, 2, concat) r, u = tf.sigmoid(r), tf.sigmoid(u) with vs.variable_scope("Candidate"): htilda = self._activation(rnn_cell._linear([dropin, r * dropst], self._num_units, True)) new_h = u * dropst + (1 - u) * htilda return new_h, new_h
def __call__(self, inputs, state, scope=None):
    gru_out, gru_state = super(GRUAttnCell, self).__call__(inputs, state, scope)
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn"):
            ht = rnn_cell._linear(gru_out, self._num_units, True, 1.0)
            ht = tf.expand_dims(ht, axis=1)
        scores = tf.reduce_sum(self.hs * ht, reduction_indices=2, keep_dims=True)
        # New stuff
        scores = tf.exp(scores - tf.reduce_max(scores, reduction_indices=1, keep_dims=True))
        scores = scores / (1e-6 + tf.reduce_sum(scores, reduction_indices=1, keep_dims=True))
        context = tf.reduce_sum(self.hs * scores, reduction_indices=1)
        with vs.variable_scope("AttnConcat"):
            out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0))
    return out, out
def hyper_norm(self, layer, dimensions, scope="hyper"):
    with tf.variable_scope(scope):
        zw = rnn_cell._linear(self.hyper_output, self.hyper_embedding_size, False, scope=scope + "z")
        alpha = rnn_cell._linear(zw, dimensions, False, scope=scope + "alpha")
        result = tf.mul(alpha, layer)
    return result
def attention(input_t, output_t_minus_1, time):
    with tf.variable_scope('attention'):
        # batch_size x 1 x 1 x attention
        VxS = tf.reshape(
            rnn_cell._linear(output_t_minus_1, self.attention_judge_size, True),
            [-1, 1, 1, self.attention_judge_size])
        # batch_size x source_len x 1
        _exp = tf.exp(tf.reduce_sum(attention_V * tf.tanh(WxH + VxS), [3]))
        _exp = _exp * tf.expand_dims(self.mask, -1)
        attention_weight = _exp / tf.reduce_sum(_exp, [1], keep_dims=True)
        attention_t = tf.reduce_sum(encoder_outputs * attention_weight, [1])
        feed_in_t = tf.tanh(rnn_cell._linear([attention_t, input_t], self.embedding_size, True))
        return feed_in_t
def __call__(self, inputs, state, scope=None): gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope) with vs.variable_scope(scope or type(self).__name__): with vs.variable_scope("Attn2"): gamma_h = tf.nn.tanh(rnn_cell._linear(gru_out, self._num_units, True, 1.0)) weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True) weights = tf.nn.softmax(weights, dim=1) context = tf.reduce_sum(self.hs * weights, reduction_indices=1) with vs.variable_scope("AttnConcat"): out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0)) return (out, out)
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with vs.variable_scope(scope or type(self).__name__): # "GRUCell" with vs.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. r, u = array_ops.split(1, 2, rnn_cell._linear([inputs, state], 2 * self._num_units, True, 1.0)) r, u = tf.sigmoid(r), tf.sigmoid(u) with vs.variable_scope("Candidate"): c = self._activation(rnn_cell._linear([inputs, r * state], self._num_units, True)) new_h = u * state + (1 - u) * c return new_h, new_h
def __call__(self, inputs, state, scope=None): gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope) with vs.variable_scope(scope or type(self).__name__): with vs.variable_scope("Attn2"): gamma_h = tanh(rnn_cell._linear(gru_out, self._num_units, True, 1.0)) weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True) weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True)) weights = weights / (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True)) context = tf.reduce_sum(self.hs * weights, reduction_indices=0) with vs.variable_scope("AttnConcat"): out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0)) self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1])) return (out, out)
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM) with hypernetworks and layer normalization.""" with vs.variable_scope(scope or type(self).__name__): # Parameters of gates are concatenated into one multiply for efficiency. total_h, total_c = tf.split(1, 2, state) h = total_h[:, 0:self._num_units] c = total_c[:, 0:self._num_units] self.hyper_state = tf.concat(1, [total_h[:, self._num_units:], total_c[:, self._num_units:]]) hyper_input = tf.concat(1, [inputs, h]) hyper_output, hyper_new_state = self.hyper_cell(hyper_input, self.hyper_state) self.hyper_output = hyper_output self.hyper_state = hyper_new_state input_below_ = rnn_cell._linear([inputs], 4 * self._num_units, False, scope="out_1") input_below_ = self.hyper_norm(input_below_, 4 * self._num_units, scope="hyper_x") state_below_ = rnn_cell._linear([h], 4 * self._num_units, False, scope="out_2") state_below_ = self.hyper_norm(state_below_, 4 * self._num_units, scope="hyper_h") if self.is_layer_norm: s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32) b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32) input_below_ = ln(input_below_, s1, b1) state_below_ = ln(state_below_, s2, b2) lstm_matrix = tf.add(input_below_, state_below_) i, j, f, o = array_ops.split(1, 4, lstm_matrix) new_c = (c * sigmoid(f) + sigmoid(i) * self._activation(j)) # Currently normalizing c causes lot of nan's in the model, thus commenting it out for now. # new_c_ = ln(new_c, s3, b3) new_c_ = new_c new_h = self._activation(new_c_) * sigmoid(o) hyper_h, hyper_c = tf.split(1, 2, hyper_new_state) new_total_h = tf.concat(1, [new_h, hyper_h]) new_total_c = tf.concat(1, [new_c, hyper_c]) new_total_state = tf.concat(1, [new_total_h, new_total_c]) return new_h, new_total_state
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" dim = self._num_units with vs.variable_scope(scope or type(self).__name__): # "GRUCell" with vs.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. with vs.variable_scope( "Layer_Parameters"): s1 = vs.get_variable("s1", initializer=tf.ones([2*dim]), dtype=tf.float32) s2 = vs.get_variable("s2", initializer=tf.ones([2*dim]), dtype=tf.float32) s3 = vs.get_variable("s3", initializer=tf.ones([dim]), dtype=tf.float32) s4 = vs.get_variable("s4", initializer=tf.ones([dim]), dtype=tf.float32) b1 = vs.get_variable("b1", initializer=tf.zeros([2*dim]), dtype=tf.float32) b2 = vs.get_variable("b2", initializer=tf.zeros([2*dim]), dtype=tf.float32) b3 = vs.get_variable("b3", initializer=tf.zeros([dim]), dtype=tf.float32) b4 = vs.get_variable("b4", initializer=tf.zeros([dim]), dtype=tf.float32) # Code below initialized for all cells # s1 = tf.Variable(tf.ones([2 * dim]), name="s1") # s2 = tf.Variable(tf.ones([2 * dim]), name="s2") # s3 = tf.Variable(tf.ones([dim]), name="s3") # s4 = tf.Variable(tf.ones([dim]), name="s4") # b1 = tf.Variable(tf.zeros([2 * dim]), name="b1") # b2 = tf.Variable(tf.zeros([2 * dim]), name="b2") # b3 = tf.Variable(tf.zeros([dim]), name="b3") # b4 = tf.Variable(tf.zeros([dim]), name="b4") input_below_ = rnn_cell._linear([inputs], 2 * self._num_units, False, scope="out_1") input_below_ = ln(input_below_, s1, b1) state_below_ = rnn_cell._linear([state], 2 * self._num_units, False, scope="out_2") state_below_ = ln(state_below_, s2, b2) out =tf.add(input_below_, state_below_) r, u = array_ops.split(1, 2, out) r, u = sigmoid(r), sigmoid(u) with vs.variable_scope("Candidate"): input_below_x = rnn_cell._linear([inputs], self._num_units, False, scope="out_3") input_below_x = ln(input_below_x, s3, b3) state_below_x = rnn_cell._linear([state], self._num_units, False, scope="out_4") state_below_x = ln(state_below_x, s4, b4) c_pre = tf.add(input_below_x,r * state_below_x) c = self._activation(c_pre) new_h = u * state + (1 - u) * c return new_h, new_h
def __call__(self, inputs, context, state, scope=None):
    """Contextual Gated recurrent unit (CGRU) with nunits cells."""
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Gates"):
            r, u = array_ops.split(
                1, 2, rnn_cell._linear([inputs, context, state], 2 * self._num_units, True, 1.0))
            r, u = sigmoid(r), sigmoid(u)
        with vs.variable_scope("Candidate"):
            c = self._activation(
                rnn_cell._linear([inputs, context, r * state], self._num_units, True))
        new_h = u * state + (1 - u) * c
    return new_h, new_h
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. if self._state_is_tuple: c, h = state else: c, h = array_ops.split(1, 2, state) s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32) b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32) # s1 = tf.Variable(tf.ones([4 * self._num_units]), name="s1") # s2 = tf.Variable(tf.ones([4 * self._num_units]), name="s2") # s3 = tf.Variable(tf.ones([self._num_units]), name="s3") # # b1 = tf.Variable(tf.zeros([4 * self._num_units]), name="b1") # b2 = tf.Variable(tf.zeros([4 * self._num_units]), name="b2") # b3 = tf.Variable(tf.zeros([self._num_units]), name="b3") input_below_ = rnn_cell._linear([inputs], 4 * self._num_units, False, scope="out_1") input_below_ = ln(input_below_, s1, b1) state_below_ = rnn_cell._linear([h], 4 * self._num_units, False, scope="out_2") state_below_ = ln(state_below_, s2, b2) lstm_matrix = tf.add(input_below_, state_below_) i, j, f, o = array_ops.split(1, 4, lstm_matrix) new_c = (c * sigmoid(f) + sigmoid(i) * self._activation(j)) # Currently normalizing c causes lot of nan's in the model, thus commenting it out for now. # new_c_ = ln(new_c, s3, b3) new_c_ = new_c new_h = self._activation(new_c_) * sigmoid(o) if self._state_is_tuple: new_state = LSTMStateTuple(new_c, new_h) else: new_state = array_ops.concat(1, [new_c, new_h]) return new_h, new_state
def attention(query, use_attention=False): """Put attention masks on hidden using hidden_features and query.""" attn_weights = [] ds = [] # Results of attention reads will be stored here. for i in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % i): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3]) if use_attention is False: # apply mean pooling weights = tf.tile(sequence_length, tf.pack([attn_length])) weights = array_ops.reshape(weights, tf.shape(s)) a = array_ops.ones( tf.shape(s), dtype=dtype) / math_ops.to_float(weights) # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1]) else: a = nn_ops.softmax(s) attn_weights.append(a) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return attn_weights, ds
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0, is_train=None): if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] flat_args = [flatten(arg, 1) for arg in args] if input_keep_prob < 1.0: assert is_train is not None flat_args = [ tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg) for arg in flat_args ] flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope) out = reconstruct(flat_out, args[0], 1) if squeeze: out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1]) if wd: add_wd(wd) return out
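# --- Usage sketch (not from the original source) ---
# A minimal, hypothetical example of calling the `linear` wrapper above. It assumes
# the `flatten`/`reconstruct` helpers it depends on are importable, and it uses the
# same TF 0.x/1.x-era placeholder API as the snippet; all names are illustrative.
# x = tf.placeholder(tf.float32, [None, 20, 50])       # [batch, time, features]
# is_train = tf.placeholder(tf.bool, [])
# h = linear([x], 64, bias=True, scope="proj",
#            input_keep_prob=0.8, is_train=is_train)   # projects last dim 50 -> 64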
def __call__(self, inputs, state, scope=None): """Most basic RNN: output = new_state = activation(W * input + U * state + B + noise).""" with vs.variable_scope(scope or type(self).__name__): # "BasicRNNCell" z = rnn_cell._linear([inputs, state], self._num_units, True) z += random_normal(shape(z), stddev=self._stddev) output = self._activation(z) return output, output
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with vs.variable_scope(scope or type(self).__name__): # "GRUCell" with vs.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. r, u = array_ops.split( 1, 2, rnn_cell._linear([inputs, state], 2 * self._num_units, True, 1.0)) r, u = tf.sigmoid(r), tf.sigmoid(u) with vs.variable_scope("Candidate"): c = self._activation( rnn_cell._linear([inputs, r * state], self._num_units, True)) new_h = u * state + (1 - u) * c return new_h, new_h
def __call__(self, inputs, state, d_act, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
        if self._state_is_tuple:
            c, h = state
        else:
            # try/except handles the argument-order change of split/concat across TF versions.
            try:
                c, h = array_ops.split(1, 2, state)
            except Exception:
                c, h = array_ops.split(state, 2, 1)

        concat = _linear([inputs, h], 4 * self._num_units, True)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        try:
            i, j, f, o = array_ops.split(1, 4, concat)
        except Exception:
            i, j, f, o = array_ops.split(concat, 4, 1)

        w_d = vs.get_variable('w_d', [self.key_words_voc_size, self._num_units])
        new_c = (c * sigmoid(f + self._forget_bias) +
                 sigmoid(i) * self._activation(j)) + tf.tanh(tf.matmul(d_act, w_d))
        new_h = self._activation(new_c) * sigmoid(o)

        if self._state_is_tuple:
            new_state = LSTMStateTuple(new_c, new_h)
        else:
            try:
                new_state = array_ops.concat(1, [new_c, new_h])
            except Exception:
                new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def _multi_head(self, queries, keys, query_mask, key_mask, num_heads,
                block_feature=False, scope='multihead', reuse=None):
    with vs.variable_scope(scope, reuse=reuse):
        # batch_size * seq_size_q * num_units
        Q = rnn_cell._linear(tf.reshape(queries, [-1, self.num_units]), self.num_units, True, 1.0, scope='Q')
        Q = tf.reshape(Q, tf.shape(queries))
        # batch_size * seq_size_k * num_units
        K = rnn_cell._linear(tf.reshape(keys, [-1, self.num_units]), self.num_units, True, 1.0, scope='K')
        K = tf.reshape(K, tf.shape(keys))
        V = rnn_cell._linear(tf.reshape(keys, [-1, self.num_units]), self.num_units, True, 1.0, scope='V')
        V = tf.reshape(V, tf.shape(keys))

        Q_ = tf.pack(tf.split(2, num_heads, Q))  # num_heads * batch_size * seq_size_q * num_units/num_heads
        K_ = tf.pack(tf.split(2, num_heads, K))  # num_heads * batch_size * seq_size_k * num_units/num_heads
        V_ = tf.pack(tf.split(2, num_heads, V))  # num_heads * batch_size * seq_size_k * num_units/num_heads

        len_q = tf.shape(queries)[1]
        len_k = tf.shape(keys)[1]

        # Compute weight
        weights = tf.batch_matmul(Q_, tf.transpose(K_, [0, 1, 3, 2])) \
            / ((self.num_units / num_heads) ** 0.5)  # num_heads * batch_size * seq_size_q * seq_size_k
        key_mask = tf.tile(tf.reshape(key_mask, [1, -1, 1, len_k]), [num_heads, 1, len_q, 1])
        weights = tf.select(key_mask, weights, tf.ones_like(weights) * (-2 ** 32 + 1))

        if block_feature:
            diag_vals = tf.ones_like(weights[0, 0, :, :])  # seq_size_q * seq_size_k
            mask = tf.cast(tf.batch_matrix_band_part(diag_vals, -1, 0), tf.bool)
            mask = tf.tile(tf.reshape(mask, [1, 1, len_q, len_k]), [num_heads, tf.shape(queries)[0], 1, 1])
            weights = tf.select(mask, weights, tf.ones_like(weights) * (-2 ** 32 + 1))

        weights = tf.reshape(tf.nn.softmax(tf.reshape(weights, [-1, len_k])),
                             [num_heads, -1, len_q, len_k])

        # num_heads * batch_size * seq_size_q * num_units/num_heads
        ctx = tf.batch_matmul(weights, V_)
        ctx *= tf.reshape(tf.cast(query_mask, tf.float32), [-1, len_q, 1])
        # batch_size * seq_size_q * num_units
        ctx = tf.concat(2, tf.unpack(ctx))
        ctx = rnn_cell._linear(tf.reshape(ctx, [-1, self.num_units]), self.num_units, True, 1.0, scope='context')
        ctx = tf.reshape(ctx, [-1, len_q, self.num_units])
        drop_ctx = tf.nn.dropout(ctx, keep_prob=self.keep_prob)

        # Add and Normalization
        res = layer_normalization(drop_ctx + queries)
    return res, weights
def cross_attention_rnn(config, cell, inputs, padding_mask, xvector):
    """
    Input a list of tensors and get back the embedded vector for this list.
    NOTE: the difference from this function to the above one is that this takes a
    vector from another source into consideration when calculating attention weights.
    See Tan et al., 2015 "LSTM-based deep learning models for non-factoid answer
    selection" for details.
    """
    num_steps = len(inputs)
    hidden_size = cell.output_size * 2
    batch_size = inputs[0].get_shape()[0].value
    embed_size = inputs[0].get_shape()[1].value
    assert cell.output_size == config.rnn_hidden_size
    assert batch_size == config.batch_size
    assert embed_size == config.word_embed_size

    with tf.variable_scope("attention_RNN"):
        input_length = tf.reduce_sum(tf.pack(padding_mask, axis=1), 1)
        # input_length = tf.Print(input_length, [padding_mask, input_length],
        #                         message='input length', summarize=50)
        outputs, state_fw, state_bw = \
            tf.nn.bidirectional_rnn(cell, cell, inputs, dtype=config.data_type,
                                    sequence_length=input_length)
        # RESHAPE THE OUTPUTS, JUST IN CASE NONE DIM
        shaped_outputs = [tf.reshape(o, [batch_size, hidden_size]) for o in outputs]
        outputs = shaped_outputs
        outputs_for_attention = [tf.concat(1, [o, xvector])  # [batch_size, 2*hidden_size]
                                 for o in outputs]

        # OVERALL SEQUENCE REPRESENTATION
        hidden_outputs = []
        attention_weights = []
        outputs_concat = tf.pack(outputs, axis=1)  # [batch_size, num_step, hidden_size]
        with tf.variable_scope("attention_computation"):
            context_vector = tf.get_variable("context_vector", [2 * hidden_size, 1])
            # Calculate attention
            attention_weights = []
            for i in xrange(len(outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                hidden_output = tf.tanh(
                    rnn_cell._linear(outputs_for_attention[i], 2 * hidden_size, True))  # If add bias
                hidden_outputs.append(hidden_output)
                attention_weights.append(tf.matmul(hidden_output, context_vector))  # [batch_size, 1]
            attention_weights = tf.concat(1, attention_weights)
            attention_weights = tf.nn.softmax(attention_weights) * \
                tf.pack(padding_mask, axis=1)  # [batch_size, num_steps]
            attention_weights = tf.div(
                attention_weights,
                1e-12 + tf.reduce_sum(attention_weights, 1, keep_dims=True))
            # Attention weighted sum
            weighted_sum = tf.reduce_sum(
                outputs_concat * tf.expand_dims(attention_weights, 2), 1)  # [batch_size, hidden_size]
    return weighted_sum, outputs_concat, hidden_outputs, attention_weights
def __init__(self, num_units, encoder_output, scope=None):
    self.hs = encoder_output
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn1"):
            hs2d = tf.reshape(self.hs, [-1, num_units])
            phi_hs2d = tanh(rnn_cell._linear(hs2d, num_units, True, 1.0))
            self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
    super(GRUCellAttn, self).__init__(num_units)
def attention(query): """Point on hidden using hidden_features and query.""" with vs.variable_scope("Attention"): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3]) return s
def decode(self, h_q, h_p):
    """
    Takes in a knowledge representation and outputs a probability estimation over
    all paragraph tokens on which token should be the start of the answer span,
    and which should be the end of the answer span.

    :param knowledge_rep: a representation of the paragraph and question,
                          decided by how you choose to implement the encoder
    :return:
    """
    with vs.variable_scope("answer_start"):
        a_s = rnn_cell._linear([h_q, h_p], self.output_size, True, 1.0)
    with vs.variable_scope("answer_end"):
        a_e = rnn_cell._linear([h_q, h_p], self.output_size, True, 1.0)
    return (a_s, a_e)
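# --- Usage sketch (not from the original source) ---
# Hypothetical call of `decode` above: h_q and h_p would be fixed-size summaries of
# the question and paragraph produced by an encoder, and each returned tensor holds
# unnormalized start/end logits over `output_size` paragraph positions. The names
# and sizes below are illustrative only.
# h_q = tf.placeholder(tf.float32, [None, 200])
# h_p = tf.placeholder(tf.float32, [None, 200])
# a_s_logits, a_e_logits = decoder.decode(h_q, h_p)   # `decoder` is illustrative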
def attention(query): """Point on hidden using hidden_features and query.""" with vs.variable_scope("Attention"): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v * math_ops.tanh(hidden_features + y), [2, 3]) return s
def setup_actor_update(actor):
    with tf.variable_scope("rl"):
        actor.critic_output = tf.placeholder(tf.float32, [None, None, actor.vocab_size],
                                             name='critic_output')
        # action_gradients is passed in by Q_network...
        # and in DDPG, it's the gradients of Q w.r.t. policy's chosen actions
        # but in AC, it's the output of Q network w.r.t. all actions
        opt = nlc_model.get_optimizer(FLAGS.optimizer)(actor.learning_rate)

        # update
        params = tf.trainable_variables()

        # TODO: hope this would work
        with tf.variable_scope("Loss"):
            doshape = tf.shape(actor.decoder_output)
            T, batch_size = doshape[0], doshape[1]
            do2d = tf.reshape(actor.decoder_output, [-1, actor.size])
            logits2d = rnn_cell._linear(do2d, actor.vocab_size, True, 1.0)
            # outputs2d = tf.nn.log_softmax(logits2d)

            # apply Q-network's score here (similar to advantage function)
            # 1. reshape critic_output like decoder_output (same shape anyway)
            # TODO: hope this is correct
            critic_do2d = tf.reshape(actor.critic_output, [-1, actor.vocab_size])  # should reshape according to critic
            # 2. multiply this with actor's logits
            rl_logits2d = logits2d * critic_do2d

            # actor.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, actor.vocab_size]))
            targets_no_GO = tf.slice(actor.target_tokens, [1, 0], [-1, -1])
            masks_no_GO = tf.slice(actor.target_mask, [1, 0], [-1, -1])
            # easier to pad target/mask than to split decoder input since tensorflow does not support negative indexing
            labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
            mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
            losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(rl_logits2d, labels1d) * tf.to_float(mask1d)
            losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
            actor.rl_losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)

        # http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html (DDPG update)
        gradients = tf.gradients(actor.rl_losses, params)  # step 7: update
        # Not sure if I understood this part lol
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.max_gradient_norm)
        # clip, then multiply, otherwise we are not learning the signals from critic
        # clipped_gradients: [T, batch_size, vocab_size]
        # updated_gradients = clipped_gradients * actor.critic_output  # pass in as input
        actor.rl_gradient_norm = tf.global_norm(clipped_gradients)
        actor.rl_param_norm = tf.global_norm(params)

        actor.rl_updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=actor.global_step)
def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs, *states):
    batch_size = tf.shape(beam_probs)[0]
    inputs = tf.reshape(tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
    decoder_input = embedding_ops.embedding_lookup(self.L_pred, inputs)  # self.L_env
    decoder_output, state_output = self.decoder_graph(decoder_input, states)

    with vs.variable_scope("Logistic", reuse=True):
        do2d = tf.reshape(decoder_output, [-1, self.size])
        logits2d = rnn_cell._linear(do2d, self.tgt_vocab_size, True, 1.0)
        logprobs2d = tf.nn.log_softmax(logits2d)

    total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
    total_probs_noEOS = tf.concat(1, [
        tf.slice(total_probs, [0, 0], [batch_size, EOS_ID]),
        tf.tile([[-3e38]], [batch_size, 1]),
        tf.slice(total_probs, [0, EOS_ID + 1],
                 [batch_size, self.tgt_vocab_size - EOS_ID - 1])
    ])

    flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
    beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
    next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs, k=beam_k)

    next_bases = tf.floordiv(top_indices, self.tgt_vocab_size)
    next_mods = tf.mod(top_indices, self.tgt_vocab_size)

    next_states = [tf.gather(state, next_bases) for state in state_output]
    next_beam_seqs = tf.concat(1, [tf.gather(beam_seqs, next_bases),
                                   tf.reshape(next_mods, [-1, 1])])

    cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
    beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
    new_cand_seqs = tf.concat(0, [cand_seqs_pad, beam_seqs_EOS])
    EOS_probs = tf.slice(total_probs, [0, EOS_ID], [batch_size, 1])
    new_cand_probs = tf.concat(0, [cand_probs, tf.reshape(EOS_probs, [-1])])

    cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
    next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs, k=cand_k)
    next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

    return [time + 1, next_beam_probs, next_beam_seqs,
            next_cand_probs, next_cand_seqs] + next_states
def __call__(self, inputs, parent_state, cyc_state, scope=None):
    """Modified Long short-term memory for tree structure."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicTreeLSTMCell"
        # parameters of gates are concatenated into one multiply for efficiency
        parent_c, parent_h = parent_state
        cyc_c, cyc_h = cyc_state
        # Note: the two _linear projections below create variables in the same scope;
        # with the TF 0.x _linear helper they may need distinct sub-scopes to avoid
        # "Matrix"/"Bias" name collisions.
        c = rnn_cell._linear([parent_c, cyc_c], self._num_units, True)
        concat = rnn_cell._linear([inputs, parent_h, cyc_h], 4 * self._num_units, True)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)
        # The original wrapped this expression in a list, which would break the
        # downstream tensor ops; it should be a plain tensor.
        new_c = (c * rnn_cell.sigmoid(f + self._forget_bias) +
                 rnn_cell.sigmoid(i) * self._activation(j))
        new_h = self._activation(new_c) * rnn_cell.sigmoid(o)
        new_state = rnn_cell.LSTMStateTuple(new_c, new_h)
    return new_h, new_state
def __call__(self, inputs, state, scope=None): """Most basic RNN: output = new_state = activation(W * input + U * state + B).""" with vs.variable_scope(scope or type(self).__name__): # "BasicRNNCell" assert(self._dropMaskInput.get_shape()[1:] == inputs.get_shape()[1:]) assert(self._dropMaskState.get_shape()[1:] == state.get_shape()[1:]) dropin = tf.mul(self._dropMaskInput, inputs) dropst = tf.mul(self._dropMaskState, state) output = self._activation(rnn_cell._linear([dropin, dropst], self._num_units, True)) return output, output
def __call__(self, inputs, state, scope=None): """Does the stuff""" with tf.variable_scope(scope or type(self).__name__): h, mem = state concat = _linear(tf.concat(1, [h, inputs]), 4 * self._width, True, scope='keys') concat = tf.nn.tanh(concat) in_key, out_key = tf.split(1, 2, concat) in_val = _linear(inputs, self._width * 2, True, scope='input') in_val = in_val updated_mem = hrr.store(in_key, in_val, mem) output = hrr.retrieve(out_key, updated_mem) output = tf.nn.tanh(output) return output, (output, updated_mem)
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. c, h = array_ops.split(1, 2, state) concat = rnn_cell._linear([inputs, h], 4 * self._num_units, True) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(1, 4, concat) new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * self._activation(j) new_h = self._activation(new_c) * tf.sigmoid(o) return new_h, array_ops.concat(1, [new_c, new_h])
def rnn_linear(all_states, dim, output_size, scope, reuse=False, return_param=False):
    with tf.variable_scope(scope, reuse=reuse) as v_s:
        # all_states: (batch_size, time, hidden_size)
        doshape = tf.shape(all_states)
        batch_size, unroll = doshape[0], doshape[1]
        flattened = tf.reshape(all_states, [-1, dim])
        result2d = rnn_cell._linear(flattened, output_size=output_size, bias=True)
        result3d = tf.reshape(result2d, tf.pack([batch_size, unroll, -1]))
        if return_param:
            linear_params = [v for v in tf.global_variables() if v.name.startswith(v_s.name)]
            return result3d, linear_params
    return result3d
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. c, h = array_ops.split(1, 2, state) concat = rnn_cell._linear([inputs, h], 4 * self._num_units, True) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(1, 4, concat) new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid( i) * self._activation(j) new_h = self._activation(new_c) * tf.sigmoid(o) return new_h, array_ops.concat(1, [new_c, new_h])
def attention(query): with tf.variable_scope("Attention"): # attention on query (decoder states) query_feature = rnn_cell._linear(query, bias=True, output_size=attention_size, scope="Att_W2") # reshape query_feature feature to (-1, 1, 1, attention_size) in order to do summation query_feature = tf.reshape(query_feature, (-1, 1, 1, attention_size)) # compute attention vector u, should be (batch_size, attention_len) s = tf.reduce_sum(v * tf.nn.tanh(query_feature + hidden_feature), reduction_indices=[2, 3]) return s
def setup_loss(self):
    with vs.variable_scope("Logistic"):
        doshape = tf.shape(self.decoder_output)
        T, batch_size = doshape[0], doshape[1]
        do2d = tf.reshape(self.decoder_output, [-1, self.size])
        logits2d = rnn_cell._linear(do2d, self.vocab_size, True, 1.0)
        outputs2d = tf.nn.log_softmax(logits2d)
        self.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, self.vocab_size]))

        targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
        masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
        # easier to pad target/mask than to split decoder input since tensorflow does not support negative indexing
        labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
        mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
        losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(logits2d, labels1d) * tf.to_float(mask1d)
        losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
        self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
def __call__(self, inputs, state, scope=None): """Variational recurrent neural network cell (VRNN).""" with tf.variable_scope(scope or type(self).__name__): # Update the hidden state. z_t, z_mean_t, z_log_sigma_sq_t = state h_t_1 = self._activation( _linear([inputs, z_t, z_mean_t, z_log_sigma_sq_t], 2 * self._num_units, True)) z_mean_t_1, z_log_sigma_sq_t_1 = tf.split(1, 2, h_t_1) # Sample. eps = tf.random_normal((tf.shape(inputs)[0], self._num_units), 0.0, 1.0, dtype=tf.float32) z_t_1 = tf.add(z_mean_t_1, tf.mul(tf.sqrt(tf.exp(z_log_sigma_sq_t_1)), eps)) return z_t_1, VRNNStateTuple(z_t_1, z_mean_t_1, z_log_sigma_sq_t_1)
def attention(query): """Put attention masks on hidden using hidden_features and query.""" attn_weights = [] ds = [] # Results of attention reads will be stored here. for i in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % i): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3]) a = nn_ops.softmax(s) attn_weights.append(a) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return attn_weights, ds
def downscale(self, inp, mask):
    with vs.variable_scope("Downscale"):
        inshape = tf.shape(inp)
        T, batch_size, dim = inshape[0], inshape[1], inshape[2]
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
        out2d = rnn_cell._linear(inp2d, self.size, True, 1.0)
        out3d = tf.reshape(out2d, tf.pack((batch_size, tf.to_int32(T / 2), dim)))
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out3d.set_shape([None, None, self.size])
        out = tanh(out3d)

        mask = tf.transpose(mask)
        mask = tf.reshape(mask, [-1, 2])
        mask = tf.cast(mask, tf.bool)
        mask = tf.reduce_any(mask, reduction_indices=1)
        mask = tf.to_int32(mask)
        mask = tf.reshape(mask, tf.pack([batch_size, -1]))
        mask = tf.transpose(mask)
    return out, mask
def attention_encode(self):
    # (length, batch_size, dim)
    query_w_matrix = self.normal_encode(self.encoder_inputs, self.source_mask)
    context_w_matrix = self.normal_encode(self.ctx_inputs, self.ctx_mask, reuse=True)

    # can add a query variation here (optional)
    # can take out coattention mix...but by experiment it should be better than no coattention
    # in PA4 it was also time-major

    # batch, p, size
    p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
    # batch, q, size
    q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
    # batch, size, q
    q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

    # 2). Q->P Attention
    # [256,25,125] vs [128,125,11]
    A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
    A_p = tf.nn.softmax(A)

    # 3). Paragraph's context vectors
    C_p = batch_matmul(A_p, q_encoding)

    # 4). Linear mix of paragraph's context vectors and paragraph states
    flat_C_p = tf.reshape(C_p, [-1, self.FLAGS.size])
    flat_p_enc = tf.reshape(p_encoding, [-1, self.FLAGS.size])
    doshape = tf.shape(context_w_matrix)
    T, batch_size = doshape[0], doshape[1]
    # mixed_p: (batch * p_len, size)
    mixed_p = rnn_cell._linear([flat_C_p, flat_p_enc], self.FLAGS.size, bias=True)
    mixed_p = tf.reshape(mixed_p, tf.pack([T, -1, self.FLAGS.size]))

    # no extra layer of RNN on top of coattention result
    return mixed_p
def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. # if nest.is_sequence(query): # If the query is a tuple, flatten it. # query_list = nest.flatten(query) # for q in query_list: # Check that ndims == 2 if specified. # ndims = q.get_shape().ndims # if ndims: # assert ndims == 2 # query = array_ops.concat(1, query_list) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds
def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs, *states):
    batch_size = tf.shape(beam_probs)[0]
    inputs = tf.reshape(tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
    decoder_input = embedding_ops.embedding_lookup(self.L_dec, inputs)
    decoder_output, state_output = self.decoder_graph(decoder_input, states)

    with vs.variable_scope("Logistic", reuse=True):
        do2d = tf.reshape(decoder_output, [-1, self.size])
        logits2d = rnn_cell._linear(do2d, self.vocab_size, True, 1.0)
        logprobs2d = tf.nn.log_softmax(logits2d)

    total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
    total_probs_noEOS = tf.concat(1, [
        tf.slice(total_probs, [0, 0], [batch_size, nlc_data.EOS_ID]),
        tf.tile([[-3e38]], [batch_size, 1]),
        tf.slice(total_probs, [0, nlc_data.EOS_ID + 1],
                 [batch_size, self.vocab_size - nlc_data.EOS_ID - 1])
    ])

    flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
    beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
    next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs, k=beam_k)

    next_bases = tf.floordiv(top_indices, self.vocab_size)
    next_mods = tf.mod(top_indices, self.vocab_size)

    next_states = [tf.gather(state, next_bases) for state in state_output]
    next_beam_seqs = tf.concat(1, [tf.gather(beam_seqs, next_bases),
                                   tf.reshape(next_mods, [-1, 1])])

    cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
    beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
    new_cand_seqs = tf.concat(0, [cand_seqs_pad, beam_seqs_EOS])
    EOS_probs = tf.slice(total_probs, [0, nlc_data.EOS_ID], [batch_size, 1])
    new_cand_probs = tf.concat(0, [cand_probs, tf.reshape(EOS_probs, [-1])])

    cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
    next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs, k=cand_k)
    next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

    return [time + 1, next_beam_probs, next_beam_seqs,
            next_cand_probs, next_cand_seqs] + next_states
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.

    Args:
        decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "pointer_decoder".

    Returns:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
            [batch_size x output_size]. These represent the generated outputs.
            Output i is computed from input i (the i-th decoder_inputs).
            First, we run the cell on a combination of the input and previous attention masks:
                cell_output, new_state = cell(linear(input, prev_attn), prev_state).
            Then, we calculate new attention masks:
                new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
            and then we calculate the output:
                output = linear(cell_output, new_attn).
        states: The state of each decoder cell in each time-step. This is a list with
            length len(decoder_inputs) -- one item for each time-step. Each item is a
            2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = rnn_cell._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])
        inps = []

        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.pack(decoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally

            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell._linear([inp, attns], cell.output_size, True)

            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)

            # Run the attention mechanism.
            output = attention(new_state)
            outputs.append(output)

    return outputs, states, inps
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
    if use_attention:
        print('Use the attention RNN model')
        if num_heads < 1:
            raise ValueError("With less than 1 heads, use a non-attention decoder.")

        with variable_scope.variable_scope(scope or "attention_RNN"):
            output_size = encoder_outputs[0].get_shape()[1].value
            top_states = [array_ops.reshape(e, [-1, 1, output_size]) for e in encoder_outputs]
            attention_states = array_ops.concat(1, top_states)
            if not attention_states.get_shape()[1:2].is_fully_defined():
                raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                                 % attention_states.get_shape())

            batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
            attn_length = attention_states.get_shape()[1].value
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of query vectors for attention.
            for a in xrange(num_heads):
                k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

            def attention(query):
                """Put attention masks on hidden using hidden_features and query."""
                attn_weights = []
                ds = []  # Results of attention reads will be stored here.
                for i in xrange(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % i):
                        y = rnn_cell._linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
                        a = nn_ops.softmax(s)
                        attn_weights.append(a)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                return attn_weights, ds

            batch_attn_size = array_ops.pack([batch_size, attn_size])
            attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
            for a in attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])

            # loop through the encoder_outputs
            attention_encoder_outputs = list()
            sequence_attention_weights = list()
            for i in xrange(len(encoder_outputs)):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                if i == 0:
                    with variable_scope.variable_scope("Initial_Decoder_Attention"):
                        initial_state = rnn_cell._linear(encoder_state, output_size, True)
                    attn_weights, ds = attention(initial_state)
                else:
                    attn_weights, ds = attention(encoder_outputs[i])
                # NOTE: here we temporarily assume num_head = 1
                output = array_ops.concat(1, [ds[0], encoder_outputs[i]])
                with variable_scope.variable_scope("AttnRnnOutputProjection"):
                    logit = rnn_cell._linear(output, num_decoder_symbols, True)
                attention_encoder_outputs.append(logit)  # NOTE: here we temporarily assume num_head = 1
                sequence_attention_weights.append(attn_weights[0])  # NOTE: here we temporarily assume num_head = 1
    else:
        print('Use the NON attention RNN model')
        with variable_scope.variable_scope(scope or "non-attention_RNN"):
            attention_encoder_outputs = list()
            sequence_attention_weights = list()

            # copy over logits once out of sequence_length
            if encoder_outputs[0].get_shape().ndims != 1:
                (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
            else:
                fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]

            if fixed_batch_size.value:
                batch_size = fixed_batch_size.value
            else:
                batch_size = array_ops.shape(encoder_outputs[0])[0]

            if sequence_length is not None:
                sequence_length = math_ops.to_int32(sequence_length)

            if sequence_length is not None:  # Prepare variables
                zero_logit = array_ops.zeros(
                    array_ops.pack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
                zero_logit.set_shape(
                    tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
                min_sequence_length = math_ops.reduce_min(sequence_length)
                max_sequence_length = math_ops.reduce_max(sequence_length)

            for time, input_ in enumerate(encoder_outputs):
                if time > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                # pylint: disable=cell-var-from-loop
                # call_cell = lambda: cell(input_, state)
                generate_logit = lambda: rnn_cell._linear(encoder_outputs[time], num_decoder_symbols, True)
                # pylint: enable=cell-var-from-loop
                if sequence_length is not None:
                    logit = _step(
                        time, sequence_length, min_sequence_length, max_sequence_length,
                        zero_logit, generate_logit)
                else:
                    # The original assigned the lambda itself; it has to be called to get a tensor.
                    logit = generate_logit()
                attention_encoder_outputs.append(logit)

    return attention_encoder_outputs, sequence_attention_weights
def __call__(self, inputs, state, scope=None): """Most basic RNN: output = new_state = tanh(W * input + U * state + B).""" with vs.variable_scope(scope or type(self).__name__): # "BasicRNNCell" output = self._activation(rnn_cell._linear([inputs, state], self._num_units, True)) return output, output
def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
    # return attention_seq2seq(encoder_inputs, decoder_inputs, cell, self.input_size,
    #                          self.hidden_size, self.output_size, feed_previous=do_decode)
    # return basic_seq2seq(encoder_inputs, decoder_inputs, cell, self.input_size,
    #                      self.hidden_size, self.output_size)
    with variable_scope.variable_scope("my_seq2seq"):
        wrapper_cell = tf.nn.rnn_cell.InputProjectionWrapper(cell, self.hidden_size, self.input_size)
        encoder_outputs, enc_state = rnn.rnn(wrapper_cell, encoder_inputs, dtype=dtypes.float32)

        if do_decode:
            def simple_loop_function(prev, _):
                _next = tf.greater_equal(prev, 0.5)
                _next = tf.to_float(_next)
                return _next
            loop_function = simple_loop_function
        else:
            loop_function = None

        #################
        # ATTENTION DECODER
        #################
        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [array_ops.reshape(e, [-1, 1, wrapper_cell.output_size]) for e in encoder_outputs]
        attention_states = array_ops.concat(1, top_states)
        # return tf.nn.seq2seq.attention_decoder(decoder_inputs, enc_state, attention_states,
        #                                        wrapper_cell, output_size=self.output_size,
        #                                        loop_function=loop_function)

        initial_state = enc_state
        output_size = self.output_size
        num_heads = 1
        dtype = dtypes.float32
        scope = None
        initial_state_attention = False

        if not decoder_inputs:
            raise ValueError("Must provide at least 1 input to attention decoder.")
        if num_heads < 1:
            raise ValueError("With less than 1 heads, use a non-attention decoder.")
        if not attention_states.get_shape()[1:2].is_fully_defined():
            raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                             % attention_states.get_shape())
        if output_size is None:
            output_size = wrapper_cell.output_size

        with variable_scope.variable_scope(scope or "attention_decoder") as scope:
            # dtype = scope.dtype
            batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
            attn_length = attention_states.get_shape()[1].value
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of query vectors for attention.
            for a in xrange(num_heads):
                k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

            state = initial_state

            def attention(query):
                """Put attention masks on hidden using hidden_features and query."""
                ds = []  # Results of attention reads will be stored here.
                # if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                #     query_list = nest.flatten(query)
                #     for q in query_list:  # Check that ndims == 2 if specified.
                #         ndims = q.get_shape().ndims
                #         if ndims:
                #             assert ndims == 2
                #     query = array_ops.concat(1, query_list)
                for a in xrange(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % a):
                        y = rnn_cell._linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        a = nn_ops.softmax(s)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                return ds

            outputs = []
            prev = None
            batch_attn_size = array_ops.pack([batch_size, attn_size])
            attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
            for a in attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])
            if initial_state_attention:
                attns = attention(initial_state)

            for i, inp in enumerate(decoder_inputs):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                # If loop_function is set, we use it instead of decoder_inputs.
                if loop_function is not None and prev is not None:
                    with variable_scope.variable_scope("loop_function", reuse=True):
                        inp = loop_function(prev, i)
                # Merge input and previous attentions into one vector of the right size.
                input_size = inp.get_shape().with_rank(2)[1]
                if input_size.value is None:
                    raise ValueError("Could not infer input size from input: %s" % inp.name)
                x = rnn_cell._linear([inp] + attns, input_size, True)
                # Run the RNN.
                cell_output, state = wrapper_cell(x, state)
                # Run the attention mechanism.
                if i == 0 and initial_state_attention:
                    with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
                        attns = attention(state)
                else:
                    attns = attention(state)

                with variable_scope.variable_scope("AttnOutputProjection"):
                    output = rnn_cell._linear([cell_output] + attns, output_size, True)
                    output = tf.nn.sigmoid(output)
                if loop_function is not None:
                    prev = output
                outputs.append(output)

    return outputs, state
def __call__(self, inputs, state, scope=None): """Run one step of LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: if `state_is_tuple` is False, this must be a state Tensor, `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: VariableScope for the created subgraph; defaults to "LSTMCell". Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ num_proj = self._num_units if self._num_proj is None else self._num_proj if self._state_is_tuple: (c_prev, m_prev) = state else: c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units]) m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj]) input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from inputs.get_shape()[-1]") with vs.variable_scope(scope or type(self).__name__, initializer=self._initializer): # "LSTMCell" s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32) s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32) b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32) b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32) # s1 = tf.Variable(tf.ones([4 * self._num_units]), name="s1") # s2 = tf.Variable(tf.ones([4 * self._num_units]), name="s2") # s3 = tf.Variable(tf.ones([self._num_units]), name="s3") # # b1 = tf.Variable(tf.zeros([4 * self._num_units]), name="b1") # b2 = tf.Variable(tf.zeros([4 * self._num_units]), name="b2") # b3 = tf.Variable(tf.zeros([self._num_units]), name="b3") input_below_ = rnn_cell._linear([inputs], 4 * self._num_units, False, scope="out_1") input_below_ = ln(input_below_, s1, b1) state_below_ = rnn_cell._linear([m_prev], 4 * self._num_units, False, scope="out_2") state_below_ = ln(state_below_, s2, b2) lstm_matrix = tf.add(input_below_, state_below_) i, j, f, o = array_ops.split(1, 4, lstm_matrix) c = (sigmoid(f) * c_prev + sigmoid(i) * self._activation(j)) # Currently normalizing c causes lot of nan's in the model, thus commenting it out for now. # c_ = ln(c, s3, b3) c_ = c m = sigmoid(o) * self._activation(c_) new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else array_ops.concat(1, [c, m])) return m, new_state