def __call__(self, inputs, state, name=None):
    with tf.variable_scope(tf.get_variable_scope()):
        batch_size = self.h.get_shape().as_list()[0]
        seq_len = tf.shape(self.h)[1]
        emb_size = self.h.get_shape().as_list()[2]
        flat_h = tf.reshape(self.h, [-1, emb_size])  # flatten h
        tile_state = tf.tile(state, [seq_len, 1])
        # The two linear layers must live in separate scopes so that each
        # gets its own kernel variable.
        with tf.variable_scope("val"):
            val = self._activation(
                _linear([tile_state, flat_h], self.state_size, False))
        with tf.variable_scope("s"):
            s = _linear([val], 1, False)
        s = tf.reshape(s, [batch_size, -1])  # [batch_size, seq_len]
        a = tf.nn.softmax(s, 1)  # [batch_size, seq_len]
        a = tf.reshape(a, [-1])
        flat_h = tf.transpose(flat_h)
        cont = flat_h * a
        cont = tf.transpose(cont)
        cont = tf.reshape(cont, [batch_size, -1, emb_size])
        cont = tf.reduce_sum(cont, 1)  # [batch_size, emb_size]
        new_inputs = tf.concat([inputs, cont], 1)
        return self._cell(new_inputs, state)
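# Nearly every snippet in this file calls TensorFlow 1.x's private helper
# rnn_cell_impl._linear, which was removed in later releases. The sketch
# below is a minimal stand-in with the same semantics (concatenate the args
# along the feature axis, apply a single dense map); it mirrors the private
# helper's variable names, but it is an assumption, not the original code.
import tensorflow as tf

def _linear_compat(args, output_size, bias, bias_initializer=None,
                   kernel_initializer=None):
    if not isinstance(args, (list, tuple)):
        args = [args]
    total_size = sum(a.get_shape().as_list()[1] for a in args)
    kernel = tf.get_variable("kernel", [total_size, output_size],
                             initializer=kernel_initializer)
    x = tf.concat(args, axis=1) if len(args) > 1 else args[0]
    res = tf.matmul(x, kernel)
    if bias:
        b = tf.get_variable(
            "bias", [output_size],
            initializer=bias_initializer or tf.zeros_initializer())
        res = tf.nn.bias_add(res, b)
    return res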
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    # inputs arrives packed as [real_inputs, m, rt], where rt has length
    # self._num_units; the previous state is scaled by rt, and the GRU
    # itself consumes only the first two parts.
    total_length = inputs.get_shape().as_list()[1]
    inputs_ = inputs[:, 0:total_length - self._num_units]
    rth = inputs[:, total_length - self._num_units:]
    inputs = inputs_
    state = math_ops.multiply(rth, state)
    with vs.variable_scope("gates"):
        # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = math_ops.sigmoid(
            _linear([inputs, state], 2 * self._num_units, True, bias_ones,
                    self._kernel_initializer))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    with vs.variable_scope("candidate"):
        c = self._activation(
            _linear([inputs, r * state], self._num_units, True,
                    self._bias_initializer, self._kernel_initializer))
    new_h = u * state + (1 - u) * c
    return new_h, new_h
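# The cell above expects `inputs` to arrive packed as [real_inputs, m, r_t],
# with r_t occupying the trailing self._num_units columns. A hedged sketch of
# how a caller might assemble that packed tensor (the names x_t and decay_t
# are illustrative, not from the source):
import tensorflow as tf

num_units = 64  # must match the cell's num_units (assumed)
x_t = tf.placeholder(tf.float32, [None, 40])             # real inputs + mask m
decay_t = tf.placeholder(tf.float32, [None, num_units])  # r_t, one per unit
packed_inputs = tf.concat([x_t, decay_t], axis=1)        # what call() slices apart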
def call(self, inputs=None, state=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    a_, c_ = tf_ops.get_max_pooling(self._num_units, self.contexts, inputs,
                                    state)
    with vs.variable_scope("sigmoid_gate"):
        if self.concat_context:
            # The concat axis was missing; features live on axis 1.
            inputs = tf.concat([inputs, c_], axis=1)
            g_ = tf.nn.sigmoid(_linear([inputs], self._num_units * 2, False))
        else:
            inputs = c_
            g_ = tf.nn.sigmoid(_linear([inputs], self._num_units, False))
        inputs = tf.multiply(inputs, g_)
    # We start with bias of 1.0 to not reset and not update.
    bias_ones = self._bias_initializer
    if self._bias_initializer is None:
        dtype = [a.dtype for a in [inputs, state]][0]
        bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
    value = math_ops.sigmoid(
        _linear([inputs, state], 2 * self._num_units, True, bias_ones,
                self._kernel_initializer))
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    with vs.variable_scope("candidate"):
        c = self._activation(
            _linear([inputs, r * state], self._num_units, True,
                    self._bias_initializer, self._kernel_initializer))
    new_h = u * state + (1 - u) * c
    # Compare strings with ==, not identity.
    if self.result_type == 'pred':
        outputs = a_
    else:
        outputs = new_h
    return outputs, new_h
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    with vs.variable_scope("gates"):
        # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = inputs.dtype
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        # pylint: disable=protected-access
        value = math_ops.sigmoid(
            rnn_cell_impl._linear([inputs, state], 2 * self._num_units, True,
                                  bias_ones, self._kernel_initializer))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        # pylint: enable=protected-access
    with vs.variable_scope("candidate"):
        # pylint: disable=protected-access
        with vs.variable_scope("input_projection"):
            hi = rnn_cell_impl._linear(inputs, self._num_units, True,
                                       self._bias_initializer,
                                       self._kernel_initializer)
        with vs.variable_scope("hidden_projection"):
            hh = r * (rnn_cell_impl._linear(state, self._num_units, True,
                                            self._bias_initializer,
                                            self._kernel_initializer))
        # pylint: enable=protected-access
        c = self._activation(hi + hh)
    new_h = u * state + (1 - u) * c
    return new_h, new_h
def call(self, inputs, state):
    with vs.variable_scope("gates"):
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = _linear([inputs, state], 2 * self._hidden_size, True,
                        bias_ones, aux.rum_ortho_initializer())
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        u = sigmoid(u)
        if self._use_layer_norm:
            concat = tf.concat([r, u], 1)
            concat = aux.layer_norm_all(concat, 2, self._hidden_size, "LN_r_u")
            r, u = tf.split(concat, 2, 1)
    with vs.variable_scope("candidate"):
        x_emb = _linear(inputs, self._hidden_size, True,
                        self._bias_initializer, self._kernel_initializer)
        state_new = rotate(x_emb, r, state)
        if self._use_layer_norm:
            c = self._activation(aux.layer_norm(x_emb + state_new, "LN_c"))
        else:
            c = self._activation(x_emb + state_new)
    new_h = u * state + (1 - u) * c
    if self._T_norm is not None:
        new_h = tf.nn.l2_normalize(new_h, 1, epsilon=self._eps) * self._T_norm
    if self._use_zoneout:
        new_h = aux.rum_zoneout(new_h, state, self._zoneout_keep_h,
                                self._is_training)
    return new_h, new_h
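# Like any RNNCell subclass, the rotational-memory cell above can be driven
# by tf.nn.dynamic_rnn. A minimal sketch, assuming the surrounding class is
# an RNNCell named RUMCell with a RUMCell(hidden_size) constructor (the real
# constructor is not shown in this file):
import tensorflow as tf

batch, steps, feat, hidden = 32, 20, 50, 128
x = tf.placeholder(tf.float32, [batch, steps, feat])
cell = RUMCell(hidden)  # hypothetical constructor
outputs, final_h = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
# outputs: [batch, steps, hidden]; final_h: [batch, hidden]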
def hyper_bias(self, layer, hyper_output, embedding_size, num_units,
               scope="hyper"):
    with tf.variable_scope(scope):
        with tf.variable_scope('zb'):
            zb = _linear(hyper_output, embedding_size, False)
        with tf.variable_scope('beta'):
            beta = _linear(zb, num_units, False)
    return layer + beta
def __call__(self, inputs, state, scope=None):
    """Run one step of LRU.

    Args:
        inputs: input Tensor, 2-D, `[batch x num_units]`.
        state: a state Tensor, 2-D, `[batch x state_size]`.

    Returns:
        A tuple containing:
        - A 2-D `[batch x num_units]` Tensor representing the output of the
          LRU after reading `inputs` when the previous state was `state`.
        - A 2-D `[batch x num_units]` Tensor representing the new state of
          the LRU, with the same type and shape(s) as `state`.

    Raises:
        ValueError:
        - If the input size cannot be inferred from `inputs` via static
          shape inference.
        - If `state` is not 2-D.
    """
    if inputs.get_shape()[1] != self._num_units:
        with tf.variable_scope("input_transformation"):
            W = tf.get_variable("kernel",
                                [inputs.get_shape()[1], self._num_units],
                                initializer=self._kernel_initializer)
            inputs = tf.matmul(inputs, W)

    # r_1, r_2, z_1 and z_2 update & reset gates (resp. eq. 11, 12, 15 & 16).
    with tf.variable_scope("gates"):
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = math_ops.sigmoid(
            _linear([inputs, state], 4 * self._num_units, True, bias_ones,
                    self._kernel_initializer))
        r1, r2, z1, z2 = array_ops.split(value=value, num_or_size_splits=4,
                                         axis=1)

    # h1_hat
    with tf.variable_scope("projected_state1"):
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        h1_hat = tf.tanh(
            _linear([inputs, r2 * state], self._num_units, True, bias_ones,
                    self._kernel_initializer))

    # h2_hat
    with tf.variable_scope("projected_state2"):
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        h2_hat = tf.tanh(
            _linear([r1 * inputs, state], self._num_units, True, bias_ones,
                    self._kernel_initializer))

    h1_prime = z1 * h2_hat + (1 - z1) * inputs
    h2_prime = z2 * h1_hat + (1 - z2) * state
    return h1_prime, h2_prime
def encode(input_embeds):
    '''Encoder'''
    with tf.variable_scope("h"):
        # Integer division keeps the layer width an int under Python 3.
        h = tf.nn.tanh(_linear(input_embeds, hp.M * hp.K // 2, True))
    with tf.variable_scope("logits"):
        logits = _linear(h, hp.M * hp.K, True)
        logits = tf.log(tf.nn.softplus(logits) + 1e-8)
    logits = tf.reshape(logits, [-1, hp.M, hp.K], name="logits")
    return logits
def _encode(self, input_matrix, word_ids, embed_size):
    input_embeds = tf.nn.embedding_lookup(input_matrix, word_ids,
                                          name="input_embeds")
    with tf.variable_scope("h"):
        # Integer division keeps the layer width an int under Python 3.
        h = tf.nn.tanh(_linear(input_embeds, self.M * self.K // 2, True))
    with tf.variable_scope("logits"):
        # M and K were unqualified here; they live on the instance.
        logits = _linear(h, self.M * self.K, True)
        logits = tf.log(tf.nn.softplus(logits) + 1e-8)
    logits = tf.reshape(logits, [-1, self.M, self.K], name="logits")
    return input_embeds, logits
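# For concreteness, the tensors above flow as follows under the illustrative
# (not source-given) setting self.M = 32 codebooks, self.K = 16 centroids,
# embed_size = 300:
#   input_embeds : [batch, 300]
#   h            : [batch, 32 * 16 // 2] = [batch, 256]
#   logits       : [batch, 32, 16], one K-way distribution per codebook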
def __call__(self, inputs, state, scope=None):
    gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state,
                                                           scope)
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn2"):
            gamma_h = tanh(rnn_cell_impl._linear(gru_out, self._num_units,
                                                 False))
            weights = tf.reduce_sum(self.phi_hs * gamma_h,
                                    reduction_indices=2, keep_dims=True)
            weights = tf.exp(weights - tf.reduce_max(weights,
                                                     reduction_indices=0,
                                                     keep_dims=True))
            weights = weights / (1e-6 + tf.reduce_sum(weights,
                                                      reduction_indices=0,
                                                      keep_dims=True))
            context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
        with vs.variable_scope("AttnConcat"):
            out = tf.nn.relu(rnn_cell_impl._linear([context, gru_out],
                                                   self._num_units, False))
        self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
        return (out, out)
def call(self, inputs, state):
    # Extract the associative memory and the state.
    size_batch = tf.shape(state)[0]
    assoc_mem, state = tf.split(
        state, [self._hidden_size * self._hidden_size, self._hidden_size], 1)
    assoc_mem = tf.reshape(
        assoc_mem, [size_batch, self._hidden_size, self._hidden_size])
    with vs.variable_scope("gates"):
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = _linear([inputs, state], 2 * self._hidden_size, True,
                        bias_ones, aux.rum_ortho_initializer())
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        u = sigmoid(u)
        if self._use_layer_norm:
            concat = tf.concat([r, u], 1)
            concat = aux.layer_norm_all(concat, 2, self._hidden_size, "LN_r_u")
            r, u = tf.split(concat, 2, 1)
    with vs.variable_scope("candidate"):
        x_emb = _linear(inputs, self._hidden_size, True,
                        self._bias_initializer, self._kernel_initializer)
        tmp_rotation = rotation_operator(x_emb, r, self._hidden_size)
        Rt = tf.matmul(assoc_mem, tmp_rotation)
        state_new = tf.reshape(
            tf.matmul(Rt, tf.reshape(state, [size_batch, self._hidden_size, 1])),
            [size_batch, self._hidden_size])
        if self._use_layer_norm:
            c = self._activation(aux.layer_norm(x_emb + state_new, "LN_c"))
        else:
            c = self._activation(x_emb + state_new)
    new_h = u * state + (1 - u) * c
    if self._T_norm is not None:
        new_h = tf.nn.l2_normalize(new_h, 1, epsilon=self._eps) * self._T_norm
    if self._use_zoneout:
        new_h = aux.rum_zoneout(new_h, state, self._zoneout_keep_h,
                                self._is_training)
    Rt = tf.reshape(Rt, [size_batch, self._hidden_size * self._hidden_size])
    new_state = tf.concat([Rt, new_h], 1)
    return new_h, new_state
def call(self, inputs, state):
    """Long short-term memory cell (LSTM).

    Args:
        inputs: `2-D` tensor with shape `[batch_size x input_size]`.
        state: An `LSTMStateTuple` of state tensors, each shaped
            `[batch_size x self.state_size]`, if `state_is_tuple` has been
            set to `True`. Otherwise, a `Tensor` shaped
            `[batch_size x 2 * self.state_size]`.

    Returns:
        A pair containing the new hidden state, and the new state (either an
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).
    """
    sigmoid = math_ops.sigmoid
    # Parameters of gates are concatenated into one multiply for efficiency.
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
    concat = _linear([inputs, h], 4 * self._num_units, True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

    new_c = (c * sigmoid(f + self._forget_bias) +
             sigmoid(i) * self._activation(j))
    new_h = self._activation(new_c) * sigmoid(o)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
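# Because call() consumes a single timestep, unrolling by hand makes the
# state plumbing explicit. A sketch assuming TF 1.x and state_is_tuple=True;
# BasicLSTMCell implements the same update equations as the cell above:
import tensorflow as tf

num_units, batch, steps = 128, 16, 5
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
x = tf.placeholder(tf.float32, [steps, batch, 32])
state = cell.zero_state(batch, tf.float32)  # LSTMStateTuple(c, h)
outputs = []
with tf.variable_scope("manual_unroll"):
    for t in range(steps):
        h, state = cell(x[t], state)  # the cell reuses its kernel after step 0
        outputs.append(h)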
def call(self, input, state):
    """Cached LSTM.

    input: [N, dw]
    state: [N, k*d]
    """
    sigmoid = tf.nn.sigmoid
    if self._state_is_tuple:
        state, c = state
    N = tf.shape(state)[0]
    k = self._num_groups
    d = self._num_units // k  # integer division: group width must be an int
    state = list(tf.split(state, k, axis=1))  # k tensors of shape [N, d]
    ro = rnn_cell_impl._linear([input] + state, 3 * k * d, True,
                               self._bias_initializer,
                               self._kernel_initializer)  # [N, 3*k*d]
    # Name the candidate `g` so the cached cell state `c` from the state
    # tuple is not shadowed by the split.
    r, o, g = tf.split(ro, 3, axis=1)  # each [N, k*d]
    r = sigmoid(r)  # [N, k*d]
    # Push each of the k groups toward its own filtering rate in [0, 1).
    r = tf.add(tf.divide(tf.reshape(r, [N, k, d]), k),
               tf.tile(tf.expand_dims(tf.expand_dims(
                   tf.range(0, 1, delta=1.0 / float(k), dtype=tf.float32),
                   0), 2), [N, 1, d]))
    # r = tf.add(tf.divide(tf.reshape(r, [N, k, d]), k),
    #            tf.range(0, 1, delta=1.0/float(k), dtype=tf.float32))
    r = tf.reshape(r, [N, k * d])
    o = sigmoid(o)
    c_ = self._activation(g)
    new_c = (1 - r) * c + r * c_
    new_state = self._activation(new_c) * o
    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_state, new_c)
    return new_state, new_state
def call(self, input, state):
    d = self._num_units
    sigmoid = tf.nn.sigmoid
    if self._state_is_tuple:
        c_tm1, h_tm1 = state
    with tf.variable_scope('input'):
        input_ = rnn_cell_impl._linear(input, d, False,
                                       self._bias_initializer,
                                       self._kernel_initializer)
    with tf.variable_scope('fr'):
        fr = rnn_cell_impl._linear(input_, 2 * d, True,
                                   self._bias_initializer,
                                   self._kernel_initializer)
        fr = sigmoid(fr)
        f, r = tf.split(fr, 2, axis=1)  # [N, d]
    # Use the projected input so both updates are d-dimensional; the
    # previously undefined `r_t` is the highway gate `r`.
    c_t = f * c_tm1 + (1 - f) * input_
    h_t = r * self._activation(c_t) + (1 - r) * input_
    if self._state_is_tuple:
        new_state = LSTMStateTuple(c_t, h_t)
    return h_t, new_state
def attention(query, use_attention=False):
    """Put attention masks on hidden using hidden_features and query."""
    attn_weights = []
    ds = []  # Results of attention reads will be stored here.
    for i in range(num_heads):
        with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell_impl._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            if use_attention is False:  # apply mean pooling
                # tf.pack was renamed tf.stack in TF 1.0.
                weights = tf.tile(sequence_length, tf.stack([attn_length]))
                weights = array_ops.reshape(weights, tf.shape(s))
                a = array_ops.ones(
                    tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
                # a = array_ops.ones(tf.shape(s), dtype=dtype) \
                #     / math_ops.to_float(tf.shape(s)[1])
            else:
                a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
    return attn_weights, ds
def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "gru_cell", reuse=self._reuse):
        with tf.variable_scope("Gates"):
            # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            concated = _linear([inputs, state], 2 * self._num_units, True,
                               init_ops.constant_initializer(1.0,
                                                             dtype=tf.float32))
            r, u = array_ops.split(concated, 2, 1)
            r, u = tf.sigmoid(r), tf.sigmoid(u)
        with tf.variable_scope("Candidate"):
            c = self._activation(
                _linear([inputs, r * state], self._num_units, True))
        new_h = u * state + (1 - u) * c
    return new_h, new_h
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM) over two input cells.

    @param inputs: (batch, n)
    @param state: the cell states and hidden units of the two cells
    """
    with tf.variable_scope(scope or type(self).__name__):
        c1, c2, h1, h2 = state
        # Change bias argument to False since LN will add bias via shift.
        concat = _linear([inputs, h1, h2], 5 * self._num_units, False)
        i, j, f1, f2, o = tf.split(value=concat, num_or_size_splits=5, axis=1)

        # Add layer normalization to each gate.
        i = ln(i, scope='i/')
        j = ln(j, scope='j/')
        f1 = ln(f1, scope='f1/')
        f2 = ln(f2, scope='f2/')
        o = ln(o, scope='o/')

        new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) +
                 c2 * tf.nn.sigmoid(f2 + self._forget_bias) +
                 tf.nn.sigmoid(i) * self._activation(j))

        # Add layer normalization to the calculation of the new hidden state.
        new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(o)
        new_state = LSTMStateTuple(new_c, new_h)
        return new_h, new_state
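# The ln() helper used above is not defined in this file. A plausible minimal
# implementation, consistent with how it is called (a 2-D tensor plus a scope
# prefix such as 'i/'), is sketched here under the assumption that it
# layer-normalizes over the feature axis with a learned gain and shift:
import tensorflow as tf

def ln(tensor, scope=None, epsilon=1e-5):
    # Layer-normalize a [batch, units] tensor.
    mean, var = tf.nn.moments(tensor, [1], keep_dims=True)
    with tf.variable_scope((scope or '') + 'layer_norm'):
        units = tensor.get_shape().as_list()[1]
        gain = tf.get_variable('gain', [units],
                               initializer=tf.ones_initializer())
        shift = tf.get_variable('shift', [units],
                                initializer=tf.zeros_initializer())
    return gain * (tensor - mean) / tf.sqrt(var + epsilon) + shift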
def downscale(self, inp, mask):
    """Reshape `inp` and `mask` to build the pyramid-structure RNN,
    halving the time dimension by merging adjacent steps.

    :param inp: shape (T, batch_size, dim)
    :param mask: shape (T, batch_size)
    :return: out: downscaled inputs; mask: downscaled mask
    """
    with vs.variable_scope("Downscale"):
        inshape = tf.shape(inp)
        T, batch_size, dim = inshape[0], inshape[1], inshape[2]
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]),
                           [-1, 2 * self.size])  # pair adjacent steps: 2*size
        out2d = rnn_cell_impl._linear(inp2d, self.size, True)  # back to size
        out3d = tf.reshape(out2d,
                           tf.stack((batch_size, tf.to_int32(T / 2), dim)))
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out3d.set_shape([None, None, self.size])
        out = tanh(out3d)

        mask = tf.transpose(mask)
        mask = tf.reshape(mask, [-1, 2])
        mask = tf.cast(mask, tf.bool)  # cast to tf.bool
        mask = tf.reduce_any(mask, reduction_indices=1)
        mask = tf.to_int32(mask)
        mask = tf.reshape(mask, tf.stack([batch_size, -1]))
        mask = tf.transpose(mask)
    return out, mask
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           wd=0.0, input_keep_prob=1.0, is_train=None):
    with K.tf.variable_scope(scope or "linear"):
        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")
        if not nest.is_sequence(args):
            args = [args]
        flat_args = [flatten(arg, 1) for arg in args]
        # if input_keep_prob < 1.0:
        #     assert is_train is not None
        #     flat_args = [K.tf.cond(is_train,
        #                            lambda: K.tf.nn.dropout(arg, input_keep_prob),
        #                            lambda: arg)
        #                  for arg in flat_args]
        flat_args = [K.tf.nn.dropout(arg, input_keep_prob)
                     for arg in flat_args]
        flat_out = _linear(flat_args, output_size, bias)
        out = reconstruct(flat_out, args[0], 1)
        if squeeze:
            out = K.tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
        # if wd:
        #     add_wd(wd)
        return out
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           wd=0.0, input_keep_prob=1.0, is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]
    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    with tf.variable_scope(scope or 'Linear'):
        flat_out = _linear(
            flat_args, output_size, bias,
            bias_initializer=tf.constant_initializer(bias_start))
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_wd(wd)
    return out
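# A hedged usage sketch for the wrapper above: project a [batch, seq, 200]
# tensor down to 100 features with train-time dropout. The flatten/
# reconstruct helpers come from the surrounding codebase and are assumed to
# collapse and restore the leading dimensions.
import tensorflow as tf

is_train = tf.placeholder(tf.bool, [])
h = tf.placeholder(tf.float32, [None, 30, 200])
proj = linear([h], 100, bias=True, scope="proj",
              input_keep_prob=0.8, is_train=is_train)  # [batch, 30, 100]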
def call(self, inputs, state):
    # inputs: [batch_size, in_size]
    # state: [batch_size, output_size OR state_size]
    with tf.variable_scope("attention"):
        with tf.variable_scope("main_input"):
            # [batch_size, 1, att_size]
            att_main_in = tf.expand_dims(
                tf.einsum('ij,jk->ik', inputs, self._WP), axis=1)
        with tf.variable_scope("s"):
            # [batch_size, num_match_elems, att_size]
            raw_in = tf.add(att_main_in, self._att_match_input)
            # [batch_size, num_match_elems]
            s = tf.einsum('ijk,k->ij', tf.nn.tanh(raw_in), self._v)
        # [batch_size, num_match_elems]
        a = tf.nn.softmax(s, dim=1)
        # [batch_size, match_size]
        c = tf.reduce_sum(
            tf.multiply(tf.expand_dims(a, axis=2), self._match_input), axis=1)
        raw_rnn_inputs = tf.concat([inputs, c], axis=1)
    with tf.variable_scope("pre_input_gate"):
        rnn_input_size = int(raw_rnn_inputs.get_shape()[1])
        rnn_input_gate = tf.sigmoid(
            _linear([raw_rnn_inputs], rnn_input_size, False))
        rnn_inputs = tf.multiply(raw_rnn_inputs, rnn_input_gate)
    new_h, new_h = self._base_cell.call(inputs=rnn_inputs, state=state)
    return new_h, new_h
def __call__(self, inputs, state, scope=None):
    sigmoid = math_ops.sigmoid
    tanh = math_ops.tanh
    with tf.variable_scope(scope or type(self).__name__):
        with tf.variable_scope("r"):
            r = sigmoid(_linear([inputs, state], self._num_units, True))
        with tf.variable_scope("z"):
            z = sigmoid(_linear([inputs, state], self._num_units, True))
        with tf.variable_scope("h_tilde"):
            h_tilde = tanh(
                _linear([inputs, r * state], self._num_units, True))
        new_h = (z * state) + ((1 - z) * h_tilde)
    return new_h, new_h
def graph(embedding_npy, M, K):
    vocab_size = embedding_npy.shape[0]
    emb_size = embedding_npy.shape[1]
    num_centroids = 2**K

    tau = tf.placeholder_with_default(np.array(1.0, dtype='float32'),
                                      tuple()) - 0.1
    embedding = tf.constant(embedding_npy, name="embedding")
    word_input = tf.placeholder_with_default(
        np.array([3, 4, 5], dtype="int32"), shape=[None], name="word_input")
    word_lookup = tf.nn.embedding_lookup(embedding, word_input,
                                         name="word_lookup")
    A = tf.get_variable("codebook", [M * num_centroids, emb_size])

    with tf.variable_scope("h"):
        # Integer division keeps the layer width an int under Python 3.
        h = tf.nn.tanh(_linear(word_lookup, M * num_centroids // 2, True))
    with tf.variable_scope("logits"):
        logits_lookup = _linear(h, M * num_centroids, True)
        logits_lookup = tf.log(tf.nn.softplus(logits_lookup) + 1e-8)
    logits_lookup = tf.reshape(logits_lookup, [-1, M, num_centroids],
                               name="logits_lookup")

    D = gumbel_softmax(logits_lookup, tau, hard=False)
    D_prime = tf.reshape(D, [-1, M * num_centroids])
    y = tf.matmul(D_prime, A)

    loss = 0.5 * tf.reduce_sum((y - word_lookup)**2, axis=1)
    loss = tf.reduce_mean(loss, name="loss")

    global_step = tf.Variable(0, name='global_step', trainable=False)
    learning_rate = tf.Variable(0.0, trainable=False, name='learning_rate')
    max_grad_norm = 0.001
    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    grads, global_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    global_norm = tf.identity(global_norm, name="global_norm")
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step,
                                         name="train_op")
    return word_input, tau, learning_rate, train_op, loss, global_norm, D
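# Driving graph() might look like the sketch below; the random embedding
# table, the learning-rate assignment, and the linear tau annealing schedule
# are illustrative assumptions, not taken from the source.
import numpy as np
import tensorflow as tf

emb = np.random.randn(10000, 300).astype('float32')
word_input, tau, lr, train_op, loss, gnorm, D = graph(emb, M=32, K=4)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(lr, 1e-3))  # learning_rate is a non-trainable Variable
    for step in range(1000):
        ids = np.random.randint(0, 10000, size=64)
        _, cur_loss = sess.run(
            [train_op, loss],
            {word_input: ids, tau: max(0.1, 1.0 - step / 1000.0)})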
def call(self, inputs, state):
    d, cont = state
    with tf.variable_scope(tf.get_variable_scope()):
        batch_size = self.h.get_shape().as_list()[0]
        seq_len = tf.shape(self.h)[1]
        emb_size = self.h.get_shape().as_list()[2]
        flat_h = tf.reshape(self.h, [-1, emb_size])  # flatten h
        tile_state = tf.tile(d, [seq_len, 1])
        # The two linear layers must live in separate scopes so that each
        # gets its own kernel variable.
        with tf.variable_scope("val"):
            val = self._activation(
                _linear([tile_state, flat_h], self.state_size[0], True))
        with tf.variable_scope("s"):
            s = _linear([val], 1, True,
                        bias_initializer=tf.constant_initializer(0))
        s = tf.reshape(s, [batch_size, -1])  # [batch_size, seq_len]
        a = tf.nn.softmax(s, 1)  # [batch_size, seq_len]
        a = tf.reshape(a, [-1])
        flat_h = tf.transpose(flat_h)
        cont = flat_h * a
        cont = tf.transpose(cont)
        cont = tf.reshape(cont, [batch_size, -1, emb_size])
        cont = tf.reduce_sum(cont, 1)  # [batch_size, emb_size]
        new_inputs = tf.concat([inputs, cont], 1)

        # u plays the role of z_t in the GRU equations.
        with tf.variable_scope("gates"):
            # Reset gate and update gate.
            bias_ones = self._bias_initializer
            if self._bias_initializer is None:
                dtype = [a.dtype for a in [new_inputs, d]][0]
                bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
            value = math_ops.sigmoid(
                _linear([new_inputs, d], 2 * self._num_units, True, bias_ones,
                        self._kernel_initializer))
            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        with tf.variable_scope("candidate"):
            c = self._activation(
                _linear([new_inputs, r * d], self._num_units, True,
                        self._bias_initializer, self._kernel_initializer))
        new_h = u * d + (1 - u) * c
        return tf.concat([new_h, cont], 1), tuple([new_h, cont])
def call(self, inputs, state):
    if Config.ATTN_TYPE == Config.attn_multimodal:
        # Attend using the previous LSTM state.
        lstm_state, prev_attn_softmax = state
        input_i = inputs[:, 0, :]
        input_a = inputs[:, 1, :]
        attn_dim = Config.NCELLS
        inputs_attended, cur_attn_softmax = self._attention_multimodal(
            input_i, input_a, lstm_state.h, attn_dim, self.FUSION_CONC)

        # LSTM
        lstm_output, lstm_state = self._cell(inputs_attended, lstm_state)
        lstm_output = (lstm_output, cur_attn_softmax)

        # Postprocess
        new_state = RNNInputStateHandler.get_state_tuple(
            {'lstm_state': lstm_state, 'attn_state': cur_attn_softmax},
            is_global_state=False)
    elif Config.ATTN_TYPE == Config.attn_temporal:
        # Attend using the previous attention state.
        state, attn_state, attn_state_hist = state
        input_size = inputs.get_shape().as_list()[1]  # [0] is batch size, [1] is feature size
        inputs_attended = rnn_cell_impl._linear(
            args=[inputs, attn_state], output_size=input_size, bias=True)

        # LSTM
        lstm_output, lstm_state = self._cell(inputs_attended, state)

        # Attention for the next timestep.
        # NOTE: this is [c, h] being used for _attention_temporal (not just h).
        new_state_cat = tf.concat(nest.flatten(lstm_state), 1)
        attn_state_hist = tf.reshape(
            attn_state_hist,
            [-1, Config.ATTN_TEMPORAL_WINDOW, Config.ATTN_STATE_NCELLS])
        new_attn_state, new_attn_state_hist = self._attention_temporal(
            new_state_cat, attn_state_hist)

        # Projection layer
        if self._project_output:
            with tf.variable_scope("attn_output_projection"):
                output = rnn_cell_impl._linear(
                    args=[lstm_output, new_attn_state],
                    output_size=Config.ATTN_STATE_NCELLS, bias=True)
        else:
            output = new_attn_state

        # Postprocess: concat the latest output onto new_attn_state_hist.
        new_attn_state_hist = tf.concat(
            [new_attn_state_hist, tf.expand_dims(output, 1)], 1)
        new_attn_state_hist = tf.reshape(
            new_attn_state_hist,
            [-1, Config.ATTN_TEMPORAL_WINDOW * Config.ATTN_STATE_NCELLS])
        new_state = RNNInputStateHandler.get_state_tuple(
            {'lstm_state': lstm_state, 'attn_state': new_attn_state,
             'attn_state_hist': new_attn_state_hist},
            is_global_state=False)
    else:
        raise ValueError('Invalid Config.ATTN_TYPE selected. Check Config.py!')

    return lstm_output, new_state
def __call__(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    with vs.variable_scope("gates"):
        # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = rnn_cell_impl._linear([inputs, state], 2 * self._num_units,
                                      True, bias_ones,
                                      self._kernel_initializer)
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        r, u = (layer_normalization(r, scope="r/"),
                layer_normalization(u, scope="u/"))
        r, u = math_ops.sigmoid(r), math_ops.sigmoid(u)
    with vs.variable_scope("candidate"):
        c = self._activation(
            rnn_cell_impl._linear([inputs, r * state], self._num_units, True,
                                  self._bias_initializer,
                                  self._kernel_initializer))
    new_h = u * state + (1 - u) * c
    return new_h, new_h
def __init__(self, num_units, encoder_output, scope=None):
    self.hs = encoder_output
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn1"):
            hs2d = tf.reshape(self.hs, [-1, num_units])
            phi_hs2d = tanh(rnn_cell_impl._linear(hs2d, num_units, False))
            self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
    super(GRUCellAttn, self).__init__(num_units)
def call(self, inputs, state):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    from tensorflow.python.ops.rnn_cell_impl import _linear
    output = self._activation(
        _linear([inputs, state],
                output_size=self._num_units,
                bias=self._bias_initializer is not None,
                kernel_initializer=self._kernel_initializer,
                bias_initializer=self._bias_initializer))
    return output, output
def attention(query):
    """Point on hidden using hidden_features and query."""
    with vs.variable_scope("Attention"):
        y = rnn_cell_impl._linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                [2, 3])
        return s
def call(self, inputs, state):
    """Conditional GRU operations.

    inputs: [batch_size, num_units]
    state: (h=[batch_size, num_units], c=[batch_size, num_units])
    output: [batch_size, num_units]
    new_state: (h=[batch_size, num_units], c=[batch_size, num_units])
    """
    h = state.h
    c = state.c
    bias_ones = self._bias_initializer
    if self._bias_initializer is None:
        bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
    with vs.variable_scope('gates'):
        val_concat = rnn_cell_impl._linear(
            [inputs, h, c], 2 * self._num_units, bias=False,
            bias_initializer=self._bias_initializer,
            kernel_initializer=self._kernel_initializer)
        val = math_ops.sigmoid(val_concat)
        r, z = array_ops.split(value=val, num_or_size_splits=2, axis=1)
    r_state = r * h
    with vs.variable_scope('candidate'):
        hbar_out = rnn_cell_impl._linear(
            [inputs, r_state, c], self._num_units, bias=False,
            bias_initializer=self._bias_initializer,
            kernel_initializer=self._kernel_initializer)
        hbar = self._activation(hbar_out)
    output = (1 - z) * h + z * hbar
    new_state = ConditionalGRUState(h=output, c=c)
    return output, new_state
def call(self, inputs, state):
    inputs, encoded_question = inputs
    i = state.i
    state = state.h
    with tf.variable_scope("gates"):
        # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            dtype = [a.dtype for a in [inputs, state]][0]
            bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
        value = math_ops.sigmoid(
            _linear([inputs, state], 2 * self._num_units, True, bias_ones,
                    self._kernel_initializer))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    with tf.variable_scope("candidate"):
        c = self._activation(
            _linear([inputs, r * state], self._num_units, True,
                    self._bias_initializer, self._kernel_initializer))
    new_h = u * state + (1 - u) * c
    self._history.append(new_h)

    with tf.variable_scope('attention'):
        question_dim = encoded_question.shape.as_list()
        hq = tf.tile(tf.expand_dims(encoded_question, 1),
                     [1, self._time_step, 1])
        padding = tf.constant(
            0.0, shape=(self._batch_size,
                        self._time_step - len(self._history),
                        self._num_units))
        gru_h = tf.transpose(tf.convert_to_tensor(self._history), [1, 0, 2])
        gru_h = tf.concat([gru_h, padding], axis=1)
        hq = tf.reshape(hq, [-1, question_dim[-1]])
        gru_h = tf.reshape(gru_h, [-1, self._num_units])
        with tf.variable_scope('inner'):
            att = tf.tanh(_linear([gru_h, hq], self._att_hidden, True,
                                  self._bias_initializer,
                                  self._kernel_initializer))
        with tf.variable_scope('outer'):
            att = _linear([att], 1, False, self._bias_initializer,
                          self._kernel_initializer)
        att = tf.reshape(att, [self._batch_size, self._time_step])
        # Mask out future timesteps before the softmax.
        att_mask = np.zeros([self._batch_size, self._time_step],
                            dtype=np.float32)
        att_mask[:, i:] = 10000.0
        att_mask = tf.convert_to_tensor(att_mask)
        att = tf.reshape(tf.nn.softmax(att - att_mask), [-1, 1])
        final_h = tf.reduce_sum(
            tf.reshape(tf.multiply(gru_h, att),
                       [self._batch_size, self._time_step, self._num_units]),
            axis=1)
    self._history[-1] = final_h
    return final_h, SXMState(h=final_h, i=i + 1)
def call(self, inputs, state):
    # inputs: [batch_size, in_size]
    # state: [batch_size, output_size OR state_size]
    with tf.variable_scope("attention"):
        with tf.variable_scope("main_input"):
            # [batch_size, 1, layer_size]
            att_main_in = tf.expand_dims(
                _linear([inputs], self._num_units, self._use_att_bias),
                axis=1)
        with tf.variable_scope("state_input"):
            # [batch_size, 1, layer_size]
            att_state_in = tf.expand_dims(
                _linear([state], self._num_units, False), axis=1)
        with tf.variable_scope("s"):
            att_vec = tf.get_variable('att_vec', [self._num_units])
            # [batch_size, num_match_elems, layer_size]
            if self._use_state_for_att:
                raw_in = tf.add(tf.add(att_main_in, att_state_in),
                                self._att_match_in)
            else:
                raw_in = tf.add(att_main_in, self._att_match_in)
            # [batch_size, num_match_elems]
            s = tf.einsum('ijk,k->ij', tf.nn.tanh(raw_in), att_vec)
        # [batch_size, num_match_elems]
        a = tf.nn.softmax(s, dim=1)
        # [batch_size, match_size]
        c = tf.reduce_sum(
            tf.multiply(tf.expand_dims(a, axis=2), self._match_input), axis=1)
        raw_rnn_inputs = tf.concat([inputs, c], axis=1)
    with tf.variable_scope("output_gate"):
        rnn_input_size = int(raw_rnn_inputs.get_shape()[1])
        rnn_input_gate = tf.sigmoid(
            _linear([raw_rnn_inputs], rnn_input_size, False))
        rnn_inputs = tf.multiply(raw_rnn_inputs, rnn_input_gate)
    new_h, new_h = self._base_cell.call(inputs=rnn_inputs, state=state)
    return new_h, new_h
def call(self, inputs, state):
    """Long short-term memory cell with attention (LSTMA)."""
    state, attns, attn_states = state
    attn_states = array_ops.reshape(
        attn_states, [-1, self._attn_length, self._attn_size])
    input_size = self._input_size
    if input_size is None:
        input_size = inputs.shape.as_list()[1]
    inputs = _linear([inputs, attns], input_size, True)
    lstm_output, new_state = self._cell(inputs, state)
    new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
    new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
    with tf.variable_scope("attn_output_projection"):
        output = _linear([lstm_output, new_attns], self._attn_size, True)
    new_attn_states = array_ops.concat(
        [new_attn_states, array_ops.expand_dims(output, 1)], 1)
    new_attn_states = array_ops.reshape(
        new_attn_states, [-1, self._attn_length * self._attn_size])
    new_state = (new_state, new_attns, new_attn_states)
    return output, new_state
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           keep_prob=None, is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("args must be specified")
    if not nest.is_sequence(args):
        args = [args]
    flat_args = [flatten(arg, 1) for arg in args]
    if keep_prob is not None and is_train is not None:
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    with tf.variable_scope(scope or 'linear'):
        flat_out = _linear(
            flat_args, output_size, bias,
            bias_initializer=tf.constant_initializer(bias_start))
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    return out
def setup_loss(self):
    with vs.variable_scope("Logistic"):
        doshape = tf.shape(self.decoder_output)
        T, batch_size = doshape[0], doshape[1]
        do2d = tf.reshape(self.decoder_output, [-1, self.size])
        logits2d = rnn_cell_impl._linear(do2d, self.vocab_size, False)
        outputs2d = tf.nn.log_softmax(logits2d)
        self.outputs = tf.reshape(outputs2d,
                                  tf.stack([T, batch_size, self.vocab_size]))

        targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
        masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
        # It is easier to pad the targets/mask than to split the decoder
        # input, since TensorFlow does not support negative indexing.
        labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
        mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
        losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits2d, labels=labels1d) * tf.to_float(mask1d)
        losses2d = tf.reshape(losses1d, tf.stack([T, batch_size]))
        self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
def downscale(self, inp, mask):
    with vs.variable_scope("Downscale"):
        inshape = tf.shape(inp)
        T, batch_size, dim = inshape[0], inshape[1], inshape[2]
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]),
                           [-1, 2 * self.size])
        out2d = rnn_cell_impl._linear(inp2d, self.size, False)
        out3d = tf.reshape(out2d,
                           tf.stack((batch_size, tf.to_int32(T / 2), dim)))
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out3d.set_shape([None, None, self.size])
        out = tanh(out3d)

        mask = tf.transpose(mask)
        mask = tf.reshape(mask, [-1, 2])
        mask = tf.cast(mask, tf.bool)
        mask = tf.reduce_any(mask, reduction_indices=1)
        mask = tf.to_int32(mask)
        mask = tf.reshape(mask, tf.stack([batch_size, -1]))
        mask = tf.transpose(mask)
    return out, mask
def _attention(self, query, attn_states):
    conv2d = nn_ops.conv2d
    reduce_sum = math_ops.reduce_sum
    softmax = nn_ops.softmax
    tanh = math_ops.tanh

    with tf.variable_scope("attention"):
        k = tf.get_variable("attn_w",
                            [1, 1, self._attn_size, self._attn_vec_size])
        v = tf.get_variable("attn_v", [self._attn_vec_size])
        hidden = array_ops.reshape(
            attn_states, [-1, self._attn_length, 1, self._attn_size])
        hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        y = _linear(query, self._attn_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, self._attn_vec_size])
        s = reduce_sum(v * tanh(hidden_features + y), [2, 3])
        a = softmax(s)
        d = reduce_sum(
            array_ops.reshape(a, [-1, self._attn_length, 1, 1]) * hidden,
            [1, 2])
        new_attns = array_ops.reshape(d, [-1, self._attn_size])
        new_attn_states = array_ops.slice(attn_states, [0, 1, 0],
                                          [-1, -1, -1])
        return new_attns, new_attn_states
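# Shape walkthrough for _attention, with L = attn_length, S = attn_size,
# V = attn_vec_size (annotation only; no new behavior):
#   attn_states     : [batch, L, S]     window of past attention states
#   hidden          : [batch, L, 1, S]  reshaped for the 1x1 convolution
#   hidden_features : [batch, L, 1, V]  conv2d with kernel [1, 1, S, V]
#   y               : [batch, 1, 1, V]  projected query, broadcast over L
#   s               : [batch, L]        scores v . tanh(features + y)
#   new_attns       : [batch, S]        softmax(s)-weighted sum over the window
# The final slice drops the oldest state, so the caller can append the new
# output and keep the window at length L.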
def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs, *states):
    batch_size = tf.shape(beam_probs)[0]
    inputs = tf.reshape(tf.slice(beam_seqs, [0, time], [batch_size, 1]),
                        [batch_size])
    decoder_input = embedding_ops.embedding_lookup(self.L_dec, inputs)
    decoder_output, state_output = self.decoder_graph(decoder_input, states)

    with vs.variable_scope("Logistic", reuse=True):
        do2d = tf.reshape(decoder_output, [-1, self.size])
        logits2d = rnn_cell_impl._linear(do2d, self.vocab_size, False)
        logprobs2d = tf.nn.log_softmax(logits2d)

    total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
    total_probs_noEOS = tf.concat(
        [tf.slice(total_probs, [0, 0], [batch_size, EOS_ID]),
         tf.tile([[-3e38]], [batch_size, 1]),
         tf.slice(total_probs, [0, EOS_ID + 1],
                  [batch_size, self.vocab_size - EOS_ID - 1])], 1)

    flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
    beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
    next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs, k=beam_k)
    next_bases = tf.floordiv(top_indices, self.vocab_size)
    next_mods = tf.mod(top_indices, self.vocab_size)

    next_states = [tf.gather(state, next_bases) for state in state_output]
    next_beam_seqs = tf.concat([tf.gather(beam_seqs, next_bases),
                                tf.reshape(next_mods, [-1, 1])], 1)

    cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
    beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
    new_cand_seqs = tf.concat([cand_seqs_pad, beam_seqs_EOS], 0)
    EOS_probs = tf.slice(total_probs, [0, EOS_ID], [batch_size, 1])
    new_cand_probs = tf.concat([cand_probs, tf.reshape(EOS_probs, [-1])], 0)

    cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
    next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs,
                                                     k=cand_k)
    next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

    return ([time + 1, next_beam_probs, next_beam_seqs, next_cand_probs,
             next_cand_seqs] + next_states)
def createModel(input_data, input_size, sequence_length, slot_size,
                intent_size, layer_size=128, isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=0.5,
                                                output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=0.5,
                                                output_keep_prob=0.5)

    embedding = tf.get_variable('embedding', [input_size, layer_size])
    inputs = tf.nn.embedding_lookup(embedding, input_data)

    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=sequence_length,
        dtype=tf.float32)
    final_state = tf.concat([final_state[0][0], final_state[0][1],
                             final_state[1][0], final_state[1][1]], 1)
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope('attention'):
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope('slot_attn'):
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                hidden = tf.expand_dims(state_outputs, 1)
                # hidden shape = [batch, sentence length, 1, hidden size]
                hidden_conv = tf.expand_dims(state_outputs, 2)
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                hidden_features = tf.reshape(hidden_features, origin_shape)
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])

                slot_inputs_shape = tf.shape(slot_inputs)
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                y = rnn_cell_impl._linear(slot_inputs, attn_size, True)
                y = tf.reshape(y, slot_inputs_shape)
                y = tf.expand_dims(y, 2)
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [batch, input size, sentence length, 1]
                a = tf.expand_dims(a, -1)
                slot_d = tf.reduce_sum(a * hidden, [2])
        else:
            attn_size = state_shape[2].value
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            attn_size = state_shape[2].value
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])

            y = rnn_cell_impl._linear(intent_input, attn_size, True)
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            a = tf.expand_dims(a, -1)
            a = tf.expand_dims(a, -1)
            d = tf.reduce_sum(a * hidden, [1, 2])

            if add_final_state_to_intent:
                intent_output = tf.concat([d, intent_input], 1)
            else:
                intent_output = d

        with tf.variable_scope('slot_gated'):
            intent_gate = rnn_cell_impl._linear(intent_output, attn_size,
                                                True)
            intent_gate = tf.reshape(
                intent_gate, [-1, 1, intent_gate.get_shape()[1].value])
            v1 = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                slot_gate = v1 * tf.tanh(slot_d + intent_gate)
            else:
                slot_gate = v1 * tf.tanh(state_outputs + intent_gate)
            slot_gate = tf.reduce_sum(slot_gate, [2])
            slot_gate = tf.expand_dims(slot_gate, -1)
            if not remove_slot_attn:
                slot_gate = slot_d * slot_gate
            else:
                slot_gate = state_outputs * slot_gate
            slot_gate = tf.reshape(slot_gate, [-1, attn_size])
            slot_output = tf.concat([slot_gate, slot_inputs], 1)

    with tf.variable_scope('intent_proj'):
        intent = rnn_cell_impl._linear(intent_output, intent_size, True)
    with tf.variable_scope('slot_proj'):
        slot = rnn_cell_impl._linear(slot_output, slot_size, True)

    outputs = [slot, intent]
    return outputs
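# A hedged sketch of wiring createModel() up. The module-level flags
# remove_slot_attn and add_final_state_to_intent (read inside the function)
# and the vocabulary/label sizes here are assumptions for illustration.
import tensorflow as tf

remove_slot_attn = False
add_final_state_to_intent = True

input_data = tf.placeholder(tf.int32, [None, None], name='inputs')
seq_len = tf.placeholder(tf.int32, [None], name='sequence_length')
slot_logits, intent_logits = createModel(
    input_data, input_size=10000, sequence_length=seq_len,
    slot_size=120, intent_size=21, layer_size=128, isTraining=True)
# slot_logits: [batch * max_len, slot_size]; intent_logits: [batch, intent_size]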