def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            ru = rnn_cell._linear([inputs, state], 2 * self._num_units, True, 1.0)
            ru = tf.nn.sigmoid(ru)
            r, u = tf.split(1, 2, ru)
        with tf.variable_scope("Candidate"):
            # Mixture weights over the candidate weight matrices.
            lambdas = rnn_cell._linear([inputs, state], self._num_weights, True)
            lambdas = tf.split(1, self._num_weights, tf.nn.softmax(lambdas))
            Ws = tf.get_variable(
                "Ws",
                shape=[self._num_weights, inputs.get_shape()[1], self._num_units])
            # Squeeze only the split axis so unit-sized dims elsewhere survive.
            Ws = [tf.squeeze(i, [0]) for i in tf.split(0, self._num_weights, Ws)]
            candidate_inputs = []
            for idx, W in enumerate(Ws):
                candidate_inputs.append(tf.matmul(inputs, W) * lambdas[idx])
            Wx = tf.add_n(candidate_inputs)
            c = tf.nn.tanh(Wx + rnn_cell._linear(
                [r * state], self._num_units, True, scope="second"))
        new_h = u * state + (1 - u) * c
        return new_h, new_h
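# A minimal sketch (hypothetical class name and default, not from the source)
# of the constructor the __call__ above assumes: the cell keeps
# self._num_weights candidate input-projection matrices and soft-selects
# among them with the softmax weights `lambdas`.
class MultiWeightGRUCell(tf.nn.rnn_cell.RNNCell):
    def __init__(self, num_units, num_weights=5):
        self._num_units = num_units      # size of the hidden state
        self._num_weights = num_weights  # number of candidate weight matrices

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units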
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           keep_prob=None, is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("args must be specified")
    if not nest.is_sequence(args):
        args = [args]
    flat_args = [flatten(arg, 1) for arg in args]
    if keep_prob is not None and is_train is not None:
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, keep_prob), lambda: arg)
            for arg in flat_args
        ]
    with tf.variable_scope(scope or 'linear'):
        flat_out = _linear(
            flat_args, output_size, bias,
            bias_initializer=tf.constant_initializer(bias_start))
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    return out
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           wd=0.0, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "linear"):
        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")
        if not nest.is_sequence(args):
            args = [args]
        flat_args = [flatten(arg, 1) for arg in args]
        # if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train,
                    lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg)
            for arg in flat_args
        ]
        flat_out = _linear(flat_args, output_size, bias)
        out = reconstruct(flat_out, args[0], 1)
        if squeeze:
            out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
        if wd:
            add_wd(wd)
        return out
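# The `flatten` and `reconstruct` helpers used by both `linear` variants above
# (and by `att_weights` further down) are not defined in this section. A
# minimal sketch consistent with how they are called: collapse all but the
# last `keep` dims into one batch dim before `_linear`, then restore them.
from functools import reduce
from operator import mul

def flatten(tensor, keep):
    # Collapse all leading dims into one, keeping the last `keep` dims.
    fixed_shape = tensor.get_shape().as_list()
    start = len(fixed_shape) - keep
    left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i]
                        for i in range(start)])
    out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i]
                          for i in range(start, len(fixed_shape))]
    return tf.reshape(tensor, out_shape)

def reconstruct(tensor, ref, keep):
    # Restore the leading (batch) dims of `ref`, keeping the trailing
    # `keep` dims of `tensor`.
    ref_shape = ref.get_shape().as_list()
    tensor_shape = tensor.get_shape().as_list()
    ref_stop = len(ref_shape) - keep
    tensor_start = len(tensor_shape) - keep
    pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
    keep_shape = [tensor_shape[i] or tf.shape(tensor)[i]
                  for i in range(tensor_start, len(tensor_shape))]
    return tf.reshape(tensor, pre_shape + keep_shape)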
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM) with two predecessor cells.

    Args:
      inputs: (batch, n) tensor.
      state: the cell states and hidden units of the two predecessor cells.

    Returns:
      new_h, new_state
    """
    with vs.variable_scope(scope or type(self).__name__):
        c1, c2, h1, h2 = state
        # Change bias argument to False since LN will add bias via shift.
        concat = _linear([inputs, h1, h2], 5 * self._num_units, False)
        i, j, f1, f2, o = array_ops.split(concat, 5, 1)
        # Add layer normalization to each gate.
        i = ln(i, scope='i/')
        j = ln(j, scope='j/')
        f1 = ln(f1, scope='f1/')
        f2 = ln(f2, scope='f2/')
        o = ln(o, scope='o/')
        new_c = (c1 * nn.sigmoid(f1 + self._forget_bias) +
                 c2 * nn.sigmoid(f2 + self._forget_bias) +
                 nn.sigmoid(i) * self._activation(j))
        # Add layer normalization in calculation of the new hidden state.
        new_h = self._activation(ln(new_c, scope='new_h/')) * nn.sigmoid(o)
        new_state = rnn.LSTMStateTuple(new_c, new_h)
        return new_h, new_state
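# The `ln` helper used by the layer-normalized cells in this section is not
# shown. A minimal sketch consistent with its call signature (scope strings
# such as 'i/' become prefixes of the variable scope), following layer
# normalization as in Ba et al. (2016):
def ln(tensor, scope=None, epsilon=1e-5):
    """Layer-normalizes a 2D tensor along its second axis."""
    assert len(tensor.get_shape()) == 2
    m, v = tf.nn.moments(tensor, [1], keep_dims=True)
    if not isinstance(scope, str):
        scope = ''
    with tf.variable_scope(scope + 'layer_norm'):
        scale = tf.get_variable(
            'scale', shape=[tensor.get_shape()[1]],
            initializer=tf.constant_initializer(1.0))
        shift = tf.get_variable(
            'shift', shape=[tensor.get_shape()[1]],
            initializer=tf.constant_initializer(0.0))
    normalized = (tensor - m) / tf.sqrt(v + epsilon)
    return normalized * scale + shift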
def attention(query, use_attention=False):
    """Put attention masks on hidden using hidden_features and query."""
    attn_weights = []
    ds = []  # Results of attention reads will be stored here.
    for i in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            if not use_attention:  # apply mean pooling
                weights = tf.tile(sequence_length, tf.pack([attn_length]))
                weights = array_ops.reshape(weights, tf.shape(s))
                a = array_ops.ones(
                    tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
                # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
            else:
                a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
    return attn_weights, ds
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM) with layer normalization."""
    with tf.variable_scope(scope or type(self).__name__):
        c, h = state
        # Change bias argument to False since LN will add bias via shift.
        concat = rnn_cell._linear([inputs, h], 4 * self._num_units, False)
        i, j, f, o = tf.split(1, 4, concat)
        # Add layer normalization to each gate.
        i = ln(i, scope='i/')
        j = ln(j, scope='j/')
        f = ln(f, scope='f/')
        o = ln(o, scope='o/')
        new_c = (c * tf.nn.sigmoid(f + self._forget_bias) +
                 tf.nn.sigmoid(i) * self._activation(j))
        # Add layer normalization in calculation of the new hidden state.
        new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(o)
        new_state = LSTMStateTuple(new_c, new_h)
        return new_h, new_state
def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "grnnsp_cell"):
        c, h = state
        with tf.variable_scope("gates"):
            # Bias of 1.0 so the gates start open.
            u_c, u_h, r_c, r_w = array_ops.split(
                split_dim=1, num_split=4,
                value=tf.sigmoid(
                    _linear([inputs, c], 4 * self._num_units, True, 1.0)))
        with tf.variable_scope("inputs"):
            j_c = tf.tanh(
                _linear([inputs, r_c * c], self._num_units, True,
                        scope="input_c"))
            j_h = tf.tanh(
                _linear(inputs, self._num_units, True, scope="input_h"))
        new_c = u_c * c + (1 - u_c) * j_c
        new_h = u_h * h + (1 - u_h) * j_h
        new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
        return new_h, new_state
def __call__(self, inputs, state, scope=None):
    step_t, state = state
    with vs.variable_scope(self._scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates_X"):
            # Old split signature, kept for reference:
            # rx, ux = tf.split(1, 2, rnn_cell._linear([inputs], 2 * self._num_units, False))
            # rh, uh = tf.split(1, 2, tf.matmul(state, self._Wgh) + self._Bgh)
            rx, ux = tf.split(
                rnn_cell._linear([inputs], 2 * self._num_units, False),
                num_or_size_splits=2, axis=1)
            rh, uh = tf.split(tf.matmul(state, self._Wgh) + self._Bgh,
                              num_or_size_splits=2, axis=1)
            r, u = rx + rh, ux + uh
            r, u = sigmoid(r), sigmoid(u)
        with vs.variable_scope("Candidate"):
            cx = rnn_cell._linear([inputs], self._num_units, False)
            c = cx + tf.matmul(state * r, self._Wch) + self._Bch
            c = self._activation(c)
        new_h = u * state + (1 - u) * c
        # Update only every self._period steps; the equality mask is cast to
        # float so it can blend the new and previous hidden states.
        active = tf.to_float(tf.equal(step_t % self._period, 0))
        new_h = active * new_h + (1 - active) * state
        return new_h, [new_h]
def attention(query):
    """Put attention masks on hidden using hidden_features and query."""
    attn_weights = []
    ds = []  # Results of attention reads will be stored here.
    for i in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
    return attn_weights, ds
def attention(query):
    if nest.is_sequence(query):
        query_list = nest.flatten(query)
        query = tf.concat(query_list, 1)
    with tf.variable_scope("Attention"):
        y = _linear(args=query, output_size=attn_size, bias=True)
        y = tf.reshape(y, [-1, 1, 1, attn_size])
        s = tf.reduce_sum(
            attention_softmax_weights * tf.nn.tanh(hidden_features + y),
            [2, 3])
        a = tf.nn.softmax(s)
        c = tf.reduce_sum(
            tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
        cs = tf.reshape(c, [-1, attn_size])
    return cs, a
def __call__(self, inputs, state, scope=None):
    """Conditional long short-term memory cell (CLSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
        if self._state_is_tuple:
            c, h = state
        else:
            c, h = array_ops.split(1, 2, state)
        concat = _linear([inputs, h], 4 * self._num_units, True)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)
        new_c = (c * sigmoid(f + self._forget_bias) +
                 sigmoid(i) * self._activation(j))
        new_h = self._activation(new_c) * sigmoid(o)
        if self._state_is_tuple:
            new_state = LSTMStateTuple(new_c, new_h)
        else:
            new_state = array_ops.concat(1, [new_c, new_h])
        return new_h, new_state
def attention(query):
    """Put attention masks on hidden using hidden_features and query."""
    weights = []
    ds = []  # Results of attention reads will be stored here.
    # if tf.nest.is_sequence(query):  # If the query is a tuple, flatten it.
    #     query_list = tf.nest.flatten(query)
    #     for q in query_list:  # Check that ndims == 2 if specified.
    #         ndims = q.get_shape().ndims
    #         if ndims:
    #             assert ndims == 2
    #     query = tf.concat(query_list, 1)
    for i in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = tf.reduce_sum(v[i] * tf.tanh(hidden_features[i] + y), [2, 3])
            a = tf.nn.softmax(s)
            weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = tf.reduce_sum(
                tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(tf.reshape(d, [-1, attn_size]))
    return weights, ds
def att_weights(inputs, state, memory):
    '''
    :param inputs: [N, i]
    :param state: [N, d]
    :param memory: [N, J, i]
    :return: [N, J]
    '''
    rank = len(memory.get_shape())
    memory_size = tf.shape(memory)[rank - 2]
    tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, memory_size, 1])
    if isinstance(state, tuple):
        tiled_state = [tf.tile(tf.expand_dims(each, 1), [1, memory_size, 1])
                       for each in state]
    else:
        tiled_state = [tf.tile(tf.expand_dims(state, 1), [1, memory_size, 1])]
    in_ = tf.concat([tiled_inputs] + tiled_state + [memory], 2)
    flat_in = flatten(in_, 1)
    # `input_keep_prob` and `bias` are closed over from the enclosing scope.
    flat_in = [tf.nn.dropout(flat_in, input_keep_prob)]
    flat_out = _linear(flat_in, 1, bias)
    out = reconstruct(flat_out, in_, 1)
    out = tf.squeeze(out, [len(in_.get_shape().as_list()) - 1])
    return out
def __call__(self, inputs, state, d_act, scope=None):
    """Long short-term memory cell (LSTM) with a keyword bias from d_act."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
        if self._state_is_tuple:
            c, h = state
        else:
            try:  # old split signature: split(axis, num, value)
                c, h = array_ops.split(1, 2, state)
            except Exception:  # new signature: split(value, num, axis)
                c, h = array_ops.split(state, 2, 1)
        concat = _linear([inputs, h], 4 * self._num_units, True)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        try:
            i, j, f, o = array_ops.split(1, 4, concat)
        except Exception:
            i, j, f, o = array_ops.split(concat, 4, 1)
        w_d = vs.get_variable('w_d', [self.key_words_voc_size, self._num_units])
        new_c = (c * sigmoid(f + self._forget_bias) +
                 sigmoid(i) * self._activation(j)) + tf.tanh(
                     tf.matmul(d_act, w_d))
        new_h = self._activation(new_c) * sigmoid(o)
        if self._state_is_tuple:
            new_state = LSTMStateTuple(new_c, new_h)
        else:
            try:
                new_state = array_ops.concat(1, [new_c, new_h])
            except Exception:
                new_state = array_ops.concat([new_c, new_h], 1)
        return new_h, new_state
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
    if use_attention:
        print('Use the attention RNN model')
        if num_heads < 1:
            raise ValueError("With less than 1 heads, use a non-attention decoder.")
        with variable_scope.variable_scope(scope or "attention_RNN"):
            output_size = encoder_outputs[0].get_shape()[1].value
            top_states = [array_ops.reshape(e, [-1, 1, output_size])
                          for e in encoder_outputs]
            attention_states = array_ops.concat(top_states, 1)
            if not attention_states.get_shape()[1:2].is_fully_defined():
                raise ValueError(
                    "Shape[1] and [2] of attention_states must be known: %s"
                    % attention_states.get_shape())

            batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
            attn_length = attention_states.get_shape()[1].value
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution; reshape first.
            hidden = array_ops.reshape(attention_states,
                                       [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of query vectors for attention.
            for a in xrange(num_heads):
                k = variable_scope.get_variable(
                    "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                hidden_features.append(
                    nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(variable_scope.get_variable(
                    "AttnV_%d" % a, [attention_vec_size]))

            def attention(query):
                """Put attention masks on hidden using hidden_features and query."""
                attn_weights = []
                ds = []  # Results of attention reads will be stored here.
                for i in xrange(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % i):
                        y = rnn_cell._linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
                        a = nn_ops.softmax(s)
                        attn_weights.append(a)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                            [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                return attn_weights, ds

            batch_attn_size = array_ops.stack([batch_size, attn_size])
            attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                     for _ in xrange(num_heads)]
            for a in attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])

            # Loop through the encoder_outputs.
            attention_encoder_outputs = list()
            sequence_attention_weights = list()
            for i in xrange(len(encoder_outputs)):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                if i == 0:
                    with variable_scope.variable_scope("Initial_Decoder_Attention"):
                        initial_state = rnn_cell._linear(
                            encoder_state, output_size, True)
                    attn_weights, ds = attention(initial_state)
                else:
                    attn_weights, ds = attention(encoder_outputs[i])
                # NOTE: here we temporarily assume num_heads = 1.
                output = array_ops.concat([ds[0], encoder_outputs[i]], 1)
                with variable_scope.variable_scope("AttnRnnOutputProjection"):
                    logit = rnn_cell._linear(output, num_decoder_symbols, True)
                attention_encoder_outputs.append(logit)
                sequence_attention_weights.append(attn_weights[0])
    else:
        print('Use the NON-attention RNN model')
        with variable_scope.variable_scope(scope or "non-attention_RNN"):
            attention_encoder_outputs = list()
            sequence_attention_weights = list()
            # Copy over logits once past sequence_length.
            if encoder_outputs[0].get_shape().ndims != 1:
                (fixed_batch_size,
                 output_size) = encoder_outputs[0].get_shape().with_rank(2)
            else:
                fixed_batch_size = \
                    encoder_outputs[0].get_shape().with_rank_at_least(1)[0]
            if fixed_batch_size.value:
                batch_size = fixed_batch_size.value
            else:
                batch_size = array_ops.shape(encoder_outputs[0])[0]
            if sequence_length is not None:
                sequence_length = math_ops.to_int32(sequence_length)
                # Prepare variables.
                zero_logit = array_ops.zeros(
                    array_ops.pack([batch_size, num_decoder_symbols]),
                    encoder_outputs[0].dtype)
                zero_logit.set_shape(tensor_shape.TensorShape(
                    [fixed_batch_size.value, num_decoder_symbols]))
                min_sequence_length = math_ops.reduce_min(sequence_length)
                max_sequence_length = math_ops.reduce_max(sequence_length)
            for time, input_ in enumerate(encoder_outputs):
                if time > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                # pylint: disable=cell-var-from-loop
                generate_logit = lambda: rnn_cell._linear(
                    encoder_outputs[time], num_decoder_symbols, True)
                # pylint: enable=cell-var-from-loop
                if sequence_length is not None:
                    logit = _step(time, sequence_length, min_sequence_length,
                                  max_sequence_length, zero_logit, generate_logit)
                else:
                    logit = generate_logit()  # call the lambda, not assign it
                attention_encoder_outputs.append(logit)
    return attention_encoder_outputs, sequence_attention_weights
def AttenOutputProject(_output, _vocab_size):
    with tf.variable_scope("AttnRnnOutputProjection"):
        # Tensor shape: [batch_size, num_cls]
        _logit = rnn_cell._linear(_output, _vocab_size, True)
    return _logit
def rnn_with_output_feedback(cell, inputs,
                             targets1, targets1_num_symbols, target1_emb_size,
                             target1_output_projection,
                             targets2, targets2_num_symbols, target2_emb_size,
                             target2_output_projection,
                             word_emb_size, DNN_at_output,
                             zero_intent_thres=0,
                             sequence_length=None, dtype=None,
                             train_with_true_label=True,
                             use_predicted_output=False):
    '''
    zero_intent_thres: int, the intent contribution to the context stays zero
    before this threshold and increases linearly to 1 after it.
    '''
    if not isinstance(cell, tf.contrib.rnn.RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not isinstance(targets1, list):
        raise TypeError("targets1 must be a list")
    if not isinstance(targets2, list):
        raise TypeError("targets2 must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")
    if not dtype:
        raise ValueError(
            "dtype must be provided; it is used to define the initial RNN state")

    intent_embedding = variable_scope.get_variable(
        "intent_embedding", [targets1_num_symbols, target1_emb_size])
    tag_embedding = variable_scope.get_variable(
        "tag_embedding", [targets2_num_symbols, target2_emb_size])
    # Use the predicted label if use_predicted_output (inference); use the true
    # label during training. To always use the predicted label, disable the
    # if condition below.
    intent_loop_function = _extract_argmax_and_embed(
        intent_embedding, DNN_at_output, target1_output_projection,
        forward_only=use_predicted_output)
    tagging_loop_function = _extract_argmax_and_embed(
        tag_embedding, DNN_at_output, target2_output_projection,
        forward_only=use_predicted_output)

    intent_targets = [array_ops.reshape(math_ops.to_int64(x), [-1])
                      for x in targets1]
    intent_target_embeddings = [
        embedding_ops.embedding_lookup(intent_embedding, target)
        for target in intent_targets]
    tag_targets = [array_ops.reshape(math_ops.to_int64(x), [-1])
                   for x in targets2]
    tag_target_embeddings = [
        embedding_ops.embedding_lookup(tag_embedding, target)
        for target in tag_targets]

    if inputs[0].get_shape().ndims != 1:
        (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2)
        if input_size.value is None:
            raise ValueError(
                "Input size (second dimension of inputs[0]) must be accessible "
                "via shape inference, but saw value None.")
    else:
        fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]
    if fixed_batch_size.value:
        batch_size = fixed_batch_size.value
    else:
        batch_size = array_ops.shape(inputs[0])[0]

    state = cell.zero_state(batch_size, dtype)
    zero_output = array_ops.zeros(
        array_ops.stack([batch_size, cell.output_size]), inputs[0].dtype)
    zero_output.set_shape(
        tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size]))

    if sequence_length is not None:  # Prepare variables.
        sequence_length = math_ops.to_int32(sequence_length)
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)

    zero_intent_embedding = array_ops.zeros(
        array_ops.stack([batch_size, target1_emb_size]), inputs[0].dtype)
    zero_intent_embedding.set_shape(
        tensor_shape.TensorShape([fixed_batch_size.value, target1_emb_size]))
    zero_tag_embedding = array_ops.zeros(
        array_ops.stack([batch_size, target2_emb_size]), inputs[0].dtype)
    zero_tag_embedding.set_shape(
        tensor_shape.TensorShape([fixed_batch_size.value, target2_emb_size]))

    encoder_outputs = list()
    intent_logits = list()
    tagging_logits = list()
    sampled_intent_embeddings = list()
    sampled_tag_embeddings = list()

    for time, input_ in enumerate(inputs):
        # Introduce output-label embeddings as additional input:
        # if feed_previous (during testing), use the loop_function;
        # if NOT feed_previous (during training), use the true target embedding.
        if time == 0:
            current_intent_embedding = zero_intent_embedding
            current_tag_embedding = zero_tag_embedding
        if time > 0:
            variable_scope.get_variable_scope().reuse_variables()

        # Intent weight ramps up as max(0, t - thres) / sequence_length.
        thres = zero_intent_thres
        if time <= thres:
            intent_contribution = math_ops.to_float(0)
        else:
            intent_contribution = tf.div(math_ops.to_float(time - thres),
                                         math_ops.to_float(sequence_length))
        # intent_contribution = math_ops.to_float(1)

        x = rnn_cell._linear(
            [tf.transpose(tf.transpose(current_intent_embedding) *
                          intent_contribution),
             current_tag_embedding,
             input_],
            word_emb_size, True)
        call_cell = lambda: cell(x, state)  # pylint: disable=cell-var-from-loop
        if sequence_length is not None:
            (output_fw, state) = rnn._rnn_step(
                time, sequence_length, min_sequence_length, max_sequence_length,
                zero_output, state, call_cell, cell.state_size)
        else:
            (output_fw, state) = call_cell()
        encoder_outputs.append(output_fw)

        if use_predicted_output:
            intent_logit, current_intent_embedding = intent_loop_function(
                output_fw, time)
            tagging_logit, current_tag_embedding = tagging_loop_function(
                output_fw, time)
        else:
            if train_with_true_label:
                intent_logit = multilayer_perceptron_with_initialized_W(
                    output_fw, target1_output_projection,
                    forward_only=use_predicted_output)
                tagging_logit = multilayer_perceptron_with_initialized_W(
                    output_fw, target2_output_projection,
                    forward_only=use_predicted_output)
                current_intent_embedding = intent_target_embeddings[time]
                current_tag_embedding = tag_target_embeddings[time]
            else:
                intent_logit, current_intent_embedding = intent_loop_function(
                    output_fw, time)
                tagging_logit, current_tag_embedding = tagging_loop_function(
                    output_fw, time)
        if time == 0:
            current_intent_embedding = zero_intent_embedding
            current_tag_embedding = zero_tag_embedding
        sampled_intent_embeddings.append(current_intent_embedding)
        sampled_tag_embeddings.append(current_tag_embedding)
        intent_logits.append(intent_logit)
        tagging_logits.append(tagging_logit)

    return (encoder_outputs, state, intent_logits, tagging_logits,
            sampled_intent_embeddings, sampled_tag_embeddings)
def attention(query):
    """
    Places an attention mask on hidden states from the encoder using hidden
    and query. Query is a state of shape [N, H].
    """
    cs = []  # results of the attention reads: context vectors c_i
    # Flatten the query if it is a tuple.
    if nest.is_sequence(query):
        query_list = nest.flatten(query)
        query = tf.concat(1, query_list)  # [N, sum of component sizes]
    for i in range(num_heads):
        with tf.variable_scope("Attention_%d" % i):
            y = _linear(args=query, output_size=pre_attn_size, bias=True)
            # Reshape into 4D.
            y = tf.reshape(y, [-1, 1, 1, pre_attn_size])  # [N, 1, 1, H]
            # Calculating alpha.
            s = tf.reduce_sum(
                V1[i] * tf.nn.tanh(hidden_features_pre[i] + y), [2, 3])
def __call__(self, inputs, state, scope=None):
    """Basic RNN: output = new_state = clipped_relu(W * input + U * state + B)."""
    with vs.variable_scope(scope or type(self).__name__):
        output = clipped_relu(_linear([inputs, state], self._num_units, True))
    return output, output
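# `clipped_relu` is not defined in this section. A minimal sketch, assuming
# the ReLU clipped at 20 commonly used in Deep Speech-style recurrent
# networks (the clip value is an assumption, not from the source):
def clipped_relu(x, clip=20.0):
    # min(max(x, 0), clip), elementwise
    return tf.minimum(tf.maximum(x, 0.0), clip)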
def __init__(self, batch_size, num_input, num_hidden, layer_depth, rnn_type,
             seq_length, learning_rate, keep_drop=0.5, grad_clip=5.0,
             is_training=False):
    self.num_input = num_input
    self.num_hidden = num_hidden
    self.seq_length = seq_length
    self.batch_size = batch_size
    self.rnn_type = rnn_type
    self.layer_depth = layer_depth
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.is_training = is_training
    self.keep_drop = keep_drop
    self.x = tf.placeholder(tf.float32,
                            [batch_size, seq_length, self.num_input])

    # RNN cells for encoder and decoder.
    def create_cell():
        if rnn_type == "GRU":
            cell = rnn.GRUCell(num_hidden)
        elif rnn_type == "RAN":
            cell = RANCell(num_hidden, normalize=tf.constant(self.is_training))
        cell = SwitchableDropoutWrapper(
            cell, output_keep_prob=self.keep_drop,
            is_train=tf.constant(self.is_training))
        return cell

    with tf.variable_scope('encoder_cells',
                           initializer=tf.contrib.layers.xavier_initializer()):
        self.enc_cell = rnn.DeviceWrapper(
            rnn.MultiRNNCell([create_cell() for _ in range(layer_depth)]),
            device="/gpu:0")
    with tf.variable_scope('decoder_cells',
                           initializer=tf.contrib.layers.xavier_initializer()):
        self.dec_cell = rnn.DeviceWrapper(
            rnn.MultiRNNCell([create_cell() for _ in range(layer_depth)]),
            device="/gpu:1")

    with tf.variable_scope('encoder'):
        outputs, _ = tf.nn.dynamic_rnn(cell=self.enc_cell, inputs=self.x,
                                       time_major=False, swap_memory=True,
                                       dtype=tf.float32)
        self.enc_output = outputs[:, -1, :]

    with tf.variable_scope('latent'):
        # Reparametrization trick.
        with tf.name_scope("Z"):
            self.z_mean = tf.contrib.layers.fully_connected(
                inputs=self.enc_output, num_outputs=num_hidden,
                activation_fn=None, scope="z_mean")
            # NOTE: despite the name, z_stddev holds the variance sigma^2
            # (a softplus output); tf.sqrt below converts it to a stddev.
            self.z_stddev = tf.contrib.layers.fully_connected(
                inputs=self.enc_output, num_outputs=num_hidden,
                activation_fn=tf.nn.softplus, scope="z_ls2")
        # Sample z from the latent distribution.
        with tf.name_scope("z_samples"):
            with tf.name_scope('random_normal_sample'):
                eps = tf.random_normal((batch_size, num_hidden), 0, 1,
                                       dtype=tf.float32)  # draw a random number
            with tf.name_scope('z_sample'):
                self.z = self.z_mean + tf.sqrt(self.z_stddev) * eps  # sample Z -> z

    with tf.variable_scope('decoder'):
        reversed_inputs = tf.reverse(self.x, [1])
        flat_targets = tf.reshape(reversed_inputs, [-1])
        dec_first_inp = tf.nn.relu(_linear(self.z, self.num_input, True))
        # [GO, ...inputs]
        dec_inputs = tf.concat(
            (tf.expand_dims(dec_first_inp, 1), reversed_inputs[:, 1:, :]), 1)
        self.w1 = tf.get_variable(
            "w1", shape=[self.num_hidden, self.num_input],
            initializer=tf.contrib.layers.xavier_initializer())
        self.b1 = tf.get_variable(
            "b1", shape=[self.num_input],
            initializer=tf.constant_initializer(0.0))
        self.initial_state = self.dec_cell.zero_state(batch_size,
                                                      dtype=tf.float32)
        dec_outputs, _ = tf.nn.dynamic_rnn(
            cell=self.dec_cell, inputs=dec_inputs,
            initial_state=self.initial_state, time_major=False,
            swap_memory=True, dtype=tf.float32)
        logits = tf.matmul(tf.reshape(dec_outputs, [-1, self.num_hidden]),
                           self.w1) + self.b1
        self.reconstruction = tf.reshape(logits, [-1])

    self.reconstruction_loss = 0.5 * tf.reduce_mean(
        tf.pow(self.reconstruction - flat_targets, 2.0))
    self.latent_loss = -0.5 * (1.0 + tf.log(self.z_stddev)
                               - tf.square(self.z_mean) - self.z_stddev)
    self.latent_loss = tf.reduce_sum(self.latent_loss, 1) / tf.cast(
        seq_length, tf.float32)
    self.latent_loss = tf.reduce_sum(self.latent_loss) / tf.cast(
        batch_size, tf.float32)
    self.cost = tf.reduce_mean(self.reconstruction_loss + self.latent_loss)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      self.grad_clip)
    optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=0.001)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
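# The latent_loss above is the closed-form KL divergence between the Gaussian
# posterior q(z|x) = N(mu, sigma^2) and the standard normal prior, a standard
# VAE identity (here the tensor named z_stddev plays the role of sigma^2):
#
#   KL( N(mu, sigma^2) || N(0, 1) )
#       = -1/2 * sum_j (1 + log sigma_j^2 - mu_j^2 - sigma_j^2)
#
# which matches the elementwise expression
#   -0.5 * (1.0 + tf.log(self.z_stddev) - tf.square(self.z_mean) - self.z_stddev)
# summed over the latent dimensions.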
def prepare_model(self):
    with tf.variable_scope("LSTMTDNN"):
        self.char_inputs = []
        self.word_inputs = []
        self.cnn_outputs = []
        if self.use_char:
            char_W = tf.get_variable(
                "char_embed", [self.char_vocab_size, self.char_embed_dim])
        if self.use_word:
            word_W = tf.get_variable(
                "word_embed", [self.word_vocab_size, self.word_embed_dim])

        with tf.variable_scope("CNN") as scope:
            self.char_inputs = tf.placeholder(
                tf.int32,
                [self.batch_size, self.seq_length, self.max_word_length])
            self.word_inputs = tf.placeholder(
                tf.int32, [self.batch_size, self.seq_length])
            char_indices = tf.split(axis=1, num_or_size_splits=self.seq_length,
                                    value=self.char_inputs)
            word_indices = tf.split(axis=1, num_or_size_splits=self.seq_length,
                                    value=tf.expand_dims(self.word_inputs, -1))
            for idx in xrange(self.seq_length):
                char_index = tf.reshape(char_indices[idx],
                                        [-1, self.max_word_length])
                word_index = tf.reshape(word_indices[idx], [-1, 1])
                if idx != 0:
                    scope.reuse_variables()
                if self.use_char:
                    # [batch_size x word_max_length, char_embed]
                    char_embed = tf.nn.embedding_lookup(char_W, char_index)
                    char_cnn = TDNN(char_embed, self.char_embed_dim,
                                    self.feature_maps, self.kernels)
                    if self.use_word:
                        word_embed = tf.nn.embedding_lookup(word_W, word_index)
                        cnn_output = tf.concat(
                            axis=1,
                            values=[char_cnn.output,
                                    tf.squeeze(word_embed, [1])])
                    else:
                        cnn_output = char_cnn.output
                else:
                    cnn_output = tf.squeeze(
                        tf.nn.embedding_lookup(word_W, word_index))
                if self.use_batch_norm:
                    bn = batch_norm()
                    norm_output = bn(
                        tf.expand_dims(tf.expand_dims(cnn_output, 1), 1))
                    cnn_output = tf.squeeze(norm_output)
                # NOTE: `highway` names the module-level function here, so
                # this test is always true; a use_highway flag may be intended.
                if highway:
                    # cnn_output = highway(input_, input_dim_length, self.highway_layers, 0)
                    cnn_output = highway(cnn_output, cnn_output.get_shape()[1],
                                         self.highway_layers, 0)
                self.cnn_outputs.append(cnn_output)

        with tf.variable_scope("LSTM") as scope:
            self.cell = tf.contrib.rnn.BasicLSTMCell(self.rnn_size)
            self.stacked_cell = tf.contrib.rnn.MultiRNNCell(
                [self.cell] * self.layer_depth)
            outputs, _ = tf.contrib.rnn.static_rnn(self.stacked_cell,
                                                   self.cnn_outputs,
                                                   dtype=tf.float32)
            self.lstm_outputs = []
            self.true_outputs = tf.placeholder(
                tf.int64, [self.batch_size, self.seq_length])
            loss = 0
            true_outputs = tf.split(axis=1,
                                    num_or_size_splits=self.seq_length,
                                    value=self.true_outputs)
            for idx, (top_h, true_output) in enumerate(zip(outputs,
                                                           true_outputs)):
                if self.dropout_prob > 0:
                    top_h = tf.nn.dropout(top_h, self.dropout_prob)
                if self.hsm > 0:
                    self.lstm_outputs.append(top_h)
                else:
                    if idx != 0:
                        scope.reuse_variables()
                    proj = _linear(top_h, self.word_vocab_size, 0)
                    self.lstm_outputs.append(proj)
                loss += tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.lstm_outputs[idx],
                    labels=tf.squeeze(true_output))
            self.loss = tf.reduce_mean(loss) / self.seq_length
            tf.summary.scalar("loss", self.loss)
            tf.summary.scalar("perplexity", tf.exp(self.loss))
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size, loop_function=None, dtype=None, scope=None):
    """
    Decoder with attention mechanism.

    Args:
      decoder_inputs: The inputs to the decoder: the targets during training,
        or the previous decoder output during inference.
      initial_state: The tensor used to initialize the cell at the first
        decoder step.
      attention_states: The encoder hidden states the decoder attends to.
      cell: The decoder cell returned by the rnn_cell function.
      output_size: The number of decoder hidden state units.
      loop_function: The function that embeds the previous decoder step's
        output and provides it as input to the next decoder step.
      dtype: The data type.
      scope: The scope of the attention decoder.
    """
    with tf.variable_scope(scope or 'attention_decoder', dtype=dtype) as scope:
        dtype = scope.dtype
        batch_size = tf.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = tf.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value
        hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
        k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
        hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        attention_softmax_weights = tf.get_variable(
            "W_attention_softmax", [attn_size])
        state = initial_state[0]

        def attention(query):
            if nest.is_sequence(query):
                query_list = nest.flatten(query)
                query = tf.concat(query_list, 1)
            with tf.variable_scope("Attention"):
                y = _linear(args=query, output_size=attn_size, bias=True)
                y = tf.reshape(y, [-1, 1, 1, attn_size])
                s = tf.reduce_sum(
                    attention_softmax_weights * tf.nn.tanh(hidden_features + y),
                    [2, 3])
                a = tf.nn.softmax(s)
                c = tf.reduce_sum(
                    tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                cs = tf.reshape(c, [-1, attn_size])
            return cs, a

        outputs = []
        prev = None
        batch_attn_size = tf.stack([batch_size, attn_size])
        attns = tf.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])
        wts_l = []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            if loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            input_size = inp.get_shape().with_rank(2)[1]
            # Project decoder inputs and context vector to the decoder input size.
            x = _linear(args=[inp] + [attns], output_size=input_size, bias=True)
            # Run a decoder step.
            cell_outputs, state = cell(x, state)
            attns, wts = attention([state])
            wts_l.append(wts)
            # Project the decoder outputs and context vector to the decoder
            # output size.
            with tf.variable_scope('attention_output_projection'):
                output = _linear(args=[cell_outputs] + [attns],
                                 output_size=output_size, bias=True)
            if loop_function is not None:
                prev = output
            outputs.append(output)
    return outputs, state, wts_l
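# A minimal usage sketch for the attention_decoder above. All shapes and
# names here are illustrative assumptions, not from the source; note that
# initial_state is passed as a one-element list because the decoder reads
# initial_state[0].
batch_size, enc_steps, dec_steps, hidden = 32, 20, 10, 128
attention_states = tf.placeholder(tf.float32, [batch_size, enc_steps, hidden])
decoder_inputs = [tf.placeholder(tf.float32, [batch_size, hidden])
                  for _ in range(dec_steps)]
cell = tf.nn.rnn_cell.GRUCell(hidden)
initial_state = [cell.zero_state(batch_size, tf.float32)]
outputs, final_state, attn_weights = attention_decoder(
    decoder_inputs, initial_state, attention_states, cell, output_size=hidden)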