def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
  features = nn_ops.relu(
      nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
  logits = nn_ops.xw_plus_b(
      features, softmax_weight, softmax_bias, name="logits")
  labels = constant_op.constant(
      label_data.tolist(),
      shape=[batch, classes],
      dtype=dtypes.float64,
      name="labels")
  cost = nn_ops.softmax_cross_entropy_with_logits(
      labels=labels, logits=logits, name="cost")
  return cost
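As a quick illustration (not part of any snippet above), xw_plus_b(x, w, b) is simply matmul(x, w) + b, i.e. a dense layer with no activation. A NumPy stand-in with made-up shapes:

import numpy as np

batch, inputs, features = 3, 16, 32
x = np.random.randn(batch, inputs)
w = np.random.randn(inputs, features)
b = np.random.randn(features)

dense_out = x @ w + b                   # what xw_plus_b returns
relu_out = np.maximum(dense_out, 0.0)   # the "features" tensor above adds a ReLU on top
assert dense_out.shape == (batch, features)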
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
  """Run a softmax layer over all the time steps of an input sequence.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: output depth
    scope: optional scope name
    name: optional name for output tensor
    linear_name: name for linear (pre-softmax) output

  Returns:
    A tensor of size (length, batch_size, noutput).
  """
  length, _, ninputs = _shape(inputs)
  inputs_u = array_ops.unstack(inputs)
  output_u = []
  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
    initial_b = constant_op.constant(0.1, shape=[noutput])
    w = variables.model_variable("weights", initializer=initial_w)
    b = variables.model_variable("biases", initializer=initial_b)
    for i in xrange(length):
      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                         [inputs_u[i]]):
        # TODO(tmb) consider using slim.fully_connected(...,
        # activation_fn=tf.nn.softmax)
        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
        output = nn_ops.softmax(linear)
        output_u += [output]
    outputs = array_ops.stack(output_u, name=name)
    return outputs
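A NumPy stand-in (toy shapes, purely illustrative) for what sequence_softmax computes: a single shared affine map followed by a softmax applied independently at every time step.

import numpy as np

length, batch, ninputs, noutput = 5, 2, 4, 3
inputs = np.random.randn(length, batch, ninputs)
w = np.random.randn(ninputs, noutput)
b = np.zeros(noutput)

logits = inputs @ w + b                              # broadcasts over (length, batch)
exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
outputs = exp / exp.sum(axis=-1, keepdims=True)      # shape (length, batch, noutput)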
def extract_argmax_and_embed(prev, _):
  """Loop_function that extracts the symbol from prev and embeds it."""
  if output_projection is not None:
    prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
  prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
  return embedding_ops.embedding_lookup(embedding, prev_symbol)
def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    # prev = prev.get_shape().with_rank(2)[1]
    probs = tf.log(tf.nn.softmax(prev))
    if i > 1:
        probs = tf.reshape(probs + log_beam_probs[-1],
                           [-1, beam_size * num_symbols])
    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols        # Which word in vocabulary.
    beam_parent = indices // num_symbols   # Which hypothesis it came from.

    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev = tf.reshape(emb_prev, [beam_size, embedding_size])
    # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=10):
    """RNN decoder for the sequence-to-sequence model.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, this function will be applied to the i-th
        output in order to generate the i+1-st input, and decoder_inputs will
        be ignored, except for the first element ("GO" symbol). This can be
        used for decoding, but also for training to emulate
        http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x input_size].
      scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x output_size] containing generated outputs.
        state: The state of each cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
          (Note that in some cases, like basic RNN cell or GRU cell, outputs and
           states can be the same. They are different for LSTM cells though.)
    """
    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        outputs = []
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        state_size = int(initial_state.get_shape().with_rank(2)[1])

        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            input_size = inp.get_shape().with_rank(2)[1]
            print(input_size)
            x = inp
            output, state = cell(x, state)

            if loop_function is not None:
                prev = output
            if i == 0:
                states = []
                for kk in range(beam_size):
                    states.append(state)
                state = tf.reshape(tf.concat(0, states), [-1, state_size])

            outputs.append(
                tf.argmax(nn_ops.xw_plus_b(output, output_projection[0],
                                           output_projection[1]),
                          dimension=1))
    return (outputs, state,
            tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    prev_symbol = math_ops.argmax(prev, 1)
    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    tf_prev_symbol = batch_sample_with_temperature(prev)
    emb_prev = embedding_ops.embedding_lookup(embedding, tf_prev_symbol)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def to_check(logits, outputs, out_proj):
    softmax_outputs = []
    argmax_outputs = []
    for j, outs in enumerate(logits):  # length_id
        softmax_outputs.append([])
        argmax_outputs.append([])
        for i in range(len(outs)):  # batch_id
            projected_out = nn_ops.xw_plus_b(outs[i], out_proj[0], out_proj[1])
            softmax_outputs[j].append(tf.nn.softmax(projected_out))
            argmax_outputs[j].append(math_ops.argmax(projected_out, axis=1))
    return softmax_outputs, argmax_outputs
def project_and_apply_input_bias(logits, output_projection, input_bias):
    if output_projection is not None:
        logits = nn_ops.xw_plus_b(logits, output_projection[0],
                                  output_projection[1])

    # Apply softmax to ensure all tokens have a positive value.
    probs = tf.nn.softmax(logits)

    # Apply input bias, which is a mask of shape [batch, vocab len]
    # where each token from the input in addition to all "corrective"
    # tokens are set to 1.0.
    return tf.mul(probs, input_bias)
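A NumPy sketch (toy vocabulary, assumed values) of the projection-plus-mask idea used above: softmax makes every token probability positive, then a 0/1 mask zeroes out everything that is neither an input token nor a "corrective" token.

import numpy as np

logits = np.array([[2.0, 0.5, -1.0, 0.1]])       # [batch=1, vocab=4]
input_bias = np.array([[1.0, 0.0, 1.0, 0.0]])    # allowed tokens only

exp = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = exp / exp.sum(axis=1, keepdims=True)
masked = probs * input_bias                      # disallowed tokens get probability 0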
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    if isinstance(mc_search, bool):
        prev_symbol = (tf.reshape(tf.multinomial(prev, 1), [-1])
                       if mc_search else math_ops.argmax(prev, 1))
    else:
        prev_symbol = tf.cond(mc_search,
                              lambda: tf.reshape(tf.multinomial(prev, 1), [-1]),
                              lambda: tf.argmax(prev, 1))

    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
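A NumPy sketch (illustrative only) of the two decoding modes chosen above: Monte-Carlo search samples the next symbol from the softmax distribution, otherwise the argmax symbol is taken.

import numpy as np

rng = np.random.default_rng(0)
logits = np.array([[1.0, 3.0, 0.5]])                      # [batch=1, vocab=3]
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)

mc_search = True
if mc_search:
    next_symbol = rng.choice(len(probs[0]), p=probs[0])   # sample, as tf.multinomial does
else:
    next_symbol = int(np.argmax(probs, axis=1)[0])        # greedy, as math_ops.argmax does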
def fc(name, x, num_units_out):
    num_units_in = x.shape[1]
    weights_initializer = init_ops.truncated_normal_initializer(stddev=0.01)

    with vs.variable_scope(name):
        weights = _get_variable('weights',
                                shape=[num_units_in, num_units_out],
                                init=weights_initializer)
        biases = _get_variable('biases',
                               shape=[num_units_out],
                               init=init_ops.constant_initializer(0.0))
        x = nn_ops.xw_plus_b(x, weights, biases)

    return x
def loop_function(prev, i, beam_symbols, beam_path, beam_log_probs):
    """Get a loop_function that extracts the beam-sized previous symbols and embeds them.

    Args:
      prev: previous decoder output of shape [batch_size * beam_size, num_symbols]
        if i > 1 else [batch_size, num_symbols].
      i: decoding step.
      beam_symbols: a (i-1)-length list of tensors in shape [batch_size,
        beam_size], which are symbols in the beam at each step.
      beam_path: a (i-1)-length list of tensors in shape [batch_size, beam_size],
        which are indices for previous symbols in the beam at each step.
      beam_log_probs: a (i-1)-length list of tensors in shape
        [batch_size * beam_size, 1], which are log probabilities in the beam at
        each step.
    """
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])

    log_probs = tf.log(tf.nn.softmax(prev))
    if i > 1:
        # broadcasting occurs in the add operation where beam_log_probs[-1]
        # is in shape [batch_size * beam_size, 1].
        log_probs = tf.reshape(log_probs + beam_log_probs[-1],
                               [-1, beam_size * num_symbols])
    # Both returns are in shape [batch_size, beam_size].
    best_log_probs, best_indices = tf.nn.top_k(log_probs, beam_size)
    # Reshape best_indices to shape [batch_size * beam_size].
    best_indices = tf.stop_gradient(
        tf.squeeze(tf.reshape(best_indices, [-1, 1])))
    # Reshape best_log_probs to shape [batch_size * beam_size, 1].
    best_log_probs = tf.stop_gradient(tf.reshape(best_log_probs, [-1, 1]))

    symbols = best_indices % num_symbols
    parent_indices = best_indices // num_symbols

    beam_symbols.append(tf.reshape(symbols, [-1, beam_size]))
    beam_path.append(tf.reshape(parent_indices, [-1, beam_size]))
    beam_log_probs.append(best_log_probs)

    # emb_prev has shape [batch_size * beam_size, embedding_size].
    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding_for_previous:
        emb_prev = tf.stop_gradient(emb_prev)
    return tf.reshape(emb_prev, [-1, embedding_size])
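A NumPy sketch (hypothetical small sizes) of the flat-index bookkeeping used in the beam loop functions above: after top_k over a [batch, beam_size * num_symbols] score matrix, each flat index encodes both the parent hypothesis and the symbol that extends it.

import numpy as np

num_symbols, beam_size = 7, 3
flat_indices = np.array([5, 9, 16])     # pretend top-k indices for one batch element

symbols = flat_indices % num_symbols    # vocabulary id of each candidate -> [5, 2, 2]
parents = flat_indices // num_symbols   # beam entry it came from          -> [0, 1, 2]
# e.g. flat index 16 -> parent 2, symbol 2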
def _fc(_input, out_dim, name="fc", relu_flag=True, stddev=0.01,
        dtype=dtypes.float32):
    """A wrapped fully connected layer used in normal-fc or conv-fc (2D or 3D).

    :param _input: tensor, shape's ndim must be 2 (normal) or 4 (conv2d) or 5 (conv3d)
    :param out_dim: scalar, output dimension
    :param relu_flag: bool, whether to apply ReLU after the fc operation
    :param stddev: scalar, standard deviation used for params' initialization
    :param dtype: tf.dtypes, data type
    :return: tensor, shape = [_input.shape[0], out_dim]
    """
    with variable_scope.variable_scope(name) as scope:
        input_shape = _input.get_shape()
        # print 'shape-----------', input_shape
        assert input_shape.ndims in (2, 4, 5)
        if input_shape.ndims == 2:
            feed_in, dim = (_input, input_shape[-1].value)
        else:
            input_shape = _input.get_shape()
            dim = 1
            for dim_id in input_shape[1:].as_list():
                dim *= dim_id
            feed_in = array_ops.reshape(_input, [-1, dim])
        weights = variable_scope.get_variable(
            'weights',
            shape=[dim, out_dim],
            initializer=tf.truncated_normal_initializer(stddev=stddev,
                                                        dtype=dtype,
                                                        seed=20170705))
        biases = variable_scope.get_variable(
            'biases', [out_dim],
            initializer=tf.constant_initializer(0., dtype=dtype))
        act = nn_ops.xw_plus_b(feed_in, weights=weights, biases=biases,
                               name=scope.name)
        if relu_flag:
            return nn_ops.relu(act)
        else:
            return act
def fully_connected(inp, inp_size, layer_size, name, activation=nn_ops.relu,
                    dtype=dtypes.float32):
    """Helper method to create a fully connected hidden layer."""
    wt = variable_scope.get_variable(
        name="{}_weight".format(name),
        shape=[inp_size, layer_size],
        dtype=dtype)
    bias = variable_scope.get_variable(
        name="{}_bias".format(name),
        shape=[layer_size],
        initializer=init_ops.zeros_initializer())
    output = nn_ops.xw_plus_b(inp, wt, bias)
    if activation is not None:
        assert callable(activation)
        output = activation(output)
    return output
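A hypothetical usage sketch for the helper above. It assumes TF1-style graph mode and that the helper's own imports (variable_scope, nn_ops, init_ops, dtypes) are in scope; the placeholder shapes are invented for illustration.

import tensorflow.compat.v1 as tf1  # assumption: the TF1 compatibility API is available

tf1.disable_eager_execution()
with tf1.variable_scope("mlp"):
    inp = tf1.placeholder(tf1.float32, shape=[None, 128], name="inp")
    hidden = fully_connected(inp, inp_size=128, layer_size=64, name="h1")
    logits = fully_connected(hidden, inp_size=64, layer_size=10, name="out",
                             activation=None)  # no ReLU on the final projection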
def loop_function(prev, encoder_inputs):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    # print("encoder_inputs", encoder_inputs)
    # print("math_ops.argmax(prev, 1)", math_ops.argmax(prev, 1))
    ind = math_ops.argmax(prev, 1)
    # print(ind)
    prev_symbol = []
    r = array_ops.transpose(encoder_inputs)
    for i in xrange(batch):
        ine = math_ops.to_int32(ind[i])
        prev_symbol.append(r[i, ine])
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def loop_function(prev, _):
    logit, attention_distribution = prev
    vocab_size = array_ops.shape(logit)[1]

    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])

    prev_symbol = extract_copy_augmented_argmax(logit, attention_distribution[0])
    prev_symbol_dereferenced = dereference_copy_pointers(prev_symbol,
                                                         encoder_inputs,
                                                         vocab_size)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol_dereferenced)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def loop_function_with_sample(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    if is_sampling:
        prev_symbol_sample = tf.squeeze(tf.multinomial(prev * opt.L, 1))  # B 1 multinomial(log odds)
        prev_symbol_sample = array_ops.stop_gradient(prev_symbol_sample)  # important
        emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol_sample)
    else:
        if is_softargmax:
            prev_symbol_one_hot = tf.nn.log_softmax(prev * opt.L)  # B V
            emb_prev = tf.matmul(tf.exp(prev_symbol_one_hot), embedding)
            # solve : Requires start <= limit when delta > 0
        else:
            prev_symbol = math_ops.argmax(prev, 1)
            # Note that gradients will not propagate through the second parameter of
            # embedding_lookup.
            emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    emb_prev = tf.concat([emb_prev, h], 1) if is_fed_h else emb_prev
    if not update_embedding:  # just update projection?
        emb_prev = array_ops.stop_gradient(emb_prev)
    return (emb_prev, prev_symbol_sample) if is_sampling else emb_prev
def loop_function(prev, prev_probs, beam_size, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    prev = math_ops.log(nn_ops.softmax(prev))
    prev = nn_ops.bias_add(array_ops.transpose(prev), prev_probs)  # num_symbols * BEAM_SIZE
    prev = array_ops.transpose(prev)
    prev = array_ops.expand_dims(array_ops.reshape(prev, [-1]), 0)  # 1 * (BEAM_SIZE * num_symbols)
    probs, prev_symbolb = nn_ops.top_k(prev, beam_size)
    probs = array_ops.squeeze(probs, [0])  # BEAM_SIZE,
    prev_symbolb = array_ops.squeeze(prev_symbolb, [0])  # BEAM_SIZE,
    index = prev_symbolb // num_symbols
    prev_symbol = prev_symbolb % num_symbols

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev, probs, index, prev_symbol
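A NumPy sketch (toy sizes, illustrative only) of the transpose/bias_add trick used above: transposing to [num_symbols, beam_size] lets each beam's running log-probability be added as a "bias" over the beam axis, after which everything is flattened for top_k.

import numpy as np

beam_size, num_symbols = 2, 4
log_softmax = np.log(np.full((beam_size, num_symbols), 0.25))  # [beam, vocab]
prev_probs = np.array([-0.1, -2.3])                            # running score per beam

scored = (log_softmax.T + prev_probs).T    # add each beam's score to all of its symbols
flat = scored.reshape(1, -1)               # 1 x (beam_size * num_symbols), ready for top_k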
def lstm_decoder(H, y, opt, prefix='', feed_previous=False, is_reuse=None):
    # y: len * batch * [0, V]; H: batch * h
    # y = [tf.squeeze(y[:, i]) for i in xrange(y.get_shape()[1])]
    y = tf.unstack(y, axis=1)
    H0 = tf.squeeze(H)
    H1 = (H0, tf.zeros_like(H0))  # initialize H and C

    with tf.variable_scope(prefix + 'lstm_decoder', reuse=True):
        cell = tf.contrib.rnn.LSTMCell(opt.n_hid)
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse):
        weightInit = tf.random_uniform_initializer(-0.001, 0.001)
        W = tf.get_variable('W', [opt.n_hid, opt.n_words], initializer=weightInit)
        b = tf.get_variable('b', [opt.n_words],
                            initializer=tf.random_uniform_initializer(-0.001, 0.001))
        out_proj = (W, b) if feed_previous else None
        outputs, _ = embedding_rnn_decoder(decoder_inputs=y,
                                           initial_state=H1,
                                           cell=cell,
                                           feed_previous=feed_previous,
                                           output_projection=out_proj,
                                           num_symbols=opt.n_words,
                                           embedding_size=opt.embed_size)

    logits = [nn_ops.xw_plus_b(out, W, b) for out in outputs]
    syn_sents = [math_ops.argmax(l, 1) for l in logits]
    syn_sents = tf.stack(syn_sents, 1)
    # outputs, _ = embedding_rnn_decoder(decoder_inputs=y, initial_state=H,
    #     cell=tf.contrib.rnn.BasicLSTMCell, num_symbols=opt.n_words,
    #     embedding_size=opt.embed_size, scope=prefix + 'lstm_decoder')
    # outputs : batch * len

    loss = sequence_loss(logits[:-1], y[1:],
                         [tf.cast(tf.ones_like(yy), tf.float32) for yy in y[1:]])

    return loss, syn_sents, logits
def simple_loop_function(prev, _):
    '''Function that takes last output, and applies output projection to it'''
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    return prev
def __init__(self, feed_future_data, train, num_observation_steps, num_prediction_steps, batch_size, rnn_size, num_layers, learning_rate, learning_rate_decay_factor, input_size, max_gradient_norm): # feed_future_data: whether or not to feed the true data into the decoder instead of using a loopback # function. If false, a loopback function is used, feeding the last generated output as the next # decoder input. # train: train the model (or test) self.max_gradient_norm = max_gradient_norm self.rnn_size = rnn_size self.num_layers = num_layers dtype = tf.float32 self.batch_size = batch_size self.input_size = input_size self.observation_steps = num_observation_steps self.prediction_steps = num_prediction_steps self.learning_rate = tf.Variable(float(learning_rate), trainable=False) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) if feed_future_data and not train: print "Warning, feeding the model future sequence data (feed_forward) is not recommended when the model is not training." # The output of the multiRNN is the size of rnn_size, and it needs to match the input size, or loopback makes # no sense. Here a single layer without activation function is used, but it can be any number of # non RNN layers / functions w = tf.get_variable("proj_w", [self.rnn_size, self.input_size]) b = tf.get_variable("proj_b", [self.input_size]) output_projection = (w, b) # define layers here # input, linear RNN RNN linear etc # Default should be True, but TF 0.9 was throwing a warning, implying it was false single_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size, state_is_tuple=True) cell = single_cell if self.num_layers > 1: # state_is_tuple defaults to False in TF0.9, and thus produces a warning.... cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * self.num_layers, state_is_tuple=True) def simple_loop_function(prev, _): '''Function that takes last output, and applies output projection to it''' if output_projection is not None: prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) return prev # The seq2seq function: we use embedding for the input and attention. def seq2seq_f(encoder_inputs, decoder_inputs, feed_forward): if not feed_forward: #feed last output as next input loopback_function = simple_loop_function else: loopback_function = None #feed correct input return basic_rnn_seq2seq_with_loop_function( encoder_inputs, decoder_inputs, cell, loop_function=loopback_function, dtype=dtype) # Feeds for inputs. self.observation_inputs = [] self.future_inputs = [] self.target_weights = [] self.target_inputs = [] for i in xrange( self.observation_steps): # Last bucket is the biggest one. 
self.observation_inputs.append( tf.placeholder(tf.float32, shape=[batch_size, self.input_size], name="encoder{0}".format(i))) for i in xrange(self.prediction_steps + 1): self.future_inputs.append( tf.placeholder(tf.float32, shape=[batch_size, self.input_size], name="decoder{0}".format(i))) for i in xrange(self.prediction_steps): self.target_weights.append( tf.placeholder(dtype, shape=[batch_size], name="weight{0}".format(i))) # Because the predictions are the future sequence inputs shifted by one and do not contain the GO symbol, some # array manipulation must occur #Pass observations directly to RNN encoder, no shifting neccessary self.encoder_inputs = self.observation_inputs targets = [ self.future_inputs[i + 1] #Skip first symbol (GO) for i in xrange(len(self.future_inputs) - 1) ] #remove last decoder input, but it is kept as the last target output self.decoder_inputs = [ self.future_inputs[i] for i in xrange(len(self.future_inputs) - 1) ] if train: #Training self.outputs, self.internal_states = seq2seq_f( self.encoder_inputs, self.decoder_inputs, feed_future_data) else: #Testing self.outputs, self.internal_states = seq2seq_f( self.encoder_inputs, self.decoder_inputs, feed_future_data) # self.outputs is a list of len(decoder_steps+1) containing [size batch x rnn_size] # The output projection below reduces this to: # a list of len(decoder_steps+1) containing [size batch x input_size] if output_projection is not None: self.outputs = [ nn_ops.xw_plus_b(output, output_projection[0], output_projection[1]) for output in self.outputs ] def rmse(x, y): return tf.sqrt(tf.reduce_mean(tf.square(tf.sub(y, x)))) # TODO There are several types of cost functions to compare tracks. Implement many # Mainly, average MSE over the whole track, or just at a horizon time (t+10 or something) # There's this corner alg that Social LSTM refernces, but I haven't looked into it. self.losses = tf.nn.seq2seq.sequence_loss( self.outputs, targets, self.target_weights, softmax_loss_function=lambda x, y: rmse(x, y)) # Gradients and SGD update operation for training the model. params = tf.trainable_variables() if train: self.gradient_norms = [] self.updates = [] opt = tf.train.GradientDescentOptimizer(self.learning_rate) gradients = tf.gradients(self.losses, params) clipped_gradients, norm = tf.clip_by_global_norm( gradients, self.max_gradient_norm) self.gradient_norms.append(norm) self.updates.append( opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)) self.saver = tf.train.Saver(tf.all_variables()) tf.scalar_summary('Loss', self.losses)
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, embedding, output_size=None, num_heads=1, loop_function=None, dtype=None, scope=None, initial_state_attention=False, output_projection=None, beam_size=10): if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError( "With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError( "Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder", dtype=dtype) as scope: dtype = scope.dtype # batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value if attn_length is None: attn_length = array_ops.shape(attention_states)[1] attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable( "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append( nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append( variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) state = [] # 将encoder的最后一个隐层状态扩展成beam_size维,因为decoder阶段的batch_size是beam_size。 # initial_state是一个列表,RNN有多少层就有多少个元素,每个元素都是一个LSTMStateTuple,包含h,c两个隐层状态 # 所以要将其扩展成beam_size维,其实是把c和h进行扩展,最后再合成LSTMStateTuple就可以了 for layers in initial_state: c = [layers.c] * beam_size h = [layers.h] * beam_size c = tf.concat(c, 0) h = tf.concat(h, 0) state.append(rnn_cell_impl.LSTMStateTuple(c, h)) state = tuple(state) # state_size = int(initial_state.get_shape().with_rank(2)[1]) # states = [] # for kk in range(beam_size): # states.append(initial_state) # state = tf.concat(states, 0) # state = initial_state def attention(query): ds = [] # Results of attention reads will be stored here. if nest.is_sequence(query): # If the query is a tuple, flatten it. query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = Linear(query, attention_vec_size, True)(query) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None # attention也要定义成beam_size为的tensor batch_attn_size = array_ops.stack([beam_size, attn_size]) attns = [ array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads) ] for a in attns: # Ensure the second shape of attention vectors is set. 
a.set_shape([None, attn_size]) if initial_state_attention: attns = attention(initial_state) log_beam_probs, beam_path, beam_symbols = [], [], [] for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if i == 0: #i=0时,输入时一个batch_szie=beam_size的tensor,且里面每个元素的值都是相同的,都是<GO>标志 inp = tf.nn.embedding_lookup( embedding, tf.constant(1, dtype=tf.int32, shape=[beam_size])) if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) inputs = [inp] + attns x = Linear(inputs, input_size, True)(inputs) # Run the RNN. cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): inputs = [cell_output] + attns output = Linear(inputs, output_size, True)(inputs) if loop_function is not None: prev = output outputs.append( tf.argmax(nn_ops.xw_plus_b(output, output_projection[0], output_projection[1]), axis=1)) return outputs, state, tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]), tf.reshape( tf.concat(beam_symbols, 0), [-1, beam_size])
def embedding_attention_s2s(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, size, output_projection=None, feed_previous=False, use_lstm=False, local_p=False, dtype=tf.float32): """ Embedding with Manning, et. al. global attention model. """ reuse_f = lambda k: True if k > 0 else None S = len(encoder_inputs) with vs.variable_scope('embedding_attention_s2s') as outer_scope: with vs.variable_scope('encoder') as enc_scope: embedder_en, emb_en_input = _get_embedder( 'embedding_en', [num_encoder_symbols, size], encoder_inputs) encoder_states, prev_state = [], None for i, enc_in in enumerate(emb_en_input): with vs.variable_scope(enc_scope, reuse=reuse_f(i)): _, state = tf.nn.rnn(cell, [enc_in], initial_state=prev_state, dtype=dtype) prev_state = state encoder_states.append(state) h_s_bar = tf.transpose(tf.pack( [_get_hs(es, use_lstm) for es in encoder_states]), perm=[1, 0, 2]) with vs.variable_scope('decoder') as dec_scope: embedding_de, emb_de_input = _get_embedder( 'embedding_de', [num_decoder_symbols, size], decoder_inputs) W_a = _get_weights('W_a', [size, size]) W_c = _get_weights('W_c', [2 * size, size]) if local_p: W_p = _get_weights('W_p', [size, size]) v_p = _get_weights('v_p', [size, 1]) state = encoder_states[-1] partial_scores = [ tf.reshape(tf.matmul(_get_hs(es, use_lstm), W_a), [-1, 1, size]) for es in encoder_states ] output, outputs = None, [] for i, dec_in in enumerate(emb_de_input): with vs.variable_scope(dec_scope, reuse=reuse_f(i)): # Loop function at test time if feed_previous and output is not None: w, b = output_projection prev = nn_ops.xw_plus_b(output, w, b) prev_symbol = math_ops.argmax(prev, 1) dec_in = em_ops.embedding_lookup( embedding_de, prev_symbol) _, state = cell(dec_in, state) h_t = _get_hs(state, use_lstm) batch_h_t = tf.reshape(h_t, [-1, size, 1]) if local_p: align = tf.matmul(v_p, tf.tanh( tf.matmul(W_p, h_t, transpose_b=True)), transpose_a=True) p_t = (S - 1) * tf.sigmoid(align) scale = tf.exp(-4.5 * tf.square((p_t - tf.reshape( tf.cast(tf.range(S), dtype), [-1, 1])) / S)) #scale = tf.concat(0, [tf.exp(-4.5 * tf.square((p_t - s)/S)) for s in xrange(S)]) scores = tf.nn.softmax(tf.pack([ tf.reshape(tf.batch_matmul(ps, batch_h_t), [-1]) for ps in partial_scores ]), dim=0) if local_p: scores *= scale scores /= tf.reduce_sum(scores, 0) scores = tf.reshape(scores, [-1, 1, S]) c_t = tf.reshape(tf.batch_matmul(scores, h_s_bar), [-1, size]) h_bar_t = tf.concat(1, [c_t, _get_hs(state, use_lstm)]) output = tf.tanh(tf.matmul(h_bar_t, W_c)) outputs.append(output) return outputs, state
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False, output_projection=None, beam_size=10): """RNN decoder with attention for the sequence-to-sequence model. In this context "attention" means that, during decoding, the RNN can look up information in the additional tensor attention_states, and it does this by focusing on a few entries from the tensor. This model has proven to yield especially good results in a number of sequence-to-sequence tasks. This implementation is based on http://arxiv.org/abs/1412.7449 (see below for details). It is recommended for complex sequence-to-sequence tasks. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. output_size: Size of the output vectors; if None, we use cell.output_size. num_heads: Number of attention heads that read from attention_states. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either the i-th element of decoder_inputs or loop_function(output {i-1}, i)) as follows. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). state: The state of each decoder cell the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when num_heads is not positive, there are no inputs, shapes of attention_states are not set, or input size cannot be inferred from the input. 
""" if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) print("Initial_state") state = initial_state def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) # for c in range(ct): ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = tf.stack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = [] attns.append(attention(initial_state)) tmp = tf.reshape(tf.concat(axis=0, values=attns), [-1, attn_size]) attns = [] attns.append(tmp) log_beam_probs, beam_path, beam_symbols = [],[],[] for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None : with variable_scope.variable_scope("loop_function", reuse=True): if prev is not None: inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) input_size = inp.get_shape().with_rank(2)[1] x = linear([inp] + attns, input_size, True) cell_output, state = cell(x, state) # Run the attention mechanism. 
if i == 0 and initial_state_attention: with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + attns, output_size, True) if loop_function is not None: prev = output if i ==0: with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) outputs.append(tf.argmax(nn_ops.xw_plus_b( output, output_projection[0], output_projection[1]), axis=1)) return outputs, state, tf.reshape(tf.concat(axis=0, values=beam_path),[-1,beam_size]), tf.reshape(tf.concat(axis=0, values=beam_symbols),[-1,beam_size])
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    prev_symbol = math_ops.argmax(prev, 1)
    return _embed_decoder(prev_symbol, embedding, attention_states)
def lstm_decoder_embedding(H, y, W_emb, opt, prefix='', add_go=False, feed_previous=False, is_reuse=None, is_fed_h=True, is_sampling=False, is_softargmax=False, beam_width=None): biasInit = bias_init if add_go: y = tf.concat([tf.ones([opt.batch_size, 1], dtype=tf.int32), y], 1) y = tf.unstack(y, axis=1) # 1, . , . if hasattr(opt, 'global_feature') and opt.global_feature: H = layers.fully_connected(H, num_outputs=opt.n_hid, biases_initializer=biasInit, activation_fn=None, scope=prefix + 'lstm_decoder', reuse=is_reuse) H0 = tf.squeeze(H) H1 = (H0, tf.zeros_like(H0)) # tf.zeros_like(H0) # initialize C and H# y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h \ else [tf.nn.embedding_lookup(W_emb, features) for features in y] with tf.variable_scope(prefix + 'lstm_decoder', reuse=True): cell = tf.contrib.rnn.LSTMCell(opt.n_hid) with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse): weightInit = weight_init W = tf.get_variable('W', [opt.n_hid, opt.embed_size], initializer=weightInit) b = tf.get_variable('b', [opt.n_words], initializer=bias_init) W_new = tf.matmul(W, W_emb, transpose_b=True) # h* V out_proj = (W_new, b) if feed_previous else None decoder_res = rnn_decoder_custom_embedding(emb_inp=y_input, initial_state=H1, cell=cell, embedding=W_emb, opt=opt, feed_previous=feed_previous, output_projection=out_proj, num_symbols=opt.n_words, is_fed_h=is_fed_h, is_softargmax=is_softargmax, is_sampling=is_sampling) outputs = decoder_res[0] logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs ] # hidden units to prob logits: out B*h W: h*E Wemb V*E if is_sampling: syn_sents = decoder_res[2] loss = sequence_loss( logits[:-1], syn_sents, [tf.cast(tf.ones_like(yy), tf.float32) for yy in syn_sents]) syn_sents = tf.stack(syn_sents, 1) else: syn_sents = [math_ops.argmax(l, 1) for l in logits[:-1]] syn_sents = tf.stack(syn_sents, 1) ones = tf.ones([opt.batch_size], dtype=tf.float32) mask = [ones, ones] + [ tf.cast(tf.not_equal(yy, dp.PAD_ID), tf.float32) for yy in y[1:-2] ] loss_all = sequence_loss_by_example(logits[:-1], y[1:], mask) loss = tf.reduce_mean(loss_all) return loss, syn_sents, logits, loss_all
def pad_output_function(output):
    return nn_ops.xw_plus_b(output,
                            pad_output_projection[0],
                            pad_output_projection[1],
                            name="pad_output_projection")
def MDN_output_function(output):
    return nn_ops.xw_plus_b(output,
                            MDN_output_projection[0],
                            MDN_output_projection[1],
                            name="MDN_output_projection")
def gru_decoder_embedding(H, y, W_emb, opt, prefix='', add_go=False, feed_previous=False, is_reuse=None, is_fed_h=True, is_sampling=False, is_softargmax=False, beam_width=None, res=None): #y len* batch * [0,V] H batch * h biasInit = tf.constant_initializer(0.001, dtype=tf.float32) #y = [tf.squeeze(y[:,i]) for i in xrange(y.get_shape()[1])] if add_go: y = tf.concat([tf.ones([opt.batch_size, 1], dtype=tf.int32), y], 1) y = tf.unstack(y, axis=1) # 1, . , . # make the size of hidden unit to be n_hid if not opt.additive_noise_lambda: H = layers.fully_connected(H, num_outputs=opt.n_hid, biases_initializer=biasInit, activation_fn=None, scope=prefix + 'gru_decoder', reuse=is_reuse) H0 = tf.squeeze(H) # H1 = (H0, tf.zeros_like(H0)) # initialize H and C # H1 = H0 y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h \ else [tf.nn.embedding_lookup(W_emb, features) for features in y] with tf.variable_scope(prefix + 'gru_decoder', reuse=True): cell = tf.contrib.rnn.GRUCell(opt.n_hid) # cell = tf.contrib.rnn.GRUCell(opt.maxlen) with tf.variable_scope(prefix + 'gru_decoder', reuse=is_reuse): weightInit = tf.random_uniform_initializer(-0.001, 0.001) W = tf.get_variable('W', [opt.n_hid, opt.embed_size], initializer=weightInit) b = tf.get_variable('b', [opt.vocab_size], initializer=tf.random_uniform_initializer( -0.001, 0.001)) W_new = tf.matmul(W, W_emb, transpose_b=True) # h* V out_proj = (W_new, b) if feed_previous else None decoder_res = rnn_decoder_custom_embedding_gru( emb_inp=y_input, initial_state=H1, cell=cell, embedding=W_emb, opt=opt, feed_previous=feed_previous, output_projection=out_proj, num_symbols=opt.vocab_size, is_fed_h=is_fed_h, is_softargmax=is_softargmax, is_sampling=is_sampling) outputs = decoder_res[0] if beam_width: #cell = rnn_cell.LSTMCell(cell_depth) #batch_size_tensor = constant_op.constant(opt.batch_size) initial_state = cell.zero_state( opt.batch_size * beam_width, tf.float32 ) #beam_search_decoder.tile_batch(H0, multiplier=beam_width) output_layer = layers_core.Dense(opt.vocab_size, use_bias=True, kernel_initializer=W_new, bias_initializer=b, activation=None) bsd = beam_search_decoder.BeamSearchDecoder( cell=cell, embedding=W_emb, start_tokens=array_ops.fill([opt.batch_size], dp.GO_ID), # go is 1 end_token=dp.EOS_ID, initial_state=initial_state, beam_width=beam_width, output_layer=output_layer, length_penalty_weight=0.0) #pdb.set_trace() final_outputs, final_state, final_sequence_lengths = ( decoder.dynamic_decode(bsd, output_time_major=False, maximum_iterations=opt.maxlen)) beam_search_decoder_output = final_outputs.beam_search_decoder_output #print beam_search_decoder_output.get_shape() logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs ] # hidden units to prob logits: out B*h W: h*E Wemb V*E if is_sampling: syn_sents = decoder_res[2] loss = sequence_loss( logits[:-1], syn_sents, [tf.cast(tf.ones_like(yy), tf.float32) for yy in syn_sents]) #loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in syn_sents]) #loss = sequence_loss(logits[:-1], syn_sents, [tf.concat([tf.ones([1]), tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32)],0) for yy in syn_sents[:-1]]) # use one more pad after EOS syn_sents = tf.stack(syn_sents, 1) else: syn_sents = [math_ops.argmax(l, 1) for l in logits] syn_sents = tf.stack(syn_sents, 1) loss = sequence_loss( logits[:-1], y[1:], [tf.cast(tf.ones_like(yy), tf.float32) for yy in y[1:]]) #loss = sequence_loss(logits[:-1], y[1:], 
[tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in y[:-1]]) # use one more pad after EOS #outputs, _ = embedding_rnn_decoder(decoder_inputs = y, initial_state = H, cell = tf.contrib.rnn.BasicLSTMCell, num_symbols = opt.vocab_size, embedding_size = opt.embed_size, scope = prefix + 'lstm_decoder') # outputs : batch * len # save the res if res is not None: res['outputs'] = [tf.multiply(out, W) for out in outputs] return loss, syn_sents, logits
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False, output_projection=None, beam_size=10): """RNN decoder with attention for the sequence-to-sequence model. In this context "attention" means that, during decoding, the RNN can look up information in the additional tensor attention_states, and it does this by focusing on a few entries from the tensor. This model has proven to yield especially good results in a number of sequence-to-sequence tasks. This implementation is based on http://arxiv.org/abs/1412.7449 (see below for details). It is recommended for complex sequence-to-sequence tasks. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. output_size: Size of the output vectors; if None, we use cell.output_size. num_heads: Number of attention heads that read from attention_states. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either the i-th element of decoder_inputs or loop_function(output {i-1}, i)) as follows. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). state: The state of each decoder cell the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when num_heads is not positive, there are no inputs, shapes of attention_states are not set, or input size cannot be inferred from the input. 
""" if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) print "Initial_state" state_size = int(initial_state.get_shape().with_rank(2)[1]) states =[] for kk in range(1): states.append(initial_state) state = tf.reshape(tf.concat(0, states), [-1, state_size]) def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) # for c in range(ct): ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = array_ops.pack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = [] attns.append(attention(initial_state)) tmp = tf.reshape(tf.concat(0, attns), [-1, attn_size]) attns = [] attns.append(tmp) log_beam_probs, beam_path, beam_symbols = [],[],[] for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None : with variable_scope.variable_scope("loop_function", reuse=True): if prev is not None: inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) input_size = inp.get_shape().with_rank(2)[1] x = linear([inp] + attns, input_size, True) cell_output, state = cell(x, state) # Run the attention mechanism. 
if i == 0 and initial_state_attention: with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + attns, output_size, True) if loop_function is not None: prev = output if i ==0: states =[] for kk in range(beam_size): states.append(state) state = tf.reshape(tf.concat(0, states), [-1, state_size]) with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) outputs.append(tf.argmax(nn_ops.xw_plus_b( output, output_projection[0], output_projection[1]), dimension=1)) return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size])
def argmax_loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    prev_symbol = math_ops.argmax(prev, 1)
    return prev_symbol
def __call__(self, inputs, state, scope=None): """Run the input projection and then the cell.""" dtype = inputs.dtype memory = array_ops.identity(self.memory) # array_ops.ref_identity() # deep_copy(self.memory) with vs.variable_scope("memory_projection"): c_t, h_t = state v = math_ops.tanh(nn_ops.xw_plus_b(h_t, self.w, self.b)) if v.get_shape()[0] != self.batch_size: raise Exception("Beam Search Not supported now!") else: similarity = math_ops.matmul( array_ops.expand_dims(v, 1), # batch_size, 1 , embedding_size array_ops.transpose(memory, [0, 2, 1])) weight = nn_ops.softmax( array_ops.squeeze(similarity) # batch_size, topic_num ) weight_tile = gen_array_ops.tile(array_ops.expand_dims( weight, -1), [1, 1, self.embedding_size], name="weight") mt = math_ops.reduce_sum(memory * weight_tile, axis=1) # update memory if self.update_mem: gate = math_ops.matmul(memory, array_ops.expand_dims( inputs, axis=2)) # [batch_size, num, 1] gate = math_ops.sigmoid( gen_array_ops.squeeze(gate)) # batch_size x num inputs_expand = gen_array_ops.tile( array_ops.expand_dims(inputs, axis=1), [1, self.mem_num, 1]) # batch_size x num x embedding uu_tile = gen_array_ops.tile( array_ops.expand_dims(self.uu, axis=0), [self.batch_size, 1, 1 ]) # batch_size x embedding x embedding vv_tile = gen_array_ops.tile( array_ops.expand_dims(self.uv, axis=0), [self.batch_size, 1, 1 ]) # batch_size x embedding x embedding candidate = math_ops.add( math_ops.matmul(inputs_expand, uu_tile), math_ops.matmul(memory, vv_tile)) # batch_size x num x embedding # print(gate) gate_tile = gen_array_ops.tile(array_ops.expand_dims(gate, 2), [1, 1, self.embedding_size]) updated_mem = (1 - gate_tile) * memory + gate_tile * candidate self.memory = updated_mem with vs.variable_scope("attention_mechanism"): encoder_processed = self.memory_layer( self.encoder_outputs) # map to attention size # [batch_size, hidden_size] -> [batch_size, 1, attention_size] query_processed = array_ops.expand_dims(self.query_layer(c_t), 1) scores = math_ops.reduce_sum( self.attention_v * math_ops.tanh(encoder_processed + query_processed), [2]) alpha = nn_ops.softmax(scores, axis=1) output_hidden_size = self.encoder_outputs.shape[2].value alpha_tile = gen_array_ops.tile(array_ops.expand_dims(alpha, -1), [1, 1, output_hidden_size], name="weight") # print(weight_tile) # batch_size x num x embedding_size weighted_sum = math_ops.reduce_sum(self.encoder_outputs * alpha_tile, axis=1) return self._cell(tf.concat([inputs, weighted_sum, mt], axis=1), state)
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=1):
    """RNN decoder for the sequence-to-sequence model.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, this function will be applied to the i-th
        output in order to generate the i+1-st input, and decoder_inputs will
        be ignored, except for the first element ("GO" symbol). This can be
        used for decoding, but also for training to emulate
        http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x input_size].
      scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x output_size] containing generated outputs.
        state: The state of each cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
          (Note that in some cases, like basic RNN cell or GRU cell, outputs and
           states can be the same. They are different for LSTM cells though.)
    """
    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        outputs = []
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        state_size = int(initial_state.get_shape().with_rank(2)[1])

        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            input_size = inp.get_shape().with_rank(2)[1]
            x = inp
            output, state = cell(x, state)
            if loop_function is not None:
                prev = output
            if i == 0:
                states = []
                for kk in range(beam_size):
                    states.append(state)
                state = tf.reshape(tf.concat(0, states), [-1, state_size])
            outputs.append(
                tf.argmax(nn_ops.xw_plus_b(output, output_projection[0],
                                           output_projection[1]),
                          dimension=1))
    return (outputs, state,
            tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
    return prev
def loop_function(prev, out_proj, embedding):
    prev = nn_ops.xw_plus_b(prev, out_proj[0], out_proj[1])
    prev_symbol = math_ops.argmax(prev, axis=1)
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    return [emb_prev, prev_symbol]
def _BuildAndTestMiniMNIST(self, param_index, tag):
  # Fix seed to avoid occasional flakiness
  np.random.seed(6)

  # Hyperparameters
  batch = 3
  inputs = 16
  features = 32
  classes = 10

  # Define the parameters
  inp_data = np.random.random_sample(inputs * batch)
  hidden_weight_data = np.random.randn(inputs * features) / np.sqrt(inputs)
  hidden_bias_data = np.random.random_sample(features)
  sm_weight_data = np.random.randn(features * classes) / np.sqrt(features)
  sm_bias_data = np.random.random_sample(classes)

  # special care for labels since they need to be normalized per batch
  label_data = np.random.random(batch * classes).reshape((batch, classes))
  s = label_data.sum(axis=1)
  label_data /= s[:, None]

  with self.session(use_gpu=True):
    # We treat the inputs as "parameters" here
    inp = constant_op.constant(
        inp_data.tolist(),
        shape=[batch, inputs],
        dtype=dtypes.float64,
        name="inp")
    hidden_weight = constant_op.constant(
        hidden_weight_data.tolist(),
        shape=[inputs, features],
        dtype=dtypes.float64,
        name="hidden_weight")
    hidden_bias = constant_op.constant(
        hidden_bias_data.tolist(),
        shape=[features],
        dtype=dtypes.float64,
        name="hidden_bias")
    softmax_weight = constant_op.constant(
        sm_weight_data.tolist(),
        shape=[features, classes],
        dtype=dtypes.float64,
        name="softmax_weight")
    softmax_bias = constant_op.constant(
        sm_bias_data.tolist(),
        shape=[classes],
        dtype=dtypes.float64,
        name="softmax_bias")

    # List all the parameter so that we can test them one at a time
    all_params = [
        inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
    ]
    param_sizes = [
        [batch, inputs],  # inp
        [inputs, features],  # hidden_weight,
        [features],  # hidden_bias
        [features, classes],  # softmax_weight,
        [classes]  # softmax_bias
    ]

    # Now, Building MNIST
    features = nn_ops.relu(
        nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
    logits = nn_ops.xw_plus_b(
        features, softmax_weight, softmax_bias, name="logits")
    labels = constant_op.constant(
        label_data.tolist(),
        shape=[batch, classes],
        dtype=dtypes.float64,
        name="labels")
    cost = nn_ops.softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name="cost")

    # Test the gradients.
    err = gradient_checker.compute_gradient_error(
        all_params[param_index],
        param_sizes[param_index],
        cost, [batch],
        delta=1e-5)

  tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
  return err
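A NumPy-only sketch of the idea that compute_gradient_error verifies above: compare an analytic gradient with a central finite difference. The stand-in cost below is a simple squared sum, not the MNIST softmax cost.

import numpy as np

def loss(w, x, b):
    return float(np.sum((x @ w + b) ** 2))   # toy stand-in cost

x = np.random.randn(3, 4)
w = np.random.randn(4, 2)
b = np.zeros(2)
analytic = 2.0 * x.T @ (x @ w + b)            # d loss / d w

delta = 1e-5
numeric = np.zeros_like(w)
for idx in np.ndindex(*w.shape):
    w_plus, w_minus = w.copy(), w.copy()
    w_plus[idx] += delta
    w_minus[idx] -= delta
    numeric[idx] = (loss(w_plus, x, b) - loss(w_minus, x, b)) / (2 * delta)

max_error = np.abs(analytic - numeric).max()  # should be tiny, like the test asserts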