def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
   features = nn_ops.relu(
       nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
   logits = nn_ops.xw_plus_b(
       features, softmax_weight, softmax_bias, name="logits")
   labels = constant_op.constant(
       label_data.tolist(),
       shape=[batch, classes],
       dtype=dtypes.float64,
       name="labels")
   cost = nn_ops.softmax_cross_entropy_with_logits(
       labels=labels, logits=logits, name="cost")
   return cost
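
For orientation: nn_ops.xw_plus_b(x, w, b) is simply the affine map x @ w + b. A minimal NumPy sketch of the same forward pass and loss, with made-up shapes standing in for the original batch, classes, and label_data:

import numpy as np

batch, n_inputs, n_hidden, classes = 4, 8, 16, 3
rng = np.random.default_rng(0)

inp = rng.normal(size=(batch, n_inputs))
hidden_weight = rng.normal(size=(n_inputs, n_hidden))
hidden_bias = np.zeros(n_hidden)
softmax_weight = rng.normal(size=(n_hidden, classes))
softmax_bias = np.zeros(classes)

features = np.maximum(inp @ hidden_weight + hidden_bias, 0.0)   # relu(xw_plus_b(...))
logits = features @ softmax_weight + softmax_bias               # xw_plus_b(features, ...)

labels = np.eye(classes)[rng.integers(0, classes, size=batch)]   # one-hot stand-in
shifted = logits - logits.max(axis=1, keepdims=True)             # stable log-softmax
log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
cost = -(labels * log_probs).sum(axis=1)                         # per-example cross-entropy
print(cost.shape)  # (4,)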
Example #2
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
  """Run a softmax layer over all the time steps of an input sequence.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: output depth
    scope: optional scope name
    name: optional name for output tensor
    linear_name: name for linear (pre-softmax) output

  Returns:
    A tensor of size (length, batch_size, noutput).

  """
  length, _, ninputs = _shape(inputs)
  inputs_u = array_ops.unstack(inputs)
  output_u = []
  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
    initial_b = constant_op.constant(0.1, shape=[noutput])
    w = variables.model_variable("weights", initializer=initial_w)
    b = variables.model_variable("biases", initializer=initial_b)
    for i in xrange(length):
      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                         [inputs_u[i]]):
        # TODO(tmb) consider using slim.fully_connected(...,
        # activation_fn=tf.nn.softmax)
        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
        output = nn_ops.softmax(linear)
        output_u += [output]
    outputs = array_ops.stack(output_u, name=name)
  return outputs
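
Roughly the same computation in NumPy, assuming a (length, batch_size, depth) input and one weight matrix shared across all time steps:

import numpy as np

length, batch_size, ninputs, noutput = 5, 2, 4, 3
rng = np.random.default_rng(0)

inputs = rng.normal(size=(length, batch_size, ninputs))
w = rng.normal(scale=0.1, size=(ninputs, noutput))  # shared weights, as in the scope above
b = np.full(noutput, 0.1)

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

# unstack over time, apply the same linear + softmax, then restack
outputs = np.stack([softmax(step @ w + b) for step in inputs])
print(outputs.shape)  # (5, 2, 3) == (length, batch_size, noutput)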
Example #3
 def extract_argmax_and_embed(prev, _):
   """Loop_function that extracts the symbol from prev and embeds it."""
   if output_projection is not None:
     prev = nn_ops.xw_plus_b(
         prev, output_projection[0], output_projection[1])
   prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
   return embedding_ops.embedding_lookup(embedding, prev_symbol)
  def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
      prev = nn_ops.xw_plus_b(
          prev, output_projection[0], output_projection[1])
     # prev = prev.get_shape().with_rank(2)[1]
     probs = tf.log(tf.nn.softmax(prev))

     if i > 1:
       probs = tf.reshape(probs + log_beam_probs[-1],
                          [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols # Which word in vocabulary.
    beam_parent = indices // num_symbols # Which hypothesis it came from.


    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.

    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev  = tf.reshape(emb_prev,[beam_size,embedding_size])
    # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding:
      emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
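
The key trick above is the index arithmetic after top_k: once the scores are flattened to [-1, beam_size * num_symbols], `indices % num_symbols` recovers the word and `indices // num_symbols` the parent hypothesis. A toy NumPy illustration with made-up scores:

import numpy as np

num_symbols, beam_size = 5, 3
rng = np.random.default_rng(0)

# pretend log-probs over (hypothesis, word) pairs, already flattened
flat_scores = rng.normal(size=beam_size * num_symbols)
indices = np.argsort(flat_scores)[-beam_size:][::-1]  # rough stand-in for tf.nn.top_k

print(indices % num_symbols)    # which word in the vocabulary
print(indices // num_symbols)   # which hypothesis (beam parent) it came from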
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                scope=None,output_projection=None, beam_size=10):
  """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing generated outputs.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
        (Note that in some cases, like basic RNN cell or GRU cell, outputs and
         states can be the same. They are different for LSTM cells though.)
  """
  with variable_scope.variable_scope(scope or "rnn_decoder"):
    state = initial_state
    outputs = []
    prev = None
    log_beam_probs, beam_path, beam_symbols = [],[],[]
    state_size = int(initial_state.get_shape().with_rank(2)[1])

    for i, inp in enumerate(decoder_inputs):
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols)
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()

      input_size = inp.get_shape().with_rank(2)[1]
      print(input_size)
      x = inp
      output, state = cell(x, state)

      if loop_function is not None:
        prev = output
      if i == 0:
        states = []
        for kk in range(beam_size):
          states.append(state)
        state = tf.reshape(tf.concat(0, states), [-1, state_size])

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), dimension=1))
  return (outputs, state,
          tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
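
The beam_path and beam_symbols tensors returned above are what a caller would backtrack through to recover an actual hypothesis; the decoder itself never does this. A hedged sketch of that backtrace, with hypothetical per-step arrays for a single batch element:

import numpy as np

# hypothetical outputs for 4 decode steps with beam_size = 3
beam_symbols = np.array([[7, 2, 9], [4, 4, 1], [3, 8, 8], [5, 0, 6]])  # token ids
beam_path    = np.array([[0, 0, 0], [0, 2, 1], [1, 0, 0], [2, 2, 1]])  # parent beams

def backtrace(last_step, beam):
    """Follow parent pointers backwards to recover one token sequence."""
    tokens = []
    for t in range(last_step, -1, -1):
        tokens.append(int(beam_symbols[t, beam]))
        beam = int(beam_path[t, beam])
    return tokens[::-1]

print(backtrace(3, 0))  # e.g. [7, 4, 8, 5]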
Example #6
 def loop_function(prev, _):
     if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
     prev_symbol = math_ops.argmax(prev, 1)
     # Note that gradients will not propagate through the second parameter of
     # embedding_lookup.
     emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
     if not update_embedding:
         emb_prev = array_ops.stop_gradient(emb_prev)
     return emb_prev
Example #7
 def loop_function(prev, _):
     if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                 output_projection[1])
     prev_symbol = math_ops.argmax(prev, 1)
     # Note that gradients will not propagate through the second parameter of
     # embedding_lookup.
     emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
     if not update_embedding:
         emb_prev = array_ops.stop_gradient(emb_prev)
     return emb_prev
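
Examples #6 and #7 are the standard greedy feedback loop: project the previous output, take the argmax symbol, and feed its embedding back as the next input. The same three steps in a NumPy sketch with assumed shapes:

import numpy as np

batch, output_size, vocab, embed = 2, 8, 10, 6
rng = np.random.default_rng(0)

prev = rng.normal(size=(batch, output_size))                   # previous decoder output
W, b = rng.normal(size=(output_size, vocab)), np.zeros(vocab)  # output_projection
embedding = rng.normal(size=(vocab, embed))

logits = prev @ W + b                 # nn_ops.xw_plus_b(prev, W, b)
prev_symbol = logits.argmax(axis=1)   # math_ops.argmax(prev, 1)
emb_prev = embedding[prev_symbol]     # embedding_ops.embedding_lookup
print(emb_prev.shape)                 # (2, 6): the next decoder input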
Example #8
   def loop_function(prev, _):

      if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                 output_projection[1])

      tf_prev_symbol = batch_sample_with_temperature(prev)
      emb_prev = embedding_ops.embedding_lookup(embedding, tf_prev_symbol)

      if not update_embedding:
         emb_prev = array_ops.stop_gradient(emb_prev)
      return emb_prev
Example #9
    def loop_function(prev, _):

        if output_projection is not None:
            prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                    output_projection[1])

        tf_prev_symbol = batch_sample_with_temperature(prev)
        emb_prev = embedding_ops.embedding_lookup(embedding, tf_prev_symbol)

        if not update_embedding:
            emb_prev = array_ops.stop_gradient(emb_prev)
        return emb_prev
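
batch_sample_with_temperature is a helper defined elsewhere in these projects; a common way to implement that kind of sampling is to draw from a temperature-scaled softmax, roughly as in this sketch (the temperature value here is purely illustrative):

import numpy as np

def sample_with_temperature(logits, temperature=1.0, rng=np.random.default_rng(0)):
    """Sample one symbol per row from softmax(logits / temperature)."""
    scaled = logits / temperature
    scaled = scaled - scaled.max(axis=1, keepdims=True)
    probs = np.exp(scaled) / np.exp(scaled).sum(axis=1, keepdims=True)
    return np.array([rng.choice(len(p), p=p) for p in probs])

logits = np.array([[2.0, 0.5, 0.1], [0.2, 0.2, 3.0]])
print(sample_with_temperature(logits, temperature=0.7))  # one vocabulary id per row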
Example #10
def to_check(logits, outputs, out_proj):
    softmax_outputs = []
    argmax_outputs = []
    for j, outs in enumerate(logits):  #length_id
        softmax_outputs.append([])
        argmax_outputs.append([])
        for i in range(len(outs)):  #batch_id
            projected_out = nn_ops.xw_plus_b(outs[i],
                                             out_proj[0], out_proj[1])
            softmax_outputs[j].append(tf.nn.softmax(projected_out))
            argmax_outputs[j].append(math_ops.argmax(projected_out, axis=1))
    return softmax_outputs, argmax_outputs
Example #11
def project_and_apply_input_bias(logits, output_projection, input_bias):
    if output_projection is not None:
        logits = nn_ops.xw_plus_b(
            logits, output_projection[0], output_projection[1])

    # Apply softmax to ensure all tokens have a positive value.
    probs = tf.nn.softmax(logits)

    # Apply input bias, which is a mask of shape [batch, vocab len]
    # where each token from the input in addition to all "corrective"
    # tokens are set to 1.0.
    return tf.mul(probs, input_bias)
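
tf.mul is the pre-1.0 name for element-wise multiplication (later tf.multiply), so the mask simply zeroes out the probability of every token that is not in the input or the corrective set. A tiny NumPy illustration with a hypothetical mask:

import numpy as np

probs = np.array([[0.1, 0.3, 0.4, 0.2]])        # softmax output, shape [batch, vocab]
input_bias = np.array([[1.0, 0.0, 1.0, 0.0]])   # 1.0 for tokens allowed to be emitted
masked = probs * input_bias                     # tf.mul(probs, input_bias)
print(masked)  # [[0.1 0.  0.4 0. ]]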
Example #12
    def loop_function(prev, _):
        if output_projection is not None:
            prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                    output_projection[1])

        if isinstance(mc_search, bool):
            prev_symbol = (tf.reshape(tf.multinomial(prev, 1), [-1])
                           if mc_search else math_ops.argmax(prev, 1))
        else:
            prev_symbol = tf.cond(
                mc_search,
                lambda: tf.reshape(tf.multinomial(prev, 1), [-1]),
                lambda: tf.argmax(prev, 1))

        emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
        if not update_embedding:
            emb_prev = array_ops.stop_gradient(emb_prev)
        return emb_prev
Example #13
def fc(name, x, num_units_out):
  num_units_in = x.shape[1]
  weights_initializer = init_ops.truncated_normal_initializer(stddev=0.01)

  with vs.variable_scope(name):
    weights = _get_variable('weights',
                            shape=[num_units_in, num_units_out],
                            init=weights_initializer)
    biases = _get_variable('biases',
                           shape=[num_units_out],
                           init=init_ops.constant_initializer(0.0))

    x = nn_ops.xw_plus_b(x, weights, biases)

  return x
Example #14
    def loop_function(prev, i, beam_symbols, beam_path, beam_log_probs):
        """Get a loop_function that extract the beam_sized previous symbols
        and embeds it.

        Args:
            prev: previous decoder output of shape [batch_size * beam_size, num_symbols]
                if i > 1 else [batch_size, num_symbols].
            i: decoding step.
            beam_symbols: a (i-1)-length list of tensors in shape [batch_size, beam_size],
                which are symbols in the beam at each step.
            beam_path: a (i-1)-length list of tensors in shape [batch_size, beam_size],
                which are indices for previous symbols in the beam at each step.
            beam_log_probs: a (i-1)-length list of tensors in shape [batch_size * beam_size, 1],
                which are log probabilities in the beam at each step.

        """
        if output_projection is not None:
            prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                    output_projection[1])

        log_probs = tf.log(tf.nn.softmax(prev))

        if i > 1:
            # broadcasting occurs in the add operation where beam_log_probs[-1]
            # is in shape [batch_size * beam_size, 1].
            log_probs = tf.reshape(log_probs + beam_log_probs[-1],
                                   [-1, beam_size * num_symbols])

        # Both returns are in shape [batch_size, beam_size].
        best_log_probs, best_indices = tf.nn.top_k(log_probs, beam_size)
        # Reshape best_indices to shape [batch_size * beam_size].
        best_indices = tf.stop_gradient(
            tf.squeeze(tf.reshape(best_indices, [-1, 1])))
        # Reshape best_log_probs to shape [batch_size * beam_size, 1].
        best_log_probs = tf.stop_gradient(tf.reshape(best_log_probs, [-1, 1]))

        symbols = best_indices % num_symbols
        parent_indices = best_indices // num_symbols

        beam_symbols.append(tf.reshape(symbols, [-1, beam_size]))
        beam_path.append(tf.reshape(parent_indices, [-1, beam_size]))
        beam_log_probs.append(best_log_probs)

        # emb_prev has shape [batch_size * beam_size, embedding_size].
        emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
        if not update_embedding_for_previous:
            emb_prev = tf.stop_gradient(emb_prev)
        return tf.reshape(emb_prev, [-1, embedding_size])
Example #15
def _fc(_input,
        out_dim,
        name="fc",
        relu_flag=True,
        stddev=0.01,
        dtype=dtypes.float32):
    """
    A wrapped full connection layer used in normal-fc or conv-fc(2D or 3D)
    :param _input: tensor, shape's ndim must be 2(normal) or 4(conv2d) or 5(conv3d)
    :param out_dim: scalar, outout dimension
    :param relu_flag: bool, whether using Relu after fc operation
    :param stddev: scalar, standard deviation used for params' initialization
    :param dtype: tf.dtypes, data type
    :return:
        tensor, shape = [_input.shape[0], out_dim]
    """
    with variable_scope.variable_scope(name) as scope:
        input_shape = _input.get_shape()
        # print 'shape-----------', input_shape
        assert input_shape.ndims in (2, 4, 5)
        if input_shape.ndims == 2:
            feed_in, dim = (_input, input_shape[-1].value)
        else:
            input_shape = _input.get_shape()
            dim = 1
            for dim_id in input_shape[1:].as_list():
                dim *= dim_id
            feed_in = array_ops.reshape(_input, [-1, dim])
        weights = variable_scope.get_variable(
            'weights',
            shape=[dim, out_dim],
            initializer=tf.truncated_normal_initializer(stddev=stddev,
                                                        dtype=dtype,
                                                        seed=20170705))
        biases = variable_scope.get_variable(
            'biases', [out_dim],
            initializer=tf.constant_initializer(0., dtype=dtype))
        act = nn_ops.xw_plus_b(feed_in,
                               weights=weights,
                               biases=biases,
                               name=scope.name)
        if relu_flag:
            return nn_ops.relu(act)
        else:
            return act
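
The part worth noting in _fc is the conv-to-fc transition: a 4D or 5D activation is flattened to [batch, prod(remaining dims)] before the affine layer. A NumPy sketch under an assumed conv2d-style shape:

import numpy as np

rng = np.random.default_rng(0)
conv_out = rng.normal(size=(2, 7, 7, 64))     # [batch, height, width, channels]
out_dim = 10

dim = int(np.prod(conv_out.shape[1:]))        # 7 * 7 * 64
feed_in = conv_out.reshape(-1, dim)           # array_ops.reshape(_input, [-1, dim])

weights = rng.normal(scale=0.01, size=(dim, out_dim))
biases = np.zeros(out_dim)
act = feed_in @ weights + biases              # nn_ops.xw_plus_b
out = np.maximum(act, 0.0)                    # optional ReLU (relu_flag=True)
print(out.shape)                              # (2, 10)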
Example #16
def fully_connected(inp,
                    inp_size,
                    layer_size,
                    name,
                    activation=nn_ops.relu,
                    dtype=dtypes.float32):
  """Helper method to create a fully connected hidden layer."""
  wt = variable_scope.get_variable(
      name="{}_weight".format(name), shape=[inp_size, layer_size], dtype=dtype)
  bias = variable_scope.get_variable(
      name="{}_bias".format(name),
      shape=[layer_size],
      initializer=init_ops.zeros_initializer())
  output = nn_ops.xw_plus_b(inp, wt, bias)
  if activation is not None:
    assert callable(activation)
    output = activation(output)
  return output
Example #17
  def loop_function(prev, encoder_inputs):
    if output_projection is not None:
      prev = nn_ops.xw_plus_b(
          prev, output_projection[0], output_projection[1])
    # print("encoder_inputs", encoder_inputs)
    # print("math_ops.argmax(prev, 1)", math_ops.argmax(prev, 1))
    ind = math_ops.argmax(prev, 1)
    prev_symbol = []
    r = array_ops.transpose(encoder_inputs)
    for i in xrange(batch):
      ine = math_ops.to_int32(ind[i])
      prev_symbol.append(r[i, ine])
 
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
      emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
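
Unlike the earlier loop functions, this one uses the argmax as a position into the (transposed) encoder inputs rather than directly as a vocabulary id, i.e. a pointer-style lookup. A NumPy sketch with hypothetical shapes:

import numpy as np

batch, src_len, vocab = 2, 4, 10
rng = np.random.default_rng(0)

encoder_inputs = rng.integers(0, vocab, size=(src_len, batch))  # token ids, time-major
ind = rng.integers(0, src_len, size=batch)                      # argmax over source positions

r = encoder_inputs.T                                            # [batch, src_len]
prev_symbol = r[np.arange(batch), ind]                          # one source token per example
print(prev_symbol)   # vocabulary ids whose embeddings feed the next step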
Example #18
    def loop_function(prev, _):
        logit, attention_distribution = prev
        vocab_size = array_ops.shape(logit)[1]
        if output_projection is not None:
            prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                    output_projection[1])

        prev_symbol = extract_copy_augmented_argmax(logit,
                                                    attention_distribution[0])
        prev_symbol_dereferenced = dereference_copy_pointers(
            prev_symbol, encoder_inputs, vocab_size)

        # Note that gradients will not propagate through the second parameter of
        # embedding_lookup.
        emb_prev = embedding_ops.embedding_lookup(embedding,
                                                  prev_symbol_dereferenced)
        if not update_embedding:
            emb_prev = array_ops.stop_gradient(emb_prev)
        return emb_prev
Example #19
 def loop_function_with_sample(prev, _):
   if output_projection is not None:
     prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
   if is_sampling:
     prev_symbol_sample = tf.squeeze(tf.multinomial(prev*opt.L,1))  #B 1   multinomial(log odds)
     prev_symbol_sample = array_ops.stop_gradient(prev_symbol_sample) # important
     emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol_sample)
   else:
     if is_softargmax:
       prev_symbol_one_hot = tf.nn.log_softmax(prev*opt.L)  #B V
       emb_prev = tf.matmul( tf.exp(prev_symbol_one_hot), embedding) # solve : Requires start <= limit when delta > 0
     else:
       prev_symbol = math_ops.argmax(prev, 1)
       # Note that gradients will not propagate through the second parameter of
       # embedding_lookup.
       emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
   emb_prev = tf.concat([emb_prev,h], 1) if is_fed_h else emb_prev
   if not update_embedding: #just update projection?
     emb_prev = array_ops.stop_gradient(emb_prev)
   return (emb_prev, prev_symbol_sample) if is_sampling else emb_prev
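
The is_softargmax branch replaces the hard embedding lookup with a differentiable mixture: softmax(prev * opt.L) multiplied into the embedding matrix. A NumPy sketch of that soft lookup, with an assumed sharpness factor L:

import numpy as np

vocab, embed, L = 5, 3, 10.0
rng = np.random.default_rng(0)

prev = rng.normal(size=(2, vocab))          # decoder logits, one row per example
embedding = rng.normal(size=(vocab, embed))

scaled = prev * L
scaled = scaled - scaled.max(axis=1, keepdims=True)
weights = np.exp(scaled) / np.exp(scaled).sum(axis=1, keepdims=True)  # sharp softmax
emb_prev = weights @ embedding              # soft lookup instead of embedding[argmax]
print(emb_prev.shape)                       # (2, 3)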
Example #20
    def loop_function(prev, prev_probs, beam_size, _):
        if output_projection is not None:
            prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                    output_projection[1])

        prev = math_ops.log(nn_ops.softmax(prev))
        prev = nn_ops.bias_add(array_ops.transpose(prev),
                               prev_probs)  # num_symbols*BEAM_SIZE
        prev = array_ops.transpose(prev)
        prev = array_ops.expand_dims(array_ops.reshape(prev, [-1]),
                                     0)  # 1*(BEAM_SIZE*num_symbols)
        probs, prev_symbolb = nn_ops.top_k(prev, beam_size)
        probs = array_ops.squeeze(probs, [0])  # BEAM_SIZE,
        prev_symbolb = array_ops.squeeze(prev_symbolb, [0])  # BEAM_SIZE,
        index = prev_symbolb // num_symbols
        prev_symbol = prev_symbolb % num_symbols

        # Note that gradients will not propagate through the second parameter of embedding_lookup.
        emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
        if not update_embedding:
            emb_prev = array_ops.stop_gradient(emb_prev)
        return emb_prev, probs, index, prev_symbol
Example #21
def lstm_decoder(H, y, opt, prefix='', feed_previous=False, is_reuse=None):
    #y  len* batch * [0,V]   H batch * h

    #y = [tf.squeeze(y[:,i]) for i in xrange(y.get_shape()[1])]
    y = tf.unstack(y, axis=1)
    H0 = tf.squeeze(H)
    H1 = (H0, tf.zeros_like(H0))  # initialize H and C

    with tf.variable_scope(prefix + 'lstm_decoder', reuse=True):
        cell = tf.contrib.rnn.LSTMCell(opt.n_hid)
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse):
        weightInit = tf.random_uniform_initializer(-0.001, 0.001)
        W = tf.get_variable('W', [opt.n_hid, opt.n_words],
                            initializer=weightInit)
        b = tf.get_variable('b', [opt.n_words],
                            initializer=tf.random_uniform_initializer(
                                -0.001, 0.001))
        out_proj = (W, b) if feed_previous else None
        outputs, _ = embedding_rnn_decoder(decoder_inputs=y,
                                           initial_state=H1,
                                           cell=cell,
                                           feed_previous=feed_previous,
                                           output_projection=out_proj,
                                           num_symbols=opt.n_words,
                                           embedding_size=opt.embed_size)

    logits = [nn_ops.xw_plus_b(out, W, b) for out in outputs]
    syn_sents = [math_ops.argmax(l, 1) for l in logits]
    syn_sents = tf.stack(syn_sents, 1)

    #outputs, _ = embedding_rnn_decoder(decoder_inputs = y, initial_state = H, cell = tf.contrib.rnn.BasicLSTMCell, num_symbols = opt.n_words, embedding_size = opt.embed_size, scope = prefix + 'lstm_decoder')

    # outputs : batch * len

    loss = sequence_loss(
        logits[:-1], y[1:],
        [tf.cast(tf.ones_like(yy), tf.float32) for yy in y[1:]])

    return loss, syn_sents, logits
Example #22
 def simple_loop_function(prev, _):
     '''Function that takes last output, and applies output projection to it'''
     if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                 output_projection[1])
     return prev
Example #23
    def __init__(self, feed_future_data, train, num_observation_steps,
                 num_prediction_steps, batch_size, rnn_size, num_layers,
                 learning_rate, learning_rate_decay_factor, input_size,
                 max_gradient_norm):
        # feed_future_data: whether or not to feed the true data into the decoder instead of using a loopback
        #                function. If false, a loopback function is used, feeding the last generated output as the next
        #                decoder input.
        # train: train the model (or test)

        self.max_gradient_norm = max_gradient_norm
        self.rnn_size = rnn_size
        self.num_layers = num_layers
        dtype = tf.float32

        self.batch_size = batch_size
        self.input_size = input_size
        self.observation_steps = num_observation_steps
        self.prediction_steps = num_prediction_steps
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        if feed_future_data and not train:
            print "Warning, feeding the model future sequence data (feed_forward) is not recommended when the model is not training."

        # The output of the multiRNN is the size of rnn_size, and it needs to match the input size, or loopback makes
        #  no sense. Here a single layer without activation function is used, but it can be any number of
        #  non RNN layers / functions
        w = tf.get_variable("proj_w", [self.rnn_size, self.input_size])
        b = tf.get_variable("proj_b", [self.input_size])
        output_projection = (w, b)

        # define layers here
        # input, linear RNN RNN linear etc

        # Default should be True, but TF 0.9 was throwing a warning, implying it was false
        single_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size,
                                                   state_is_tuple=True)
        cell = single_cell
        if self.num_layers > 1:
            # state_is_tuple defaults to False in TF0.9, and thus produces a warning....
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * self.num_layers,
                                               state_is_tuple=True)

        def simple_loop_function(prev, _):
            '''Function that takes last output, and applies output projection to it'''
            if output_projection is not None:
                prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                        output_projection[1])
            return prev

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, feed_forward):
            if not feed_forward:  #feed last output as next input
                loopback_function = simple_loop_function
            else:
                loopback_function = None  #feed correct input
            return basic_rnn_seq2seq_with_loop_function(
                encoder_inputs,
                decoder_inputs,
                cell,
                loop_function=loopback_function,
                dtype=dtype)

        # Feeds for inputs.
        self.observation_inputs = []
        self.future_inputs = []
        self.target_weights = []
        self.target_inputs = []
        for i in xrange(
                self.observation_steps):  # Last bucket is the biggest one.
            self.observation_inputs.append(
                tf.placeholder(tf.float32,
                               shape=[batch_size, self.input_size],
                               name="encoder{0}".format(i)))
        for i in xrange(self.prediction_steps + 1):
            self.future_inputs.append(
                tf.placeholder(tf.float32,
                               shape=[batch_size, self.input_size],
                               name="decoder{0}".format(i)))
        for i in xrange(self.prediction_steps):
            self.target_weights.append(
                tf.placeholder(dtype,
                               shape=[batch_size],
                               name="weight{0}".format(i)))

        # Because the predictions are the future sequence inputs shifted by one and do not contain the GO symbol, some
        # array manipulation must occur

        #Pass observations directly to RNN encoder, no shifting neccessary
        self.encoder_inputs = self.observation_inputs
        targets = [
            self.future_inputs[i + 1]  #Skip first symbol (GO)
            for i in xrange(len(self.future_inputs) - 1)
        ]
        #remove last decoder input, but it is kept as the last target output
        self.decoder_inputs = [
            self.future_inputs[i] for i in xrange(len(self.future_inputs) - 1)
        ]

        if train:  #Training
            self.outputs, self.internal_states = seq2seq_f(
                self.encoder_inputs, self.decoder_inputs, feed_future_data)
        else:  #Testing
            self.outputs, self.internal_states = seq2seq_f(
                self.encoder_inputs, self.decoder_inputs, feed_future_data)

        # self.outputs is a list of len(decoder_steps+1) containing [size batch x rnn_size]
        # The output projection below reduces this to:
        #                 a list of len(decoder_steps+1) containing [size batch x input_size]
        if output_projection is not None:
            self.outputs = [
                nn_ops.xw_plus_b(output, output_projection[0],
                                 output_projection[1])
                for output in self.outputs
            ]

        def rmse(x, y):
            return tf.sqrt(tf.reduce_mean(tf.square(tf.sub(y, x))))

        # TODO There are several types of cost functions to compare tracks. Implement many
        # Mainly, average MSE over the whole track, or just at a horizon time (t+10 or something)
        # There's this corner alg that Social LSTM refernces, but I haven't looked into it.

        self.losses = tf.nn.seq2seq.sequence_loss(
            self.outputs,
            targets,
            self.target_weights,
            softmax_loss_function=lambda x, y: rmse(x, y))

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if train:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)

            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())

        tf.scalar_summary('Loss', self.losses)
Example #24
def beam_attention_decoder(decoder_inputs,
                           initial_state,
                           attention_states,
                           cell,
                           embedding,
                           output_size=None,
                           num_heads=1,
                           loop_function=None,
                           dtype=None,
                           scope=None,
                           initial_state_attention=False,
                           output_projection=None,
                           beam_size=10):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder",
                                       dtype=dtype) as scope:
        dtype = scope.dtype
        # batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % a,
                                            [attention_vec_size]))

        state = []
        # Tile the encoder's final hidden state beam_size times, because the decoder's
        # batch_size is beam_size. initial_state is a list with one element per RNN layer;
        # each element is an LSTMStateTuple holding the c and h states, so we tile c and h
        # separately and then recombine them into an LSTMStateTuple.
        for layers in initial_state:
            c = [layers.c] * beam_size
            h = [layers.h] * beam_size
            c = tf.concat(c, 0)
            h = tf.concat(h, 0)
            state.append(rnn_cell_impl.LSTMStateTuple(c, h))
        state = tuple(state)

        # state_size = int(initial_state.get_shape().with_rank(2)[1])
        # states = []
        # for kk in range(beam_size):
        #     states.append(initial_state)
        # state = tf.concat(states, 0)
        # state = initial_state

        def attention(query):
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = Linear(query, attention_vec_size, True)(query)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        # The attention vectors also have to be beam_size-sized tensors.
        batch_attn_size = array_ops.stack([beam_size, attn_size])
        attns = [
            array_ops.zeros(batch_attn_size, dtype=dtype)
            for _ in xrange(num_heads)
        ]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        log_beam_probs, beam_path, beam_symbols = [], [], []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if i == 0:
                # At i == 0 the input is a tensor with batch_size = beam_size whose
                # elements all hold the same value, the <GO> symbol.
                inp = tf.nn.embedding_lookup(
                    embedding, tf.constant(1,
                                           dtype=tf.int32,
                                           shape=[beam_size]))

            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            inputs = [inp] + attns
            x = Linear(inputs, input_size, True)(inputs)

            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                inputs = [cell_output] + attns
                output = Linear(inputs, output_size, True)(inputs)
            if loop_function is not None:
                prev = output
            outputs.append(
                tf.argmax(nn_ops.xw_plus_b(output, output_projection[0],
                                           output_projection[1]),
                          axis=1))

    return outputs, state, tf.reshape(tf.concat(beam_path, 0),
                                      [-1, beam_size]), tf.reshape(
                                          tf.concat(beam_symbols, 0),
                                          [-1, beam_size])
Example #25
def embedding_attention_s2s(encoder_inputs,
                            decoder_inputs,
                            cell,
                            num_encoder_symbols,
                            num_decoder_symbols,
                            size,
                            output_projection=None,
                            feed_previous=False,
                            use_lstm=False,
                            local_p=False,
                            dtype=tf.float32):
    """
    Embedding model with the Manning et al. global attention mechanism.
    """
    reuse_f = lambda k: True if k > 0 else None
    S = len(encoder_inputs)

    with vs.variable_scope('embedding_attention_s2s') as outer_scope:
        with vs.variable_scope('encoder') as enc_scope:
            embedder_en, emb_en_input = _get_embedder(
                'embedding_en', [num_encoder_symbols, size], encoder_inputs)

            encoder_states, prev_state = [], None
            for i, enc_in in enumerate(emb_en_input):
                with vs.variable_scope(enc_scope, reuse=reuse_f(i)):
                    _, state = tf.nn.rnn(cell, [enc_in],
                                         initial_state=prev_state,
                                         dtype=dtype)
                    prev_state = state
                    encoder_states.append(state)

            h_s_bar = tf.transpose(tf.pack(
                [_get_hs(es, use_lstm) for es in encoder_states]),
                                   perm=[1, 0, 2])

        with vs.variable_scope('decoder') as dec_scope:
            embedding_de, emb_de_input = _get_embedder(
                'embedding_de', [num_decoder_symbols, size], decoder_inputs)

            W_a = _get_weights('W_a', [size, size])
            W_c = _get_weights('W_c', [2 * size, size])

            if local_p:
                W_p = _get_weights('W_p', [size, size])
                v_p = _get_weights('v_p', [size, 1])

            state = encoder_states[-1]
            partial_scores = [
                tf.reshape(tf.matmul(_get_hs(es, use_lstm), W_a),
                           [-1, 1, size]) for es in encoder_states
            ]

            output, outputs = None, []

            for i, dec_in in enumerate(emb_de_input):
                with vs.variable_scope(dec_scope, reuse=reuse_f(i)):

                    # Loop function at test time
                    if feed_previous and output is not None:
                        w, b = output_projection
                        prev = nn_ops.xw_plus_b(output, w, b)
                        prev_symbol = math_ops.argmax(prev, 1)
                        dec_in = em_ops.embedding_lookup(
                            embedding_de, prev_symbol)

                    _, state = cell(dec_in, state)
                    h_t = _get_hs(state, use_lstm)
                    batch_h_t = tf.reshape(h_t, [-1, size, 1])

                    if local_p:
                        align = tf.matmul(v_p,
                                          tf.tanh(
                                              tf.matmul(W_p,
                                                        h_t,
                                                        transpose_b=True)),
                                          transpose_a=True)
                        p_t = (S - 1) * tf.sigmoid(align)
                        scale = tf.exp(-4.5 * tf.square((p_t - tf.reshape(
                            tf.cast(tf.range(S), dtype), [-1, 1])) / S))
                        #scale = tf.concat(0, [tf.exp(-4.5 * tf.square((p_t - s)/S)) for s in xrange(S)])

                    scores = tf.nn.softmax(tf.pack([
                        tf.reshape(tf.batch_matmul(ps, batch_h_t), [-1])
                        for ps in partial_scores
                    ]),
                                           dim=0)

                    if local_p:
                        scores *= scale
                        scores /= tf.reduce_sum(scores, 0)

                    scores = tf.reshape(scores, [-1, 1, S])

                    c_t = tf.reshape(tf.batch_matmul(scores, h_s_bar),
                                     [-1, size])
                    h_bar_t = tf.concat(1, [c_t, _get_hs(state, use_lstm)])
                    output = tf.tanh(tf.matmul(h_bar_t, W_c))
                    outputs.append(output)

        return outputs, state
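
Example #25 is the Luong-style "general" global attention: score(h_s, h_t) = h_s^T W_a h_t, a softmax over source positions, a context vector, and tanh(W_c [c_t; h_t]). A compact NumPy sketch of one decode step:

import numpy as np

S, batch, size = 4, 2, 6                  # source length, batch, hidden size
rng = np.random.default_rng(0)

h_s = rng.normal(size=(S, batch, size))   # encoder hidden states (h_s_bar, time-major)
h_t = rng.normal(size=(batch, size))      # current decoder state
W_a = rng.normal(size=(size, size))
W_c = rng.normal(size=(2 * size, size))

scores = np.einsum('sbh,hk,bk->sb', h_s, W_a, h_t)    # general score for every source step
e = np.exp(scores - scores.max(axis=0, keepdims=True))
align = e / e.sum(axis=0, keepdims=True)              # softmax over source positions
c_t = np.einsum('sb,sbh->bh', align, h_s)             # context vector
output = np.tanh(np.concatenate([c_t, h_t], axis=1) @ W_c)  # attentional hidden state
print(output.shape)  # (2, 6)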
Example #26
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False, output_projection=None, beam_size=10):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    print("Initial_state")

    state = initial_state
    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          # for c in range(ct):
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = tf.stack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    if initial_state_attention:
       attns = []
       attns.append(attention(initial_state))
       tmp = tf.reshape(tf.concat(axis=0, values=attns), [-1, attn_size])
       attns = []
       attns.append(tmp)

    log_beam_probs, beam_path, beam_symbols = [],[],[]
    for i, inp in enumerate(decoder_inputs):

      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          if prev is not None:
            inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)

      input_size = inp.get_shape().with_rank(2)[1]
      x = linear([inp] + attns, input_size, True)
      cell_output, state = cell(x, state)

      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
          attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), axis=1))

  return (outputs, state,
          tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(axis=0, values=beam_symbols), [-1, beam_size]))
Example #27
 def loop_function(prev, _):
     if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                 output_projection[1])
     prev_symbol = math_ops.argmax(prev, 1)
     return _embed_decoder(prev_symbol, embedding, attention_states)
Example #28
def lstm_decoder_embedding(H,
                           y,
                           W_emb,
                           opt,
                           prefix='',
                           add_go=False,
                           feed_previous=False,
                           is_reuse=None,
                           is_fed_h=True,
                           is_sampling=False,
                           is_softargmax=False,
                           beam_width=None):
    biasInit = bias_init

    if add_go:
        y = tf.concat([tf.ones([opt.batch_size, 1], dtype=tf.int32), y], 1)

    y = tf.unstack(y, axis=1)  # 1, . , .

    if hasattr(opt, 'global_feature') and opt.global_feature:
        H = layers.fully_connected(H,
                                   num_outputs=opt.n_hid,
                                   biases_initializer=biasInit,
                                   activation_fn=None,
                                   scope=prefix + 'lstm_decoder',
                                   reuse=is_reuse)
    H0 = tf.squeeze(H)
    H1 = (H0, tf.zeros_like(H0))  # tf.zeros_like(H0) # initialize C and H#

    y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h   \
               else [tf.nn.embedding_lookup(W_emb, features) for features in y]
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=True):
        cell = tf.contrib.rnn.LSTMCell(opt.n_hid)
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse):
        weightInit = weight_init
        W = tf.get_variable('W', [opt.n_hid, opt.embed_size],
                            initializer=weightInit)
        b = tf.get_variable('b', [opt.n_words], initializer=bias_init)
        W_new = tf.matmul(W, W_emb, transpose_b=True)  # h* V

        out_proj = (W_new, b) if feed_previous else None
        decoder_res = rnn_decoder_custom_embedding(emb_inp=y_input,
                                                   initial_state=H1,
                                                   cell=cell,
                                                   embedding=W_emb,
                                                   opt=opt,
                                                   feed_previous=feed_previous,
                                                   output_projection=out_proj,
                                                   num_symbols=opt.n_words,
                                                   is_fed_h=is_fed_h,
                                                   is_softargmax=is_softargmax,
                                                   is_sampling=is_sampling)
        outputs = decoder_res[0]

    logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs
              ]  # hidden units to prob logits: out B*h  W: h*E  Wemb V*E
    if is_sampling:
        syn_sents = decoder_res[2]
        loss = sequence_loss(
            logits[:-1], syn_sents,
            [tf.cast(tf.ones_like(yy), tf.float32) for yy in syn_sents])
        syn_sents = tf.stack(syn_sents, 1)
    else:
        syn_sents = [math_ops.argmax(l, 1) for l in logits[:-1]]
        syn_sents = tf.stack(syn_sents, 1)
        ones = tf.ones([opt.batch_size], dtype=tf.float32)
        mask = [ones, ones] + [
            tf.cast(tf.not_equal(yy, dp.PAD_ID), tf.float32) for yy in y[1:-2]
        ]
        loss_all = sequence_loss_by_example(logits[:-1], y[1:], mask)
        loss = tf.reduce_mean(loss_all)

    return loss, syn_sents, logits, loss_all
Example #29
 def pad_output_function(output):
     return nn_ops.xw_plus_b(output,
                             pad_output_projection[0],
                             pad_output_projection[1],
                             name="pad_output_projection")
Example #30
 def MDN_output_function(output):
     return nn_ops.xw_plus_b(output,
                             MDN_output_projection[0],
                             MDN_output_projection[1],
                             name="MDN_output_projection")
Example #31
def gru_decoder_embedding(H,
                          y,
                          W_emb,
                          opt,
                          prefix='',
                          add_go=False,
                          feed_previous=False,
                          is_reuse=None,
                          is_fed_h=True,
                          is_sampling=False,
                          is_softargmax=False,
                          beam_width=None,
                          res=None):
    #y  len* batch * [0,V]   H batch * h
    biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
    #y = [tf.squeeze(y[:,i]) for i in xrange(y.get_shape()[1])]
    if add_go:
        y = tf.concat([tf.ones([opt.batch_size, 1], dtype=tf.int32), y], 1)

    y = tf.unstack(y, axis=1)  # 1, . , .
    # make the size of hidden unit to be n_hid
    if not opt.additive_noise_lambda:
        H = layers.fully_connected(H,
                                   num_outputs=opt.n_hid,
                                   biases_initializer=biasInit,
                                   activation_fn=None,
                                   scope=prefix + 'gru_decoder',
                                   reuse=is_reuse)
    H0 = tf.squeeze(H)
    # H1 = (H0, tf.zeros_like(H0))  # initialize H and C #
    H1 = H0

    y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h   \
               else [tf.nn.embedding_lookup(W_emb, features) for features in y]
    with tf.variable_scope(prefix + 'gru_decoder', reuse=True):
        cell = tf.contrib.rnn.GRUCell(opt.n_hid)
        # cell = tf.contrib.rnn.GRUCell(opt.maxlen)
    with tf.variable_scope(prefix + 'gru_decoder', reuse=is_reuse):
        weightInit = tf.random_uniform_initializer(-0.001, 0.001)
        W = tf.get_variable('W', [opt.n_hid, opt.embed_size],
                            initializer=weightInit)
        b = tf.get_variable('b', [opt.vocab_size],
                            initializer=tf.random_uniform_initializer(
                                -0.001, 0.001))
        W_new = tf.matmul(W, W_emb, transpose_b=True)  # h* V

        out_proj = (W_new, b) if feed_previous else None
        decoder_res = rnn_decoder_custom_embedding_gru(
            emb_inp=y_input,
            initial_state=H1,
            cell=cell,
            embedding=W_emb,
            opt=opt,
            feed_previous=feed_previous,
            output_projection=out_proj,
            num_symbols=opt.vocab_size,
            is_fed_h=is_fed_h,
            is_softargmax=is_softargmax,
            is_sampling=is_sampling)
        outputs = decoder_res[0]

        if beam_width:
            #cell = rnn_cell.LSTMCell(cell_depth)
            #batch_size_tensor = constant_op.constant(opt.batch_size)
            initial_state = cell.zero_state(
                opt.batch_size * beam_width, tf.float32
            )  #beam_search_decoder.tile_batch(H0, multiplier=beam_width)
            output_layer = layers_core.Dense(opt.vocab_size,
                                             use_bias=True,
                                             kernel_initializer=W_new,
                                             bias_initializer=b,
                                             activation=None)
            bsd = beam_search_decoder.BeamSearchDecoder(
                cell=cell,
                embedding=W_emb,
                start_tokens=array_ops.fill([opt.batch_size],
                                            dp.GO_ID),  # go is 1
                end_token=dp.EOS_ID,
                initial_state=initial_state,
                beam_width=beam_width,
                output_layer=output_layer,
                length_penalty_weight=0.0)
            #pdb.set_trace()
            final_outputs, final_state, final_sequence_lengths = (
                decoder.dynamic_decode(bsd,
                                       output_time_major=False,
                                       maximum_iterations=opt.maxlen))
            beam_search_decoder_output = final_outputs.beam_search_decoder_output
            #print beam_search_decoder_output.get_shape()

    logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs
              ]  # hidden units to prob logits: out B*h  W: h*E  Wemb V*E
    if is_sampling:
        syn_sents = decoder_res[2]
        loss = sequence_loss(
            logits[:-1], syn_sents,
            [tf.cast(tf.ones_like(yy), tf.float32) for yy in syn_sents])
        #loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in syn_sents])
        #loss = sequence_loss(logits[:-1], syn_sents, [tf.concat([tf.ones([1]), tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32)],0) for yy in syn_sents[:-1]]) # use one more pad after EOS
        syn_sents = tf.stack(syn_sents, 1)
    else:
        syn_sents = [math_ops.argmax(l, 1) for l in logits]
        syn_sents = tf.stack(syn_sents, 1)
        loss = sequence_loss(
            logits[:-1], y[1:],
            [tf.cast(tf.ones_like(yy), tf.float32) for yy in y[1:]])
        #loss = sequence_loss(logits[:-1], y[1:], [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in y[:-1]]) # use one more pad after EOS

    #outputs, _ = embedding_rnn_decoder(decoder_inputs = y, initial_state = H, cell = tf.contrib.rnn.BasicLSTMCell, num_symbols = opt.vocab_size, embedding_size = opt.embed_size, scope = prefix + 'lstm_decoder')

    # outputs : batch * len

    # save the decoder outputs, projected into embedding space
    if res is not None:
        res['outputs'] = [tf.matmul(out, W) for out in outputs]

    return loss, syn_sents, logits
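# A minimal NumPy sketch (illustration only, not from the original example) of the
# weight-tied output projection used above: W_new = W @ W_emb^T, so the
# hidden-to-logits map reuses the word embedding matrix. All sizes below are made
# up for the demo.
import numpy as np

n_hid, embed_size, vocab_size, batch = 4, 3, 6, 2
W = np.random.randn(n_hid, embed_size)           # hidden -> embedding space
W_emb = np.random.randn(vocab_size, embed_size)  # one embedding row per word
b = np.zeros(vocab_size)

W_new = W @ W_emb.T                              # [n_hid, vocab_size], tied to W_emb
out = np.random.randn(batch, n_hid)              # one decoder hidden state
logits = out @ W_new + b                         # same role as nn_ops.xw_plus_b(out, W_new, b)
assert logits.shape == (batch, vocab_size)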
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False, output_projection=None, beam_size=10):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    print("Initial_state")

    state_size = int(initial_state.get_shape().with_rank(2)[1])
    states = [initial_state]
    state = tf.reshape(tf.concat(states, 0), [-1, state_size])
    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          # for c in range(ct):
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.stack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    if initial_state_attention:
        attns = attention(initial_state)
        attns = [tf.reshape(tf.concat(attns, 0), [-1, attn_size])]

    log_beam_probs, beam_path, beam_symbols = [],[],[]
    for i, inp in enumerate(decoder_inputs):

      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
            if prev is not None:
                inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols)

      input_size = inp.get_shape().with_rank(2)[1]
      x = linear([inp] + attns, input_size, True)
      cell_output, state = cell(x, state)

      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
          attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
          states = [state for _ in range(beam_size)]
          state = tf.reshape(tf.concat(states, 0), [-1, state_size])
          with variable_scope.variable_scope(
                  variable_scope.get_variable_scope(), reuse=True):
              attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), dimension=1))

  return (outputs, state,
          tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]),
          tf.reshape(tf.concat(beam_symbols, 0), [-1, beam_size]))
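# A minimal NumPy sketch (illustration only, not from the original code) of the
# additive attention computed by attention() above:
#   score = v^T * tanh(W * attention_states + U * query),  alpha = softmax(score),
#   read  = sum_t alpha_t * attention_states_t.
# Here W stands in for the 1x1 convolution kernel AttnW, U for linear(query, ...),
# and all sizes are made up for the demo.
import numpy as np

def _softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

batch, attn_length, attn_size = 2, 5, 4
attention_states = np.random.randn(batch, attn_length, attn_size)
W = np.random.randn(attn_size, attn_size)
U = np.random.randn(attn_size, attn_size)
v = np.random.randn(attn_size)
query = np.random.randn(batch, attn_size)                  # decoder state

scores = np.einsum('k,blk->bl',
                   v, np.tanh(attention_states @ W + (query @ U)[:, None, :]))
alpha = _softmax(scores)                                   # [batch, attn_length]
d = (alpha[:, :, None] * attention_states).sum(axis=1)     # attention read, [batch, attn_size]
assert d.shape == (batch, attn_size)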
Example #33
def argmax_loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                output_projection[1])
    prev_symbol = math_ops.argmax(prev, 1)
    return prev_symbol
Example #34
    def __call__(self, inputs, state, scope=None):
        """Run the input projection and then the cell."""
        dtype = inputs.dtype
        memory = array_ops.identity(self.memory)

        # array_ops.ref_identity()
        # deep_copy(self.memory)
        with vs.variable_scope("memory_projection"):
            c_t, h_t = state

            v = math_ops.tanh(nn_ops.xw_plus_b(h_t, self.w, self.b))
            if v.get_shape()[0] != self.batch_size:
                raise Exception("Beam Search Not supported now!")
            else:
                similarity = math_ops.matmul(
                    array_ops.expand_dims(v,
                                          1),  # batch_size, 1 , embedding_size
                    array_ops.transpose(memory, [0, 2, 1]))

                weight = nn_ops.softmax(
                    array_ops.squeeze(similarity)  # batch_size, topic_num
                )
                weight_tile = gen_array_ops.tile(array_ops.expand_dims(
                    weight, -1), [1, 1, self.embedding_size],
                                                 name="weight")
                mt = math_ops.reduce_sum(memory * weight_tile, axis=1)

            # update memory
            if self.update_mem:
                gate = math_ops.matmul(memory,
                                       array_ops.expand_dims(
                                           inputs,
                                           axis=2))  # [batch_size, num, 1]
                gate = math_ops.sigmoid(
                    gen_array_ops.squeeze(gate))  # batch_size x num

                inputs_expand = gen_array_ops.tile(
                    array_ops.expand_dims(inputs, axis=1),
                    [1, self.mem_num, 1])  # batch_size x num x embedding

                uu_tile = gen_array_ops.tile(
                    array_ops.expand_dims(self.uu, axis=0),
                    [self.batch_size, 1, 1
                     ])  # batch_size x embedding x embedding

                vv_tile = gen_array_ops.tile(
                    array_ops.expand_dims(self.uv, axis=0),
                    [self.batch_size, 1, 1
                     ])  # batch_size x embedding x embedding

                candidate = math_ops.add(
                    math_ops.matmul(inputs_expand, uu_tile),
                    math_ops.matmul(memory,
                                    vv_tile))  # batch_size x num x embedding
                # print(gate)
                gate_tile = gen_array_ops.tile(array_ops.expand_dims(gate, 2),
                                               [1, 1, self.embedding_size])
                updated_mem = (1 - gate_tile) * memory + gate_tile * candidate
                self.memory = updated_mem

        with vs.variable_scope("attention_mechanism"):

            encoder_processed = self.memory_layer(
                self.encoder_outputs)  # map to attention size
            # [batch_size,  hidden_size] -> [batch_size, 1, attention_size]
            query_processed = array_ops.expand_dims(self.query_layer(c_t), 1)

            scores = math_ops.reduce_sum(
                self.attention_v *
                math_ops.tanh(encoder_processed + query_processed), [2])
            alpha = nn_ops.softmax(scores, axis=1)
            output_hidden_size = self.encoder_outputs.shape[2].value
            alpha_tile = gen_array_ops.tile(array_ops.expand_dims(alpha, -1),
                                            [1, 1, output_hidden_size],
                                            name="weight")
            # print(weight_tile) # batch_size x num x embedding_size
            weighted_sum = math_ops.reduce_sum(self.encoder_outputs *
                                               alpha_tile,
                                               axis=1)
        return self._cell(tf.concat([inputs, weighted_sum, mt], axis=1), state)
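# A minimal NumPy sketch (illustration only, not from the original class) of the two
# memory steps in __call__ above: (a) a content-based read of the topic memory via a
# softmax over dot-product similarity, and (b) the gated update built from self.uu and
# self.uv. All sizes are made up for the demo; uu and vv stand in for self.uu / self.uv.
import numpy as np

def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch_size, mem_num, embedding_size = 2, 3, 4
memory = np.random.randn(batch_size, mem_num, embedding_size)
v = np.tanh(np.random.randn(batch_size, embedding_size))    # stands in for tanh(h_t W + b)
inputs = np.random.randn(batch_size, embedding_size)
uu = np.random.randn(embedding_size, embedding_size)
vv = np.random.randn(embedding_size, embedding_size)

# (a) content-based read: mt = sum_i softmax(v . m_i) * m_i
weight = _softmax(np.einsum('be,bne->bn', v, memory))        # [batch_size, mem_num]
mt = (weight[:, :, None] * memory).sum(axis=1)               # [batch_size, embedding_size]

# (b) gated update: m_i <- (1 - g_i) * m_i + g_i * (x uu + m_i vv),  g_i = sigmoid(m_i . x)
gate = 1.0 / (1.0 + np.exp(-np.einsum('bne,be->bn', memory, inputs)))
candidate = inputs[:, None, :] @ uu + memory @ vv            # [batch_size, mem_num, embedding_size]
updated_mem = (1 - gate[:, :, None]) * memory + gate[:, :, None] * candidate
assert updated_mem.shape == memory.shape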
Example #35
def beam_rnn_decoder(decoder_inputs,
                     initial_state,
                     cell,
                     loop_function=None,
                     scope=None,
                     output_projection=None,
                     beam_size=1):
    """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing generated outputs.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
        (Note that in some cases, like basic RNN cell or GRU cell, outputs and
         states can be the same. They are different for LSTM cells though.)
  """
    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        outputs = []
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        state_size = int(initial_state.get_shape().with_rank(2)[1])

        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            input_size = inp.get_shape().with_rank(2)[1]
            x = inp
            output, state = cell(x, state)

            if loop_function is not None:
                prev = output
            if i == 0:
                states = [state for _ in range(beam_size)]
                state = tf.reshape(tf.concat(states, 0), [-1, state_size])

            outputs.append(
                tf.argmax(nn_ops.xw_plus_b(output, output_projection[0],
                                           output_projection[1]),
                          dimension=1))
    return (outputs, state,
            tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]),
            tf.reshape(tf.concat(beam_symbols, 0), [-1, beam_size]))
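# A minimal NumPy sketch (illustration only, not from the original code) of how the
# beam_path / beam_symbols tensors returned above can be turned into token sequences:
# at each step beam_path holds the parent hypothesis index and beam_symbols the chosen
# word id, so a hypothesis is recovered by walking the parents backwards. The toy
# arrays below are invented for the demo.
import numpy as np

beam_size = 3
beam_symbols = np.array([[5, 7, 2],    # one row per time step, one column per beam
                         [1, 5, 9],
                         [4, 4, 0]])
beam_path = np.array([[0, 0, 0],       # step 0: every beam extends hypothesis 0
                      [1, 0, 2],       # step 1: parent beam of each entry
                      [0, 2, 1]])      # step 2

def backtrack(beam_symbols, beam_path, k):
    """Recover the token sequence that ends in beam k at the last step."""
    tokens = []
    for t in range(beam_symbols.shape[0] - 1, -1, -1):
        tokens.append(int(beam_symbols[t, k]))
        k = int(beam_path[t, k])
    return tokens[::-1]

print(backtrack(beam_symbols, beam_path, k=0))   # [7, 1, 4] for the toy arrays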
Example #36
def loop_function(prev, _):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                output_projection[1])
    return prev
Example #37
def loop_function(prev, out_proj, embedding):
    prev = nn_ops.xw_plus_b(prev, out_proj[0], out_proj[1])
    prev_symbol = math_ops.argmax(prev, axis=1)
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    return [emb_prev, prev_symbol]
Example #38
    def _BuildAndTestMiniMNIST(self, param_index, tag):
        # Fix seed to avoid occasional flakiness
        np.random.seed(6)

        # Hyperparameters
        batch = 3
        inputs = 16
        features = 32
        classes = 10

        # Define the parameters
        inp_data = np.random.random_sample(inputs * batch)
        hidden_weight_data = np.random.randn(
            inputs * features) / np.sqrt(inputs)
        hidden_bias_data = np.random.random_sample(features)
        sm_weight_data = np.random.randn(
            features * classes) / np.sqrt(features)
        sm_bias_data = np.random.random_sample(classes)

        # special care for labels since they need to be normalized per batch
        label_data = np.random.random(batch * classes).reshape(
            (batch, classes))
        s = label_data.sum(axis=1)
        label_data /= s[:, None]

        with self.session(use_gpu=True):
            # We treat the inputs as "parameters" here
            inp = constant_op.constant(inp_data.tolist(),
                                       shape=[batch, inputs],
                                       dtype=dtypes.float64,
                                       name="inp")
            hidden_weight = constant_op.constant(hidden_weight_data.tolist(),
                                                 shape=[inputs, features],
                                                 dtype=dtypes.float64,
                                                 name="hidden_weight")
            hidden_bias = constant_op.constant(hidden_bias_data.tolist(),
                                               shape=[features],
                                               dtype=dtypes.float64,
                                               name="hidden_bias")
            softmax_weight = constant_op.constant(sm_weight_data.tolist(),
                                                  shape=[features, classes],
                                                  dtype=dtypes.float64,
                                                  name="softmax_weight")
            softmax_bias = constant_op.constant(sm_bias_data.tolist(),
                                                shape=[classes],
                                                dtype=dtypes.float64,
                                                name="softmax_bias")

            # List all the parameters so that we can test them one at a time
            all_params = [
                inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
            ]
            param_sizes = [
                [batch, inputs],  # inp
                [inputs, features],  # hidden_weight,
                [features],  # hidden_bias
                [features, classes],  # softmax_weight,
                [classes]
            ]  # softmax_bias

            # Now, build the mini MNIST network
            features = nn_ops.relu(nn_ops.xw_plus_b(inp, hidden_weight,
                                                    hidden_bias),
                                   name="features")
            logits = nn_ops.xw_plus_b(features,
                                      softmax_weight,
                                      softmax_bias,
                                      name="logits")
            labels = constant_op.constant(label_data.tolist(),
                                          shape=[batch, classes],
                                          dtype=dtypes.float64,
                                          name="labels")
            cost = nn_ops.softmax_cross_entropy_with_logits(labels=labels,
                                                            logits=logits,
                                                            name="cost")

            # Test the gradients.
            err = gradient_checker.compute_gradient_error(
                all_params[param_index],
                param_sizes[param_index],
                cost, [batch],
                delta=1e-5)

        tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
        return err
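# A minimal NumPy sketch (illustration only, not the TensorFlow implementation) of the
# quantity reported by gradient_checker.compute_gradient_error above: roughly, the
# maximum difference between an analytic gradient and a central finite-difference
# estimate. Shown here for the toy function f(x) = sum(x**2).
import numpy as np

def f(x):
    return np.sum(x ** 2)

def analytic_grad(x):
    return 2.0 * x

def numeric_grad(x, delta=1e-5):
    g = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e.flat[i] = delta
        g.flat[i] = (f(x + e) - f(x - e)) / (2.0 * delta)
    return g

x = np.random.random_sample(6)
err = np.max(np.abs(numeric_grad(x) - analytic_grad(x)))
print("toy gradient error = %g" % err)   # should be tiny, around 1e-10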