Example #1
 def rnn_estimator(x, y):
   """RNN estimator with target predictor function on top."""
   x = input_op_fn(x)
   if cell_type == 'rnn':
     cell_fn = nn.rnn_cell.BasicRNNCell
   elif cell_type == 'gru':
     cell_fn = nn.rnn_cell.GRUCell
   elif cell_type == 'lstm':
     cell_fn = nn.rnn_cell.BasicLSTMCell
   else:
     raise ValueError('cell_type {} is not supported.'.format(cell_type))
   if bidirectional:
     # forward direction cell
     rnn_fw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
     # backward direction cell
     rnn_bw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
     # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
     _, encoding = bidirectional_rnn(rnn_fw_cell,
                                     rnn_bw_cell,
                                     x,
                                     dtype=dtypes.float32,
                                     sequence_length=sequence_length,
                                     initial_state_fw=initial_state,
                                     initial_state_bw=initial_state)
   else:
     cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
     _, encoding = nn.rnn(cell,
                          x,
                          dtype=dtypes.float32,
                          sequence_length=sequence_length,
                          initial_state=initial_state)
   return target_predictor_fn(encoding, y)
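The if/elif ladder above is just a string-to-constructor dispatch. Below is a minimal sketch of the same dispatch as a dict lookup, assuming the same TF 0.x-era nn.rnn_cell module the example imports; it is an illustration, not part of the original code.

# Sketch only: dict-based dispatch equivalent to the if/elif chain above.
_CELL_FNS = {
    'rnn': nn.rnn_cell.BasicRNNCell,
    'gru': nn.rnn_cell.GRUCell,
    'lstm': nn.rnn_cell.BasicLSTMCell,
}

def _get_cell_fn(cell_type):
    """Return the cell constructor for cell_type, or raise ValueError."""
    if cell_type not in _CELL_FNS:
        raise ValueError('cell_type {} is not supported.'.format(cell_type))
    return _CELL_FNS[cell_type]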
Example #2
 def rnn_estimator(X, y):
     """RNN estimator with target predictor function on top."""
     X = input_op_fn(X)
     if cell_type == 'rnn':
         cell_fn = nn.rnn_cell.BasicRNNCell
     elif cell_type == 'gru':
         cell_fn = nn.rnn_cell.GRUCell
     elif cell_type == 'lstm':
         cell_fn = nn.rnn_cell.BasicLSTMCell
     else:
         raise ValueError(
             "cell_type {} is not supported.".format(cell_type))
     if bidirectional:
         # forward direction cell
         rnn_fw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] *
                                                num_layers)
         # backward direction cell
         rnn_bw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] *
                                                num_layers)
         # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
         _, encoding = bidirectional_rnn(rnn_fw_cell,
                                         rnn_bw_cell,
                                         X,
                                         dtype=dtypes.float32,
                                         sequence_length=sequence_length,
                                         initial_state_fw=initial_state,
                                         initial_state_bw=initial_state)
     else:
         cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
         _, encoding = nn.rnn(cell,
                              X,
                              dtype=dtypes.float32,
                              sequence_length=sequence_length,
                              initial_state=initial_state)
     return target_predictor_fn(encoding, y)
Example #3
 def rnn_estimator(x, y):
     """RNN estimator with target predictor function on top."""
     x = input_op_fn(x)
     if cell_type == 'rnn':
         cell_fn = nn.rnn_cell.BasicRNNCell
     elif cell_type == 'gru':
         cell_fn = nn.rnn_cell.GRUCell
     elif cell_type == 'lstm':
         cell_fn = nn.rnn_cell.BasicLSTMCell
     else:
         raise ValueError(
             'cell_type {} is not supported.'.format(cell_type))
     # TODO: state_is_tuple=False is deprecated
     if bidirectional:
         # forward direction cell
         fw_cell = cell_fn(rnn_size)
         bw_cell = cell_fn(rnn_size)
         # attach attention cells if specified
         if attn_length is not None:
             fw_cell = contrib_rnn.AttentionCellWrapper(
                 fw_cell,
                 attn_length=attn_length,
                 attn_size=attn_size,
                 attn_vec_size=attn_vec_size,
                 state_is_tuple=False)
             bw_cell = contrib_rnn.AttentionCellWrapper(
                 bw_cell,
                 attn_length=attn_length,
                 attn_size=attn_size,
                 attn_vec_size=attn_vec_size,
                 state_is_tuple=False)
         rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers)
         # backward direction cell
         rnn_bw_cell = nn.rnn_cell.MultiRNNCell([bw_cell] * num_layers)
         # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
         _, encoding = bidirectional_rnn(rnn_fw_cell,
                                         rnn_bw_cell,
                                         x,
                                         dtype=dtypes.float32,
                                         sequence_length=sequence_length,
                                         initial_state_fw=initial_state,
                                         initial_state_bw=initial_state)
     else:
         rnn_cell = cell_fn(rnn_size)
         if attn_length is not None:
             rnn_cell = contrib_rnn.AttentionCellWrapper(
                 rnn_cell,
                 attn_length=attn_length,
                 attn_size=attn_size,
                 attn_vec_size=attn_vec_size,
                 state_is_tuple=False)
         cell = nn.rnn_cell.MultiRNNCell([rnn_cell] * num_layers)
         _, encoding = nn.rnn(cell,
                              x,
                              dtype=dtypes.float32,
                              sequence_length=sequence_length,
                              initial_state=initial_state)
     return target_predictor_fn(encoding, y)
Example #4
 def rnn_estimator(x, y):
   """RNN estimator with target predictor function on top."""
   x = input_op_fn(x)
   if cell_type == 'rnn':
     cell_fn = nn.rnn_cell.BasicRNNCell
   elif cell_type == 'gru':
     cell_fn = nn.rnn_cell.GRUCell
   elif cell_type == 'lstm':
     cell_fn = functools.partial(
         nn.rnn_cell.BasicLSTMCell, state_is_tuple=False)
   else:
     raise ValueError('cell_type {} is not supported.'.format(cell_type))
   # TODO: state_is_tuple=False is deprecated
   if bidirectional:
     # forward direction cell
     fw_cell = cell_fn(rnn_size)
     bw_cell = cell_fn(rnn_size)
     # attach attention cells if specified
     if attn_length is not None:
       fw_cell = contrib_rnn.AttentionCellWrapper(
         fw_cell, attn_length=attn_length, attn_size=attn_size,
         attn_vec_size=attn_vec_size, state_is_tuple=False)
       bw_cell = contrib_rnn.AttentionCellWrapper(
         bw_cell, attn_length=attn_length, attn_size=attn_size,
         attn_vec_size=attn_vec_size, state_is_tuple=False)
     rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers,
                                            state_is_tuple=False)
     # backward direction cell
     rnn_bw_cell = nn.rnn_cell.MultiRNNCell([bw_cell] * num_layers,
                                            state_is_tuple=False)
     # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
     _, encoding = bidirectional_rnn(rnn_fw_cell,
                                     rnn_bw_cell,
                                     x,
                                     dtype=dtypes.float32,
                                     sequence_length=sequence_length,
                                     initial_state_fw=initial_state,
                                     initial_state_bw=initial_state)
   else:
     rnn_cell = cell_fn(rnn_size)
     if attn_length is not None:
       rnn_cell = contrib_rnn.AttentionCellWrapper(
           rnn_cell, attn_length=attn_length, attn_size=attn_size,
           attn_vec_size=attn_vec_size, state_is_tuple=False)
     cell = nn.rnn_cell.MultiRNNCell([rnn_cell] * num_layers,
                                     state_is_tuple=False)
     _, encoding = nn.rnn(cell,
                          x,
                          dtype=dtypes.float32,
                          sequence_length=sequence_length,
                          initial_state=initial_state)
   return target_predictor_fn(encoding, y)
Example #5
def rnn_seq2seq(encoder_inputs, decoder_inputs, encoder_cell, decoder_cell=None,
                dtype=dtypes.float32, scope=None):
    """RNN Sequence to Sequence model.

    Args:
        encoder_inputs: List of tensors, inputs for encoder.
        decoder_inputs: List of tensors, inputs for decoder.
        encoder_cell: RNN cell to use for encoder.
        decoder_cell: RNN cell to use for decoder; if None, encoder_cell is used.
        dtype: Type to initialize encoder state with.
        scope: Scope to use; if None, a new one will be created.

    Returns:
        List of tensors for outputs and states for training and sampling sub-graphs.
    """
    with vs.variable_scope(scope or "rnn_seq2seq"):
        _, last_enc_state = nn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
        return rnn_decoder(decoder_inputs, last_enc_state, decoder_cell or encoder_cell)
Example #6
def rnn_seq2seq(encoder_inputs,
                decoder_inputs,
                encoder_cell,
                decoder_cell=None,
                dtype=dtypes.float32,
                scope=None):
  """RNN Sequence to Sequence model.

  Args:
    encoder_inputs: List of tensors, inputs for encoder.
    decoder_inputs: List of tensors, inputs for decoder.
    encoder_cell: RNN cell to use for encoder.
    decoder_cell: RNN cell to use for decoder; if None, encoder_cell is used.
    dtype: Type to initialize encoder state with.
    scope: Scope to use; if None, a new one will be created.

  Returns:
    List of tensors for outputs and states for training and sampling sub-graphs.
  """
  with vs.variable_scope(scope or "rnn_seq2seq"):
    _, last_enc_state = nn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
    return rnn_decoder(decoder_inputs, last_enc_state, decoder_cell or
                       encoder_cell)
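A hypothetical usage sketch for rnn_seq2seq, assuming the TF 0.x list-of-tensors convention these examples rely on (each input is a length-T list of [batch_size, depth] tensors); the placeholder shapes and unit count are illustrative assumptions, not from the original code.

# Sketch only: wiring rnn_seq2seq with toy placeholder inputs.
batch_size, depth, seq_len = 32, 16, 10
encoder_inputs = [tf.placeholder(tf.float32, [batch_size, depth])
                  for _ in range(seq_len)]
decoder_inputs = [tf.placeholder(tf.float32, [batch_size, depth])
                  for _ in range(seq_len)]
cell = nn.rnn_cell.GRUCell(64)
# decoder_cell defaults to None, so the decoder reuses the encoder cell.
decoding = rnn_seq2seq(encoder_inputs, decoder_inputs, cell)
# decoding holds the decoder outputs and states described in the docstring.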
Example #7
def bidirectional_rnn(cell_fw,
                      cell_bw,
                      inputs,
                      initial_state_fw=None,
                      initial_state_bw=None,
                      dtype=None,
                      sequence_length=None,
                      scope=None):
    """Creates a bidirectional recurrent neural network.
    Similar to the unidirectional case (rnn) but takes input and builds
    independent forward and backward RNNs with the final forward and backward
    outputs depth-concatenated, such that the output will have the format
    [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of
    forward and backward cell must match. The initial state for both directions
    is zero by default (but can be set optionally) and no intermediate states are
    ever returned -- the network is fully unrolled for the given (passed in)
    length(s) of the sequence(s) or completely unrolled if length(s) is not given.
    Args:
        cell_fw: An instance of RNNCell, to be used for forward direction.
        cell_bw: An instance of RNNCell, to be used for backward direction.
        inputs: A length T list of inputs, each a tensor of shape
          [batch_size, cell.input_size].
        initial_state_fw: (optional) An initial state for the forward RNN.
          This must be a tensor of appropriate type and shape
          [batch_size x cell.state_size].
        initial_state_bw: (optional) Same as for initial_state_fw.
        dtype: (optional) The data type for the initial state.  Required if either
          of the initial states are not provided.
        sequence_length: (optional) An int64 vector (tensor) of size [batch_size],
          containing the actual lengths for each of the sequences.
        scope: VariableScope for the created subgraph; defaults to "BiRNN"
    Returns:
        A pair (outputs, state) where:
          outputs is a length T list of outputs (one for each input), which
            are depth-concatenated forward and backward outputs
          state is the concatenated final state of the forward and backward RNN
    Raises:
        TypeError: If "cell_fw" or "cell_bw" is not an instance of RNNCell.
        ValueError: If inputs is None or an empty list.
    """

    if not isinstance(cell_fw, nn.rnn_cell.RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, nn.rnn_cell.RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    name = scope or "BiRNN"
    # Forward direction
    with vs.variable_scope(name + "_FW"):
        output_fw, state_fw = nn.rnn(cell_fw, inputs, initial_state_fw, dtype,
                                     sequence_length)

    # Backward direction
    with vs.variable_scope(name + "_BW"):
        tmp, state_bw = nn.rnn(cell_bw, _reverse_seq(inputs, sequence_length),
                               initial_state_bw, dtype, sequence_length)
    output_bw = _reverse_seq(tmp, sequence_length)
    # Concat each of the forward/backward outputs
    outputs = [
        array_ops_.concat(1, [fw, bw]) for fw, bw in zip(output_fw, output_bw)
    ]

    return outputs, array_ops_.concat(1, [state_fw, state_bw])
Example #8
def bidirectional_rnn(cell_fw,
                      cell_bw,
                      inputs,
                      initial_state_fw=None,
                      initial_state_bw=None,
                      dtype=None,
                      sequence_length=None,
                      scope=None):
  """Creates a bidirectional recurrent neural network.

  Similar to the unidirectional case (rnn) but takes input and builds
  independent forward and backward RNNs with the final forward and backward
  outputs depth-concatenated, such that the output will have the format
  [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of
  forward and backward cell must match. The initial state for both directions
  is zero by default (but can be set optionally) and no intermediate states
  are ever returned -- the network is fully unrolled for the given (passed in)
  length(s) of the sequence(s) or completely unrolled if length(s) is not
  given.

  Args:
    cell_fw: An instance of RNNCell, to be used for forward direction.
    cell_bw: An instance of RNNCell, to be used for backward direction.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, cell.input_size].
    initial_state_fw: (optional) An initial state for the forward RNN.
      This must be a tensor of appropriate type and shape
      [batch_size x cell.state_size].
    initial_state_bw: (optional) Same as for initial_state_fw.
    dtype: (optional) The data type for the initial state.  Required if
      either of the initial states are not provided.
    sequence_length: (optional) An int64 vector (tensor) of size [batch_size],
      containing the actual lengths for each of the sequences.
    scope: VariableScope for the created subgraph; defaults to "BiRNN"

  Returns:
    A pair (outputs, state) where:
      outputs is a length T list of outputs (one for each input), which
      are depth-concatenated forward and backward outputs
      state is the concatenated final state of the forward and backward RNN

  Raises:
    TypeError: If "cell_fw" or "cell_bw" is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list.
  """

  if not isinstance(cell_fw, nn.rnn_cell.RNNCell):
    raise TypeError('cell_fw must be an instance of RNNCell')
  if not isinstance(cell_bw, nn.rnn_cell.RNNCell):
    raise TypeError('cell_bw must be an instance of RNNCell')
  if not isinstance(inputs, list):
    raise TypeError('inputs must be a list')
  if not inputs:
    raise ValueError('inputs must not be empty')

  name = scope or 'BiRNN'
  # Forward direction
  with vs.variable_scope(name + '_FW'):
    output_fw, state_fw = nn.rnn(cell_fw, inputs, initial_state_fw, dtype,
                                 sequence_length)

  # Backward direction
  with vs.variable_scope(name + '_BW'):
    tmp, state_bw = nn.rnn(cell_bw, _reverse_seq(inputs, sequence_length),
                           initial_state_bw, dtype, sequence_length)
  output_bw = _reverse_seq(tmp, sequence_length)
  # Concat each of the forward/backward outputs
  outputs = [array_ops_.concat(1, [fw, bw])
             for fw, bw in zip(output_fw, output_bw)]

  return outputs, array_ops_.concat(1, [state_fw, state_bw])
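A hypothetical usage sketch for bidirectional_rnn under the same list-of-tensors API, assuming the tf, nn, and dtypes imports used by the snippets above; shapes and unit counts are illustrative assumptions.

# Sketch only: each output is the depth-concatenation of forward and backward
# outputs, so it has size cell_fw.output_size + cell_bw.output_size.
T, batch_size, depth, units = 8, 32, 16, 64
inputs = [tf.placeholder(tf.float32, [batch_size, depth]) for _ in range(T)]
cell_fw = nn.rnn_cell.GRUCell(units)
cell_bw = nn.rnn_cell.GRUCell(units)
outputs, state = bidirectional_rnn(cell_fw, cell_bw, inputs, dtype=dtypes.float32)
# outputs: length-T list of [batch_size, 2 * units] tensors;
# state: concatenated final forward and backward states.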
Example #9
    def __init__(self, sequence_length, vocab_size, embedding_size, hidden_size,
                 layer_count=1, **kw):
        assert layer_count >= 1, "An LSTM cannot have fewer than one layer."
        n_classes = kw.get('n_classes', 2)  # >2 not tested.
        self.input_x = tf.placeholder(tf.int32,
                                      [None, sequence_length],
                                      name="input_x")
        self.input_y = tf.placeholder(tf.float32,
                                      [None, n_classes],
                                      name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # Layer 1: Word embeddings
        self.embeddings = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -0.1, 0.1),
            name="embeddings")
        embedded_words = tf.nn.embedding_lookup(self.embeddings, self.input_x)

        # Funnel the words into the LSTM.
        # Current size: (batch_size, n_words, emb_dim)
        # Want:         [(batch_size, n_hidden) * n_words]
        #
        # Since otherwise there's no way to feed information into the LSTM cell.
        # Yes, it's a bit confusing, because we want a batch of multiple
        # sequences, with each step being of 'embedding_size'.
        embedded_words = tf.transpose(embedded_words, [1, 0, 2])
        embedded_words = tf.reshape(embedded_words, [-1, embedding_size])
        # Note: 'tf.split' outputs a **Python** list.
        embedded_words = tf.split(0, sequence_length, embedded_words)

        # Layer 2: LSTM cell
        lstm_use_peepholes = True
        # 'state_is_tuple = True' should NOT be used despite the warnings
        # (which appear as of TF 0.9), since it doesn't work on the version of
        # TF installed on Euler (0.8).
        if layer_count > 1:
            print("Using deep {0}-layer LSTM with first layer size {1}"
                  " (embedding size) and hidden layer size {2}."
                  .format(layer_count, embedding_size, hidden_size))
            print("First cell {0}->{1}".format(embedding_size, embedding_size))
            first_cell = TextLSTM._cell(embedding_size,
                                        embedding_size,
                                        lstm_use_peepholes,
                                        self.dropout_keep_prob)
            print("Second cell {0}->{1}".format(embedding_size, hidden_size))
            second_cell = TextLSTM._cell(embedding_size,
                                         hidden_size,
                                         lstm_use_peepholes,
                                         self.dropout_keep_prob)
            print("Third cell+ {0}->{1} (if applicable)".format(hidden_size,
                                                                hidden_size))
            third_plus = TextLSTM._cell(hidden_size,
                                        hidden_size,
                                        lstm_use_peepholes,
                                        self.dropout_keep_prob)
            deep_cells = [third_plus] * (layer_count - 2)
            lstm_cells = rnn_cell.MultiRNNCell([first_cell, second_cell] +
                                               deep_cells)
        else:
            print("Using simple 1-layer LSTM with hidden layer size {0}."
                  .format(hidden_size))
            lstm_cells = rnn_cell.LSTMCell(num_units=hidden_size,
                                           input_size=embedding_size,
                                           forget_bias=1.0,
                                           use_peepholes=lstm_use_peepholes)

        # Q: Can't batches end up containing both positive and negative labels?
        #    Can the LSTM batch training deal with this?
        #
        # A: Yes. Each batch feeds each sentence into the LSTM, incurs the loss,
        #    and backpropagates the error separately. Each example in a batch
        #    is independent. Note that as opposed to language models, for
        #    instance, where we incur a loss for all outputs, in this case we
        #    only care about the final output of the RNN, since it doesn't make
        #    sense to classify incomplete tweets.

        outputs, _states = rnn(lstm_cells,
                               inputs=embedded_words,
                               dtype=tf.float32)

        # Layer 3: Final Softmax
        out_weight = tf.Variable(tf.random_normal([hidden_size, n_classes]))
        out_bias = tf.Variable(tf.random_normal([n_classes]))

        with tf.name_scope("output"):
            lstm_final_output = outputs[-1]
            self.scores = tf.nn.xw_plus_b(lstm_final_output, out_weight,
                                          out_bias, name="scores")
            self.predictions = tf.nn.softmax(self.scores, name="predictions")

        with tf.name_scope("loss"):
            self.losses = tf.nn.softmax_cross_entropy_with_logits(self.scores,
                                                                  self.input_y)
            self.loss = tf.reduce_mean(self.losses, name="loss")

        with tf.name_scope("accuracy"):
            self.correct_pred = tf.equal(tf.argmax(self.predictions, 1),
                                         tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, "float"),
                                           name="accuracy")
Example #10
    def __init__(self, config, is_training=False):
        self.config = config
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        self.hidden_size = hidden_size = config.hidden_size
        self.num_layers = 1
        vocab_size = config.vocab_size
        self.max_grad_norm = config.max_grad_norm
        self.use_lstm = config.use_lstm

        # Placeholders for inputs.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.initial_state = array_ops.zeros(
            array_ops.pack([self.batch_size, self.num_steps]),
            dtype=tf.float32)
        # set_shape modifies the tensor in place and returns None, so it must
        # not be chained onto the assignment.
        self.initial_state.set_shape([None, self.num_steps])

        embedding = tf.get_variable(
            'embedding', [self.config.vocab_size, self.config.hidden_size])

        # Set up ACT cell and inner rnn-type cell for use inside the ACT cell.
        with tf.variable_scope("rnn"):
            if self.use_lstm:
                inner_cell = rnn_cell.BasicLSTMCell(self.config.hidden_size)
            else:
                inner_cell = rnn_cell.GRUCell(self.config.hidden_size)

        with tf.variable_scope("ACT"):

            act = ACTCell(self.config.hidden_size,
                          inner_cell,
                          config.epsilon,
                          max_computation=config.max_computation,
                          batch_size=self.batch_size)

        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        inputs = [
            tf.squeeze(single_input, [1])
            for single_input in tf.split(1, self.config.num_steps, inputs)
        ]

        self.outputs, final_state = rnn(act, inputs, dtype=tf.float32)

        # Softmax to get probability distribution over vocab.
        output = tf.reshape(tf.concat(1, self.outputs), [-1, hidden_size])
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        self.logits = tf.matmul(
            output,
            softmax_w) + softmax_b  # dim (numsteps*batchsize, vocabsize)

        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * num_steps])], vocab_size)

        # Add up loss and retrieve batch-normalised ponder cost: sum N + sum Remainder.
        ponder_cost = act.calculate_ponder_cost(
            time_penalty=self.config.ponder_time_penalty)
        self.cost = (tf.reduce_sum(loss) / batch_size) + ponder_cost
        self.final_state = self.outputs[-1]

        if is_training:
            self.lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                              self.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
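The cost above adds a ponder term to the cross-entropy loss. Below is a toy NumPy sketch of how ACT's ponder cost combines step counts and remainders; the shapes and the mean-over-batch normalisation are assumptions consistent with the "batch-normalised" comment, not the ACTCell internals.

import numpy as np

time_penalty = 0.01                      # config.ponder_time_penalty
steps = np.array([3.0, 5.0, 4.0])        # pondering steps N per example
remainders = np.array([0.2, 0.7, 0.4])   # 1 - accumulated halting probability
ponder_cost = time_penalty * np.mean(steps + remainders)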
Example #11
    def ACTStep(self, batch_mask, prob_compare, prob, counter, state, input,
                acc_outputs, acc_states):

        # General idea: generate halting probabilities and accumulate them. Stop when the accumulated probs
        # reach a halting value, 1-eps. At each timestep, multiply the prob with the rnn output/state.
        # There is a subtlety here regarding the batch_size, as clearly we will have examples halting
        # at different points in the batch. This is dealt with using logical masks to protect accumulated
        # probabilities, states and outputs from a timestep t's contribution if they have already reached
        # 1-eps at a timestep s < t. On the last timestep, the remainder of every example in the batch is
        # multiplied with the state/output, having been accumulated over the timesteps and correctly carried
        # through for all examples, regardless of #overall batch timesteps.

        # if all the probs are zero, we are seeing a new input => binary flag := 1, else 0.
        binary_flag = tf.cond(tf.reduce_all(tf.equal(prob, 0.0)),
                              lambda: tf.ones([self.batch_size, 1], dtype=tf.float32),
                              lambda: tf.zeros([self.batch_size, 1], tf.float32))

        input_with_flags = tf.concat(1, [binary_flag, input])
        output, new_state = rnn(self.cell, [input_with_flags], state,
                                scope=type(self.cell).__name__)

        with tf.variable_scope('sigmoid_activation_for_pondering'):
            p = tf.squeeze(tf.sigmoid(tf.nn.rnn_cell._linear(new_state, 1, True)))

        # multiply by the previous mask: if we stopped before, we don't want to start again
        # if we generate a p less than p_t-1 for a given example.
        new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                        batch_mask)

        new_float_mask = tf.cast(new_batch_mask, tf.float32)

        # only increase the prob accumulator for the examples
        # which haven't already passed the threshold. This
        # means that we can just use the final prob value per
        # example to determine the remainder.
        prob += p * new_float_mask

        # this accumulator is used solely in the While loop condition.
        # we multiply by the PREVIOUS batch mask, to capture probabilities
        # that have gone over 1-eps THIS iteration.
        prob_compare += p * tf.cast(batch_mask, tf.float32)

        def use_remainder():

            # runs on the last iteration of while loop. prob now contains
            # exactly the probability at N-1, ie the timestep before we
            # go over 1-eps for all elements of the batch.

            remainder = tf.constant(1.0, tf.float32, [self.batch_size]) - prob
            remainder_expanded = tf.expand_dims(remainder, 1)
            tiled_remainder = tf.tile(remainder_expanded, [1, self.output_size])

            acc_state = (new_state * tiled_remainder) + acc_states
            acc_output = (output[0] * tiled_remainder) + acc_outputs
            return acc_state, acc_output

        def normal():

            # accumulate normally, by multiplying the batch
            # probs with the output and state of the rnn.
            # If we passed the 1-eps threshold this round, we
            # have a zero in the batch mask, so we add no contribution
            # to acc_state or acc_output

            p_expanded = tf.expand_dims(p * new_float_mask, 1)
            tiled_p = tf.tile(p_expanded, [1, self.output_size])

            acc_state = (new_state * tiled_p) + acc_states
            acc_output = (output[0] * tiled_p) + acc_outputs
            return acc_state, acc_output

        # only increase the counter for those probabilities that
        # did not go over 1-eps in this iteration.
        counter += tf.constant(1.0, tf.float32, [self.batch_size]) * new_float_mask

        # halting condition (halts, and uses the remainder when this is FALSE):
        # if the batch mask is all zeros, then all batches have finished.
        # if any batch element still has both a prob < 1-eps AND counter < N, we continue.

        counter_condition = tf.less(counter, self.N)
        condition = tf.reduce_any(tf.logical_and(new_batch_mask, counter_condition))

        acc_state, acc_output = tf.cond(condition, normal, use_remainder)

        return [new_batch_mask, prob_compare, prob, counter, new_state, input,
                acc_output, acc_state]
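The masking logic in ACTStep is easier to see on concrete numbers. Below is a toy NumPy sketch of one step with illustrative values: an example whose accumulated probability would cross 1-eps drops out of the mask, stops accumulating prob, and stops incrementing its counter.

import numpy as np

one_minus_eps = 0.99
prob = np.array([0.4, 0.95, 0.0])   # accumulated halting probabilities
p = np.array([0.3, 0.3, 0.2])       # halting probabilities from this step
batch_mask = np.array([True, True, True])

new_batch_mask = np.logical_and(prob + p < one_minus_eps, batch_mask)
new_float_mask = new_batch_mask.astype(np.float32)   # [1.0, 0.0, 1.0]
prob += p * new_float_mask          # [0.7, 0.95, 0.2]; example 1 has halted
counter = np.zeros(3)
counter += 1.0 * new_float_mask     # only still-running examples advance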
Example #12
    def ACTStep(self, batch_mask, prob_compare, prob, counter, state, input,
                acc_outputs, acc_states):

        # General idea: generate halting probabilities and accumulate them. Stop when the accumulated probs
        # reach a halting value, 1-eps. At each timestep, multiply the prob with the rnn output/state.
        # There is a subtlety here regarding the batch_size, as clearly we will have examples halting
        # at different points in the batch. This is dealt with using logical masks to protect accumulated
        # probabilities, states and outputs from a timestep t's contribution if they have already reached
        # 1-eps at a timestep s < t. On the last timestep, the remainder of every example in the batch is
        # multiplied with the state/output, having been accumulated over the timesteps and correctly carried
        # through for all examples, regardless of #overall batch timesteps.

        # if all the probs are zero, we are seeing a new input => binary flag := 1, else 0.
        binary_flag = tf.cond(
            tf.reduce_all(tf.equal(prob, 0.0)),
            lambda: tf.ones([self.batch_size, 1], dtype=tf.float32),
            lambda: tf.zeros([self.batch_size, 1], tf.float32))

        input_with_flags = tf.concat(1, [binary_flag, input])

        output, new_state = rnn(self.cell, [input_with_flags],
                                state,
                                scope=type(self.cell).__name__)

        with tf.variable_scope('sigmoid_activation_for_pondering'):
            p = tf.squeeze(
                tf.sigmoid(tf.nn.rnn_cell._linear(new_state, 1, True)))

        # multiply by the previous mask: if we stopped before, we don't want to start again
        # if we generate a p less than p_t-1 for a given example.
        new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                        batch_mask)

        new_float_mask = tf.cast(new_batch_mask, tf.float32)

        # only increase the prob accumulator for the examples
        # which haven't already passed the threshold. This
        # means that we can just use the final prob value per
        # example to determine the remainder.
        prob += p * new_float_mask

        # this accumulator is used solely in the While loop condition.
        # we multiply by the PREVIOUS batch mask, to capture probabilities
        # that have gone over 1-eps THIS iteration.
        prob_compare += p * tf.cast(batch_mask, tf.float32)

        def use_remainder():

            # runs on the last iteration of while loop. prob now contains
            # exactly the probability at N-1, ie the timestep before we
            # go over 1-eps for all elements of the batch.

            remainder = tf.constant(1.0, tf.float32, [self.batch_size]) - prob
            remainder_expanded = tf.expand_dims(remainder, 1)
            tiled_remainder = tf.tile(remainder_expanded,
                                      [1, self.output_size])

            acc_state = (new_state * tiled_remainder) + acc_states
            acc_output = (output[0] * tiled_remainder) + acc_outputs
            return acc_state, acc_output

        def normal():

            # accumulate normally, by multiplying the batch
            # probs with the output and state of the rnn.
            # If we passed the 1-eps threshold this round, we
            # have a zero in the batch mask, so we add no contribution
            # to acc_state or acc_output

            p_expanded = tf.expand_dims(p * new_float_mask, 1)
            tiled_p = tf.tile(p_expanded, [1, self.output_size])

            acc_state = (new_state * tiled_p) + acc_states
            acc_output = (output[0] * tiled_p) + acc_outputs
            return acc_state, acc_output

        # only increase the counter for those probabilities that
        # did not go over 1-eps in this iteration.
        counter += tf.constant(1.0, tf.float32,
                               [self.batch_size]) * new_float_mask

        # halting condition (halts, and uses the remainder when this is FALSE):
        # if the batch mask is all zeros, then all batches have finished.
        # if any batch element still has both a prob < 1-eps AND counter < N, we continue.

        counter_condition = tf.less(counter, self.N)
        condition = tf.reduce_any(
            tf.logical_and(new_batch_mask, counter_condition))

        acc_state, acc_output = tf.cond(condition, normal, use_remainder)

        return [
            new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state
        ]
Example #13
    def ACTStep(self, batch_mask, prob_compare, prob, counter, state, input,
                acc_outputs, acc_states):

        # TODO: leavesbreathe, implement batch norming hidden state?
        output, new_state = rnn(self.cell, [input],
                                state,
                                scope=type(self.cell).__name__)

        if self.use_lstm:
            input_for_eval_halting_p, _ = tf.split(1, 2, new_state)
        else:
            input_for_eval_halting_p = new_state
        p = self.CalculateHaltingProbability(input_for_eval_halting_p)

        # here we create a mask on the p vector, which we then multiply with the state/output.
        # if p[i] = 0, then we have passed the remainder point for that example, so we multiply
        # the state/output vector by this masked probability (which has zeros if the prob for
        # a batch has passed the stopping point) so we carry none of it forward.
        # If, by adding p, we pass the boundary, we don't add p onto prob - this allows us to
        # use use_remainder() as normal for all steps after ALL examples have taken their max time.

        # multiply by the previous mask: if we stopped before, we don't want to start again
        new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                        batch_mask)
        float_mask = tf.cast(new_batch_mask, tf.float32)

        # only increase the prob accumulator for the examples
        # which haven't already passed the threshold. This
        # means that we can just use the final prob value per
        # example to determine the remainder.
        prob += p * float_mask

        prob_compare += p * tf.cast(batch_mask, tf.float32)

        def use_remainder():
            remainder = tf.constant(1.0, tf.float32, [self.batch_size]) - prob
            remainder_expanded = tf.expand_dims(remainder, 1)
            #leavesbreathe commented out the tiling below for lstm implementation
            # tiled_remainder = tf.tile(remainder_expanded,[1,self.output_size])

            acc_state = tf.add(tf.mul(new_state, remainder_expanded),
                               acc_states)
            acc_output = tf.add(tf.mul(output[0], remainder_expanded),
                                acc_outputs)
            return acc_state, acc_output

        def normal():
            p_expanded = tf.expand_dims(p * float_mask, 1)
            # tiled_p = tf.tile(p_expanded,[1,self.output_size])

            acc_state = tf.add(tf.mul(new_state, p_expanded), acc_states)
            acc_output = tf.mul(output[0], p_expanded) + acc_outputs
            return acc_state, acc_output

        # halting condition: if the batch mask is all zeros, then all batches have finished.
        # therefore, if the sum of the mask = 0, then we use the remainder.
        counter += tf.constant(1.0, tf.float32, [self.batch_size]) * float_mask

        counter_condition = tf.less(counter, self.N)
        condition = tf.reduce_any(
            tf.logical_and(new_batch_mask, counter_condition))

        acc_state, acc_output = tf.cond(condition, normal, use_remainder)

        # only increment the counter for the examples which are still running
        # counter += tf.constant(1.0,tf.float32,[self.batch_size])
        return [
            new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state
        ]
Example #14
    def __init__(self,
                 sequence_length,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 layer_count=1,
                 **kw):
        assert layer_count >= 1, "An LSTM cannot have fewer than one layer."
        n_classes = kw.get('n_classes', 2)  # >2 not tested.
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length],
                                      name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, n_classes],
                                      name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # Layer 1: Word embeddings
        self.embeddings = tf.Variable(tf.random_uniform(
            [vocab_size, embedding_size], -0.1, 0.1),
                                      name="embeddings")
        embedded_words = tf.nn.embedding_lookup(self.embeddings, self.input_x)

        # Funnel the words into the LSTM.
        # Current size: (batch_size, n_words, emb_dim)
        # Want:         [(batch_size, n_hidden) * n_words]
        #
        # Since otherwise there's no way to feed information into the LSTM cell.
        # Yes, it's a bit confusing, because we want a batch of multiple
        # sequences, with each step being of 'embedding_size'.
        embedded_words = tf.transpose(embedded_words, [1, 0, 2])
        embedded_words = tf.reshape(embedded_words, [-1, embedding_size])
        # Note: 'tf.split' outputs a **Python** list.
        embedded_words = tf.split(0, sequence_length, embedded_words)

        # Layer 2: LSTM cell
        lstm_use_peepholes = True
        # 'state_is_tuple = True' should NOT be used despite the warnings
        # (which appear as of TF 0.9), since it doesn't work on the version of
        # TF installed on Euler (0.8).
        if layer_count > 1:
            print("Using deep {0}-layer LSTM with first layer size {1}"
                  " (embedding size) and hidden layer size {2}.".format(
                      layer_count, embedding_size, hidden_size))
            print("First cell {0}->{1}".format(embedding_size, embedding_size))
            first_cell = TextLSTM._cell(embedding_size, embedding_size,
                                        lstm_use_peepholes,
                                        self.dropout_keep_prob)
            print("Second cell {0}->{1}".format(embedding_size, hidden_size))
            second_cell = TextLSTM._cell(embedding_size, hidden_size,
                                         lstm_use_peepholes,
                                         self.dropout_keep_prob)
            print("Third cell+ {0}->{1} (if applicable)".format(
                hidden_size, hidden_size))
            third_plus = TextLSTM._cell(hidden_size, hidden_size,
                                        lstm_use_peepholes,
                                        self.dropout_keep_prob)
            deep_cells = [third_plus] * (layer_count - 2)
            lstm_cells = rnn_cell.MultiRNNCell([first_cell, second_cell] +
                                               deep_cells)
        else:
            print(
                "Using simple 1-layer LSTM with hidden layer size {0}.".format(
                    hidden_size))
            lstm_cells = rnn_cell.LSTMCell(num_units=hidden_size,
                                           input_size=embedding_size,
                                           forget_bias=1.0,
                                           use_peepholes=lstm_use_peepholes)

        # Q: Can't batches end up containing both positive and negative labels?
        #    Can the LSTM batch training deal with this?
        #
        # A: Yes. Each batch feeds each sentence into the LSTM, incurs the loss,
        #    and backpropagates the error separately. Each example in a batch
        #    is independent. Note that as opposed to language models, for
        #    instance, where we incur a loss for all outputs, in this case we
        #    only care about the final output of the RNN, since it doesn't make
        #    sense to classify incomplete tweets.

        outputs, _states = rnn(lstm_cells,
                               inputs=embedded_words,
                               dtype=tf.float32)

        # Layer 3: Final Softmax
        out_weight = tf.Variable(tf.random_normal([hidden_size, n_classes]))
        out_bias = tf.Variable(tf.random_normal([n_classes]))

        with tf.name_scope("output"):
            lstm_final_output = outputs[-1]
            self.scores = tf.nn.xw_plus_b(lstm_final_output,
                                          out_weight,
                                          out_bias,
                                          name="scores")
            self.predictions = tf.nn.softmax(self.scores, name="predictions")

        with tf.name_scope("loss"):
            self.losses = tf.nn.softmax_cross_entropy_with_logits(
                self.scores, self.input_y)
            self.loss = tf.reduce_mean(self.losses, name="loss")

        with tf.name_scope("accuracy"):
            self.correct_pred = tf.equal(tf.argmax(self.predictions, 1),
                                         tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, "float"),
                                           name="accuracy")
Example #15
    def act_step(self, batch_mask, prob_compare, prob, counter, state, input,
                 acc_outputs, acc_states):
        '''
        General idea: generate halting probabilities and accumulate them. Stop when the accumulated probs
        reach a halting value, 1-eps. At each timestep, multiply the prob with the rnn output/state.
        There is a subtlety here regarding the batch_size, as clearly we will have examples halting
        at different points in the batch. This is dealt with using logical masks to protect accumulated
        probabilities, states and outputs from a timestep t's contribution if they have already reached
        1-eps at a timestep s < t. On the last timestep for each element in the batch the remainder is
        multiplied with the state/output, having been accumulated over the timesteps, as this takes
        into account the epsilon value.
        '''

        # If all the probs are zero, we are seeing a new input => binary flag := 1, else 0.
        binary_flag = tf.cond(
            tf.reduce_all(tf.equal(prob, 0.0)),
            lambda: tf.ones([self.batch_size, 1], dtype=tf.float32),
            lambda: tf.zeros([self.batch_size, 1], tf.float32))

        input_with_flags = tf.concat(1, [binary_flag, input])
        output, new_state = rnn(self.cell, [input_with_flags],
                                state,
                                scope=type(self.cell).__name__)

        with tf.variable_scope('sigmoid_activation_for_pondering'):
            p = tf.squeeze(
                tf.sigmoid(tf.nn.rnn_cell._linear(new_state, 1, True)))

        # Multiply by the previous mask: if we stopped before, we don't want to start again
        # if we generate a p less than p_t-1 for a given example.
        new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                        batch_mask)
        new_float_mask = tf.cast(new_batch_mask, tf.float32)

        # Only increase the prob accumulator for the examples
        # which haven't already passed the threshold. This
        # means that we can just use the final prob value per
        # example to determine the remainder.
        prob += p * new_float_mask

        # This accumulator is used solely in the While loop condition.
        # we multiply by the PREVIOUS batch mask, to capture probabilities
        # that have gone over 1-eps THIS iteration.
        prob_compare += p * tf.cast(batch_mask, tf.float32)

        # Only increase the counter for those probabilities that
        # did not go over 1-eps in this iteration.
        counter += new_float_mask

        # Halting condition (halts, and uses the remainder when this is FALSE):
        # If any batch element still has both a prob < 1 - epsilon AND counter < N we
        # continue, using the output probability p.
        counter_condition = tf.less(counter, self.N)

        final_iteration_condition = tf.logical_and(new_batch_mask,
                                                   counter_condition)
        use_remainder = tf.expand_dims(1.0 - prob, -1)
        use_probability = tf.expand_dims(p, -1)
        update_weight = tf.select(final_iteration_condition, use_probability,
                                  use_remainder)
        float_mask = tf.expand_dims(tf.cast(batch_mask, tf.float32), -1)

        acc_state = (new_state * update_weight * float_mask) + acc_states
        acc_output = (output[0] * update_weight * float_mask) + acc_outputs

        return [
            new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state
        ]
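Finally, a toy NumPy sketch of the update-weight selection above, with np.where standing in for tf.select and only the probability part of the halting condition shown; the values are illustrative. A still-running example is weighted by its current p, while a halting example gets the remainder 1 - prob.

import numpy as np

prob = np.array([0.7, 0.95])         # accumulated halting probabilities
p = np.array([0.2, 0.3])             # current halting probabilities
still_running = prob + p < 0.99      # one_minus_eps; -> [True, False]
update_weight = np.where(still_running, p, 1.0 - prob)  # -> [0.2, 0.05]
# acc_state/acc_output then accumulate state * update_weight per example.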