Example #1
  def testWeightSpecificSparsity(self):
    param_list = [
        "begin_pruning_step=1", "pruning_frequency=1", "end_pruning_step=100",
        "target_sparsity=0.5", "weight_sparsity_map=[layer2/weights:0.75]",
        "threshold_decay=0.0"
    ]
    test_spec = ",".join(param_list)
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    with variable_scope.variable_scope("layer1"):
      w1 = variables.Variable(
          math_ops.linspace(1.0, 100.0, 100), name="weights")
      _ = pruning.apply_mask(w1)
    with variable_scope.variable_scope("layer2"):
      w2 = variables.Variable(
          math_ops.linspace(1.0, 100.0, 100), name="weights")
      _ = pruning.apply_mask(w2)

    p = pruning.Pruning(pruning_hparams)
    mask_update_op = p.conditional_mask_update_op()
    increment_global_step = state_ops.assign_add(self.global_step, 1)

    with self.cached_session() as session:
      variables.global_variables_initializer().run()
      for _ in range(110):
        session.run(mask_update_op)
        session.run(increment_global_step)

      self.assertAllEqual(
          session.run(pruning.get_weight_sparsity()), [0.5, 0.75])
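For intuition, the assertion above boils down to magnitude-based masking at two different sparsity targets. A minimal NumPy sketch of that masking (a plain percentile threshold; the library's pruning schedule and threshold decay are more involved):

import numpy as np

def magnitude_mask(weights, target_sparsity):
    # Zero out the smallest-magnitude fraction of the weights.
    threshold = np.percentile(np.abs(weights), target_sparsity * 100)
    return (np.abs(weights) > threshold).astype(weights.dtype)

w = np.linspace(1.0, 100.0, 100)
mask = magnitude_mask(w, 0.75)   # the layer2 override from weight_sparsity_map
print(1.0 - mask.mean())         # observed sparsity, approximately 0.75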
Example #2
  def testFunctionCallInDifferentVariableScopes(self):

    @function.Defun(dtypes.float32)
    def Foo(inputs):
      var = variable_scope.get_variable(
          "var",
          shape=[10],
          dtype=dtypes.float32,
          initializer=init_ops.ones_initializer())
      return inputs + var

    input_op = array_ops.placeholder(shape=[10], dtype=dtypes.float32)
    with variable_scope.variable_scope("vs1"):
      out1_op = Foo(input_op)

    with variable_scope.variable_scope("vs2"):
      out2_op = Foo(input_op)

    global_vars = variables.global_variables()
    self.assertEqual(len(global_vars), 1)
    self.assertEqual(global_vars[0].name, "vs1/var:0")

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())
      out1, out2 = sess.run(
          [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)})
      self.assertAllEqual(out1, np.linspace(2, 11, 10))
      self.assertAllEqual(out2, np.linspace(2, 11, 10))
Example #3
 def call(self, inputs, state):
   """Gated recurrent unit (GRU) with nunits cells."""
   with vs.variable_scope("gates"):  # Reset gate and update gate.
     # We start with bias of 1.0 to not reset and not update.
     bias_ones = self._bias_initializer
     if self._bias_initializer is None:
       dtype = inputs.dtype
       bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
     # pylint: disable=protected-access
     value = math_ops.sigmoid(
         rnn_cell_impl._linear([inputs, state], 2 * self._num_units, True,
                               bias_ones, self._kernel_initializer))
     r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
     # pylint: enable=protected-access
   with vs.variable_scope("candidate"):
     # pylint: disable=protected-access
     with vs.variable_scope("input_projection"):
       hi = rnn_cell_impl._linear(inputs, self._num_units, True,
                                  self._bias_initializer,
                                  self._kernel_initializer)
     with vs.variable_scope("hidden_projection"):
       hh = r * (rnn_cell_impl._linear(state, self._num_units, True,
                                       self._bias_initializer,
                                       self._kernel_initializer))
     # pylint: enable=protected-access
     c = self._activation(hi + hh)
   new_h = u * state + (1 - u) * c
   return new_h, new_h
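Stripped of variable scoping, the call above computes the GRU update below. A NumPy sketch, where the weight and bias arrays stand in for the variables `_linear` creates and tanh is assumed as the activation:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step(x, h, w_gates, b_gates, w_xc, b_c, w_hc):
    # "gates" scope: reset (r) and update (u) from one linear map over [x, h].
    gates = sigmoid(np.concatenate([x, h], axis=1) @ w_gates + b_gates)
    r, u = np.split(gates, 2, axis=1)
    # "candidate" scope: input projection plus reset-gated hidden projection.
    c = np.tanh(x @ w_xc + b_c + r * (h @ w_hc))
    # Blend the previous state and the candidate with the update gate.
    return u * h + (1 - u) * c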
Example #4
  def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    
    with vs.variable_scope(scope or type(self).__name__):
      assert self._dropMaskInput.get_shape()[1:] == inputs.get_shape()[1:], (
          "dropout mask shape %s does not match input shape %s"
          % (self._dropMaskInput.get_shape()[1:], inputs.get_shape()[1:]))
      assert self._dropMaskState.get_shape()[1:] == state.get_shape()[1:], (
          "dropout mask shape %s does not match state shape %s"
          % (self._dropMaskState.get_shape()[1:], state.get_shape()[1:]))
      dropin = tf.mul(self._dropMaskInput, inputs)
      dropst = tf.mul(self._dropMaskState, state)

      with vs.variable_scope("Gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        concat = rnn_cell._linear([dropin, dropst], 2 * self._num_units, True, 1.0)
        r, u = tf.split(1, 2, concat)
        r, u = tf.sigmoid(r), tf.sigmoid(u)

      with vs.variable_scope("Candidate"):
        htilda = self._activation(rnn_cell._linear([dropin, r * dropst], self._num_units, True))

      new_h = u * dropst + (1 - u) * htilda

    return new_h, new_h
Example #5
  def testInitFromCheckpoint(self):
    checkpoint_dir = self.get_temp_dir()
    with self.test_session() as session:
      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)

    # New graph and session.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as session:
        with variable_scope.variable_scope("some_scope"):
          my1 = variable_scope.get_variable("my1", [1, 10])
          with variable_scope.variable_scope("some_other_scope"):
            my2 = variable_scope.get_variable("my2", [10, 10])
            with variable_scope.variable_scope("other_useful_scope"):
              my4 = variable_scope.get_variable("var4", [9, 9])
        my3 = variable_scope.get_variable("my3", [100, 100])

        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
            "var1": "some_scope/my1",
            "useful_scope/": "some_scope/some_other_scope/other_useful_scope/",
        })
        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
            "var2": "some_scope/some_other_scope/my2",
            "var3": my3,
        })

        session.run(variables.global_variables_initializer())
        self.assertAllEqual(my1.eval(session), v1)
        self.assertAllEqual(my2.eval(session), v2)
        self.assertAllEqual(my3.eval(session), v3)
        self.assertAllEqual(my4.eval(session), v4)

        # Check that tensors are not explicitly in the graph.
        self.assertLess(len(str(session.graph.as_graph_def())), 29000)
Example #6
 def testGRUCell(self):
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 2])
       m = array_ops.zeros([1, 2])
       g, _ = rnn_cell_impl.GRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run(
           [g], {x.name: np.array([[1., 1.]]),
                 m.name: np.array([[0.1, 0.1]])})
       # Smoke test
       self.assertAllClose(res[0], [[0.175991, 0.175991]])
     with variable_scope.variable_scope(
         "other", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros(
           [1, 3])  # Test GRUCell with input_size != num_units.
       m = array_ops.zeros([1, 2])
       g, _ = rnn_cell_impl.GRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run(
           [g],
           {x.name: np.array([[1., 1., 1.]]),
            m.name: np.array([[0.1, 0.1]])})
       # Smoke test
       self.assertAllClose(res[0], [[0.156736, 0.156736]])
Example #7
 def __call__(self, inputs, state, scope=None):
   """Long short-term memory cell with attention (LSTMA)."""
   with vs.variable_scope(scope or type(self).__name__):
     if self._state_is_tuple:
       state, attns, attn_states = state
     else:
       states = state
       state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
       attns = array_ops.slice(
           states, [0, self._cell.state_size], [-1, self._attn_size])
       attn_states = array_ops.slice(
           states, [0, self._cell.state_size + self._attn_size],
           [-1, self._attn_size * self._attn_length])
     attn_states = array_ops.reshape(attn_states,
                                     [-1, self._attn_length, self._attn_size])
     input_size = self._input_size
     if input_size is None:
       input_size = inputs.get_shape().as_list()[1]
     inputs = _linear([inputs, attns], input_size, True)
     lstm_output, new_state = self._cell(inputs, state)
     if self._state_is_tuple:
       new_state_cat = array_ops.concat(1, _unpacked_state(new_state))
     else:
       new_state_cat = new_state
     new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
     with vs.variable_scope("AttnOutputProjection"):
       output = _linear([lstm_output, new_attns], self._attn_size, True)
     new_attn_states = array_ops.concat(1, [new_attn_states,
                                            array_ops.expand_dims(output, 1)])
     new_attn_states = array_ops.reshape(
         new_attn_states, [-1, self._attn_length * self._attn_size])
     new_state = (new_state, new_attns, new_attn_states)
     if not self._state_is_tuple:
       new_state = array_ops.concat(1, list(new_state))
     return output, new_state
Example #8
  def __call__(self, inputs, state, scope=None):
    """Run this RNN cell on inputs, starting from the given state.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: if `self.state_size` is an integer, this should be a `2-D Tensor`
        with shape `[batch_size, self.state_size]`.  Otherwise, if
        `self.state_size` is a tuple of integers, this should be a tuple
        with shapes `[batch_size, s] for s in self.state_size`.
      scope: VariableScope for the created subgraph; defaults to class name.

    Returns:
      A pair containing:

      - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`.
      - New state: Either a single `2-D` tensor, or a tuple of tensors matching
        the arity and shapes of `state`.
    """
    if scope is not None:
      with vs.variable_scope(scope,
                             custom_getter=self._rnn_get_variable) as scope:
        return super(RNNCell, self).__call__(inputs, state, scope=scope)
    else:
      scope_attrname = "rnncell_scope"
      scope = getattr(self, scope_attrname, None)
      if scope is None:
        scope = vs.variable_scope(vs.get_variable_scope(),
                                  custom_getter=self._rnn_get_variable)
        setattr(self, scope_attrname, scope)
      with scope:
        return super(RNNCell, self).__call__(inputs, state)
Example #9
  def testReuse(self):

    def f(x):
      return core_layers.dense(x, self.CHANNELS // 2)

    def g(x):
      return core_layers.dense(x, self.CHANNELS // 2)

    x = random_ops.random_uniform(
        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
    x1, x2 = array_ops.split(x, 2, axis=-1)

    with variable_scope.variable_scope("test"):
      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)

    num_vars_before = len(variables.global_variables())

    with variable_scope.variable_scope("test", reuse=True):
      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)

    num_vars_after = len(variables.global_variables())
    self.assertEqual(num_vars_before, num_vars_after)

    loss = math_ops.reduce_mean(y1 + y2)
    _ = gradients_impl.gradients(loss,
                                 [x] + variables.trainable_variables())

    with variable_scope.variable_scope("test", reuse=True):
      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)

    num_vars_after = len(variables.global_variables())
    self.assertEqual(num_vars_before, num_vars_after)
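The test above checks variable reuse; the block itself follows the standard reversible-residual coupling, sketched here in NumPy under that assumption (f and g are arbitrary functions that preserve the width of their input):

import numpy as np

def rev_layer(x1, x2, f, g):
    # Forward coupling: each half is updated from the other half.
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    return y1, y2

def rev_layer_inverse(y1, y2, f, g):
    # The inputs can be reconstructed exactly, so activations need not be stored.
    x2 = y2 - g(y1)
    x1 = y1 - f(x2)
    return x1, x2

x1, x2 = np.random.rand(4, 8), np.random.rand(4, 8)
f = g = np.tanh
assert np.allclose((x1, x2), rev_layer_inverse(*rev_layer(x1, x2, f, g), f, g))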
Example #10
def reduce_to_final(images, num_filters_out, nhidden=None, scope=None):
  """Reduce an image to a final state by running two LSTMs.

  Args:
    images: (num_images, height, width, depth) tensor
    num_filters_out: output layer depth
    nhidden: hidden layer depth (defaults to num_filters_out)
    scope: optional scope name

  Returns:
    A (num_images, num_filters_out) batch.
  """
  with variable_scope.variable_scope(scope, "ReduceToFinal", [images]):
    nhidden = nhidden or num_filters_out
    batch_size, height, width, depth = _shape(images)
    transposed = array_ops.transpose(images, [1, 0, 2, 3])
    reshaped = array_ops.reshape(transposed,
                                 [height, batch_size * width, depth])
    with variable_scope.variable_scope("reduce1"):
      reduced = lstm1d.sequence_to_final(reshaped, nhidden)
      transposed_hidden = array_ops.reshape(reduced,
                                            [batch_size, width, nhidden])
      hidden = array_ops.transpose(transposed_hidden, [1, 0, 2])
    with variable_scope.variable_scope("reduce2"):
      output = lstm1d.sequence_to_final(hidden, num_filters_out)
    return output
Example #11
    def call(self, inputs, state, att_score=None):
        """Gated recurrent unit (GRU) with nunits cells."""
        if self._gate_linear is None:
            bias_ones = self._bias_initializer
            if self._bias_initializer is None:
                bias_ones = init_ops.constant_initializer(
                    1.0, dtype=inputs.dtype)
            with vs.variable_scope("gates"):  # Reset gate and update gate.
                self._gate_linear = _Linear(
                    [inputs, state],
                    2 * self._num_units,
                    True,
                    bias_initializer=bias_ones,
                    kernel_initializer=self._kernel_initializer)

        value = math_ops.sigmoid(self._gate_linear([inputs, state]))
        r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

        r_state = r * state
        if self._candidate_linear is None:
            with vs.variable_scope("candidate"):
                self._candidate_linear = _Linear(
                    [inputs, r_state],
                    self._num_units,
                    True,
                    bias_initializer=self._bias_initializer,
                    kernel_initializer=self._kernel_initializer)
        c = self._activation(self._candidate_linear([inputs, r_state]))
        u = (1.0 - att_score) * u
        new_h = u * state + (1 - u) * c
        return new_h, new_h
Example #12
 def _set_scope_for_nonnetwork_sublayer(self, sublayer):
   if sublayer._scope is None:
     if sublayer._first_parent is None:
       constituent_first_parent = None
     else:
       constituent_first_parent = sublayer._first_parent()
     if constituent_first_parent:
       constituent_first_parent._set_scope()
       parent_scope = constituent_first_parent._scope
     else:
       self._finalize_name(False)
       raise ValueError(
           ("The parent of a Layer added to Network %s was garbage collected "
            "before the Layer was built. If this limitation bothers you "
            "please file a feature request.") %
           (self.name,))
     with variable_scope.variable_scope(parent_scope):
       # Horrid hack to make Layer variable names which are direct
       # sub-layers of Networks conform to the Network variable naming
       # conventions.
       with variable_scope.variable_scope(
           None, use_resource=True,
           default_name=sublayer.name) as sub_scope:
         sublayer._scope = sub_scope
         # Also switch op naming for this Layer to match Network conventions,
         # i.e. op naming matching variable naming.
         sublayer._name_scope_name = _network_name_scope_naming
Example #13
def separable_lstm(images, num_filters_out,
                   kernel_size=None, nhidden=None, scope=None):
  """Run bidirectional LSTMs first horizontally then vertically.

  Args:
    images: (num_images, height, width, depth) tensor
    num_filters_out: output layer depth
    kernel_size: A list of length 2 holding the [kernel_height, kernel_width]
      of the pooling. Can be an int if both values are the same. Set to None
      to skip the blocking step.
    nhidden: hidden layer depth
    scope: optional scope name

  Returns:
    (num_images, height/kernel_height, width/kernel_width,
    num_filters_out) tensor
  """
  with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
    if nhidden is None:
      nhidden = num_filters_out
    if kernel_size is not None:
      images = get_blocks(images, kernel_size)
    hidden = horizontal_lstm(images, nhidden)
    with variable_scope.variable_scope("vertical"):
      transposed = array_ops.transpose(hidden, [0, 2, 1, 3])
      output_transposed = horizontal_lstm(transposed, num_filters_out)
    output = array_ops.transpose(output_transposed, [0, 2, 1, 3])
    return output
Example #14
 def _serving_ops(self, features):
   """Add ops for serving to the graph."""
   with variable_scope.variable_scope("model", use_resource=True):
     filtering_features = {}
     prediction_features = {}
     values_length = array_ops.shape(
         features[feature_keys.FilteringFeatures.VALUES])[1]
     for key, value in features.items():
       if key == feature_keys.State.STATE_TUPLE:
         # Ignore state input. The model's default start state is replicated
         # across the batch.
         continue
       if key == feature_keys.FilteringFeatures.VALUES:
         filtering_features[key] = value
       else:
         filtering_features[key] = value[:, :values_length]
         prediction_features[key] = value[:, values_length:]
     cold_filtering_outputs = self.model.define_loss(
         features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
     prediction_features[feature_keys.State.STATE_TUPLE] = (
         cold_filtering_outputs.end_state)
   with variable_scope.variable_scope("model", reuse=True):
     prediction_outputs = self.model.predict(
         features=prediction_features)
   return estimator_lib.EstimatorSpec(
       mode=estimator_lib.ModeKeys.PREDICT,
       export_outputs={
           feature_keys.SavedModelLabels.PREDICT:
               _NoStatePredictOutput(prediction_outputs),
       },
       # Likely unused, but it is necessary to return `predictions` to satisfy
       # the Estimator's error checking.
       predictions={})
Example #15
 def __call__(self, *args, **kwargs):
   if self._variable_scope:
     if self._variables_created:
       # This is not the first visit to __call__, so variables have already
       # been created, and we want to reuse them.
       with variable_scope.variable_scope(self._variable_scope,
                                          reuse=variable_scope.AUTO_REUSE):
         with self._eager_variable_store.as_default():
           return self._call_func(args, kwargs, check_for_new_variables=True)
     else:
       # This is the first visit to __call__, but the scope has already been
       # created in the constructor. Set _variables_created after the inner
       # function is successfully called so that subsequent calls take the if
       # branch above.
       with variable_scope.variable_scope(self._variable_scope,
                                          reuse=variable_scope.AUTO_REUSE):
         with self._eager_variable_store.as_default():
           result = self._call_func(args, kwargs,
                                    check_for_new_variables=False)
       self._variables_created = True
       return result
   else:
     # The scope was not created at construction time, so create it here.
     # Subsequent calls should reuse variables.
     with variable_scope.variable_scope(
         self._unique_name, self._name,
         custom_getter=self._custom_getter) as vs:
       self._variable_scope = vs
       with self._eager_variable_store.as_default():
         result = self._call_func(args, kwargs,
                                  check_for_new_variables=False)
       self._variables_created = True
       return result
Example #16
 def testAllowsReuseWithoutPartitioner(self):
   with variable_scope.variable_scope(
       "scope0", partitioner=axis0_into2_partitioner):
     v = variable_scope.get_variable("name0", shape=(3, 1, 1))
   with variable_scope.variable_scope("scope0", reuse=True):
     v_reused = variable_scope.get_variable("name0")
   self.assertEqual(v, v_reused)
Example #17
 def _serving_ops(self, features):
   """Add ops for serving to the graph."""
   with variable_scope.variable_scope("model", use_resource=True):
     prediction_outputs = self.model.predict(features=features)
   with variable_scope.variable_scope("model", reuse=True):
     filtering_outputs = self.create_loss(
         features, estimator_lib.ModeKeys.EVAL)
   with variable_scope.variable_scope("model", reuse=True):
     no_state_features = {
         k: v for k, v in features.items()
         if not k.startswith(feature_keys.State.STATE_PREFIX)}
     # Ignore any state management when cold-starting. The model's default
     # start state is replicated across the batch.
     cold_filtering_outputs = self.model.define_loss(
         features=no_state_features, mode=estimator_lib.ModeKeys.EVAL)
   return estimator_lib.EstimatorSpec(
       mode=estimator_lib.ModeKeys.PREDICT,
       export_outputs={
           feature_keys.SavedModelLabels.PREDICT:
               export_lib.PredictOutput(prediction_outputs),
           feature_keys.SavedModelLabels.FILTER:
               export_lib.PredictOutput(
                   state_to_dictionary(filtering_outputs.end_state)),
           feature_keys.SavedModelLabels.COLD_START_FILTER:
               _NoStatePredictOutput(
                   state_to_dictionary(cold_filtering_outputs.end_state))
       },
       # Likely unused, but it is necessary to return `predictions` to satisfy
       # the Estimator's error checking.
       predictions={})
Example #18
  def testVarOpScope(self):
    with self.test_session():
      with ops.name_scope("scope1"):
        with variable_scope.variable_scope("tower", "default", []):
          self.assertEqual(
              variable_scope.get_variable("w", []).name, "tower/w:0")
          with ops.name_scope("scope2") as sc2:
            self.assertEqual(sc2, "scope1/tower/scope2/")
        with variable_scope.variable_scope("tower", "default", []):
          with self.assertRaises(ValueError):
            variable_scope.get_variable("w", [])
          with ops.name_scope("scope2") as sc2:
            self.assertEqual(sc2, "scope1/tower_1/scope2/")

      with ops.name_scope("scope2"):
        with variable_scope.variable_scope(None, "default", []):
          self.assertEqual(
              variable_scope.get_variable("w", []).name, "default/w:0")
          with ops.name_scope("scope2") as sc2:
            self.assertEqual(sc2, "scope2/default/scope2/")
        with variable_scope.variable_scope(None, "default", []):
          self.assertEqual(
              variable_scope.get_variable("w", []).name, "default_1/w:0")
          with ops.name_scope("scope2") as sc2:
            self.assertEqual(sc2, "scope2/default_1/scope2/")
Example #19
  def testGetCollection(self):
    with self.test_session():
      a = variable_scope.get_variable("a", [])
      b = variable_scope.get_variable("b", [], trainable=False)
      with variable_scope.variable_scope("foo_") as scope1:
        a = variable_scope.get_variable("a", [])
        b = variable_scope.get_variable("b", [], trainable=False)

        self.assertEqual([
            v.name
            for v in scope1.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
        ], ["foo_/a:0"])
        self.assertEqual([
            v.name
            for v in scope1.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        ], ["foo_/a:0", "foo_/b:0"])
      with variable_scope.variable_scope("foo") as scope2:
        a = variable_scope.get_variable("a", [])
        b = variable_scope.get_variable("b", [], trainable=False)
        self.assertEqual([
            v.name
            for v in scope2.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
        ], ["foo/a:0"])
        self.assertEqual([
            v.name
            for v in scope2.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        ], ["foo/a:0", "foo/b:0"])
      scope = variable_scope.get_variable_scope()
      self.assertEqual([
          v.name for v in scope.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
      ], ["a:0", "b:0", "foo_/a:0", "foo_/b:0", "foo/a:0", "foo/b:0"])
      self.assertEqual([
          v.name
          for v in scope.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
      ], ["a:0", "foo_/a:0", "foo/a:0"])
Example #20
 def testBasicLSTMCell(self):
   for dtype in [dtypes.float16, dtypes.float32]:
     np_dtype = dtype.as_numpy_dtype
     with self.test_session(graph=ops.Graph()) as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2], dtype=dtype)
         m = array_ops.zeros([1, 8], dtype=dtype)
         cell = rnn_cell_impl.MultiRNNCell(
             [
                 rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                 for _ in range(2)
             ],
             state_is_tuple=False)
         self.assertEqual(cell.dtype, None)
         g, out_m = cell(x, m)
         # Layer infers the input type.
         self.assertEqual(cell.dtype, dtype.name)
         expected_variable_names = [
             "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
             rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
             rnn_cell_impl._BIAS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
             rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
             rnn_cell_impl._BIAS_VARIABLE_NAME
         ]
         self.assertEqual(expected_variable_names,
                          [v.name for v in cell.trainable_variables])
         self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run([g, out_m], {
             x.name: np.array([[1., 1.]]),
             m.name: 0.1 * np.ones([1, 8])
         })
         self.assertEqual(len(res), 2)
         variables = variables_lib.global_variables()
         self.assertEqual(expected_variable_names, [v.name for v in variables])
         # The numbers in results were not calculated, this is just a
         # smoke test.
         self.assertAllClose(res[0], np.array(
             [[0.240, 0.240]], dtype=np_dtype), 1e-2)
         expected_mem = np.array(
             [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
             dtype=np_dtype)
         self.assertAllClose(res[1], expected_mem, 1e-2)
       with variable_scope.variable_scope(
           "other", initializer=init_ops.constant_initializer(0.5)):
         # Test BasicLSTMCell with input_size != num_units.
         x = array_ops.zeros([1, 3], dtype=dtype)
         m = array_ops.zeros([1, 4], dtype=dtype)
         g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g, out_m], {
                 x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
                 m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
             })
         self.assertEqual(len(res), 2)
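For reference, each BasicLSTMCell layer above applies the standard LSTM update with a unit forget bias; a NumPy sketch of one step, with the state stored as the concatenation [c, h] because state_is_tuple=False:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def basic_lstm_step(x, state, kernel, bias, forget_bias=1.0):
    c, h = np.split(state, 2, axis=1)
    # One matmul yields the input (i), candidate (j), forget (f) and output (o) blocks.
    i, j, f, o = np.split(np.concatenate([x, h], axis=1) @ kernel + bias, 4, axis=1)
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    new_h = np.tanh(new_c) * sigmoid(o)
    return new_h, np.concatenate([new_c, new_h], axis=1)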
Example #21
  def testBlockGRUToGRUCellSingleStep(self):
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
      batch_size = 4
      cell_size = 5
      input_size = 6

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        output = rnn_cell.GRUCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([output], {x: x_value, h: h_value})

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        output = gru_ops.GRUBlockCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([output], {x: x_value, h: h_value})

      self.assertEqual(len(block_res), len(basic_res))
      for block, basic in zip(block_res, basic_res):
        self.assertAllClose(block, basic)
Example #22
  def testCustomGradientErrorsWithNonResourceVariables(self):

    def F(x, use_resource=False):
      with variable_scope.variable_scope("f", use_resource=use_resource):
        out = core_layers.dense(x, 4, use_bias=False)

      def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
        del out_grad
        self.assertEqual(1, len(variables))
        return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))])

      return out, Grad

    @custom_gradient.custom_gradient
    def FResource(x):
      return F(x, use_resource=True)

    @custom_gradient.custom_gradient
    def FNonResource(x):
      return F(x, use_resource=False)

    x = array_ops.ones((3, 2)) + 2.

    # Wrapping scope has use_resource=True but inner scope sets to False. Fails.
    with variable_scope.variable_scope("vs1", use_resource=True):
      with self.assertRaisesWithPredicateMatch(TypeError,
                                               "must be `ResourceVariable`s"):
        FNonResource(x)

    # Wrapping scope has use_resource=False but inner scope sets to True.
    # Passes.
    with variable_scope.variable_scope("vs2", use_resource=False):
      FResource(x)
Example #23
 def __call__(self, *args, **kwargs):
   # In both branches below, the template store is installed as default after
   # the variable scope is opened in order to ensure that templates nested at
   # the same level correctly uniquify lower variable scope names.
   if self._variable_scope:
     # Create a cache for the variable scope context manager the first time
     # around so that we don't have to keep recreating it.
     if not self._variable_scope_context_manager:
       self._variable_scope_context_manager = variable_scope.variable_scope(
           self._variable_scope, reuse=variable_scope.AUTO_REUSE)
     with self._variable_scope_context_manager:
       with self._template_store.as_default():
         result = self._call_func(args, kwargs)
     return result
   else:
     # The scope was not created at construction time, so create it here.
     # Subsequent calls should reuse variables.
     with variable_scope.variable_scope(
         self._unique_name, self._name,
         custom_getter=self._custom_getter) as vs:
       self._variable_scope = vs
       # Because the scope was not created at construction time, the template
       # store's variable scope name is unset; set it here.
       self._template_store.set_variable_scope_name(vs.name)
       with self._template_store.as_default():
         result = self._call_func(args, kwargs)
       return result
Example #24
 def dnn_logits_fn():
   """Builds the logits from the input layer."""
   previous_layer = input_layer
   for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
     with variable_scope.variable_scope(
         "hiddenlayer_%d" % layer_id,
         values=(previous_layer,)) as hidden_layer_scope:
       net = layers.fully_connected(
           previous_layer,
           num_hidden_units,
           activation_fn=dnn_activation_fn,
           variables_collections=[dnn_parent_scope],
           scope=hidden_layer_scope)
       if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
         net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
     _add_hidden_layer_summary(net, hidden_layer_scope.name)
     previous_layer = net
   with variable_scope.variable_scope(
       "logits", values=(previous_layer,)) as logits_scope:
     dnn_logits = layers.fully_connected(
         previous_layer,
         head.logits_dimension,
         activation_fn=None,
         variables_collections=[dnn_parent_scope],
         scope=logits_scope)
   _add_hidden_layer_summary(dnn_logits, logits_scope.name)
   return dnn_logits
Example #25
  def call(self, inputs, state):
    """
    """
    (c_prev, m_prev) = state
    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope, initializer=self._initializer):
      x = array_ops.concat([inputs, m_prev], axis=1)
      with vs.variable_scope("first_gemm"):
        if self._linear1 is None:
          # no bias for bottleneck
          self._linear1 = _Linear(x, self._fact_size, False)
        R_fact = self._linear1(x)
      with vs.variable_scope("second_gemm"):
        if self._linear2 is None:
          self._linear2 = _Linear(R_fact, 4*self._num_units, True)
        R = self._linear2(R_fact)
      i, j, f, o = array_ops.split(R, 4, 1)

      c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
           math_ops.sigmoid(i) * math_ops.tanh(j))
      m = math_ops.sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
      with vs.variable_scope("projection"):
        if self._linear3 is None:
          self._linear3 = _Linear(m, self._num_proj, False)
        m = self._linear3(m)

    new_state = rnn_cell_impl.LSTMStateTuple(c, m)
    return m, new_state
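The two scoped GEMMs above implement a low-rank (bottleneck) factorization of the usual LSTM input transform. A NumPy sketch of just that part, mirroring the code (no bias on the first projection):

import numpy as np

def factored_gates(x_and_m, w1, w2, b2):
    # x_and_m: concat of inputs and m_prev, shape (batch, input_size + num_units).
    r_fact = x_and_m @ w1           # "first_gemm": (batch, fact_size), no bias
    r = r_fact @ w2 + b2            # "second_gemm": (batch, 4 * num_units)
    return np.split(r, 4, axis=1)   # i, j, f, o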
Example #26
 def testIndyGRUCell(self):
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 2])
       m = array_ops.zeros([1, 2])
       g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run([g], {
           x.name: np.array([[1., 1.]]),
           m.name: np.array([[0.1, 0.1]])
       })
       # Smoke test
       self.assertAllClose(res[0], [[0.185265, 0.17704]])
     with variable_scope.variable_scope(
         "other", initializer=init_ops.constant_initializer(0.5)):
       # Test IndyGRUCell with input_size != num_units.
       x = array_ops.zeros([1, 3])
       m = array_ops.zeros([1, 2])
       g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run([g], {
           x.name: np.array([[1., 1., 1.]]),
           m.name: np.array([[0.1, 0.1]])
       })
       # Smoke test
       self.assertAllClose(res[0], [[0.155127, 0.157328]])
Example #27
 def __call__(self, inputs, state, scope=None):
   """Run this multi-layer cell on inputs, starting from state."""
   with vs.variable_scope(scope or "multi_rnn_cell"):
     cur_state_pos = 0
     cur_inp = inputs
     new_states = []
     outputs = []
     for i, cell in enumerate(self._cells):
       with vs.variable_scope("cell_%d" % i):
         if self._state_is_tuple:
           if not nest.is_sequence(state):
             raise ValueError(
                 "Expected state to be a tuple of length %d, but received: %s"
                 % (len(self.state_size), state))
           cur_state = state[i]
         else:
           cur_state = array_ops.slice(
               state, [0, cur_state_pos], [-1, cell.state_size])
           cur_state_pos += cell.state_size
         cur_inp, new_state = cell(cur_inp, cur_state)
         outputs.append(cur_inp)
         new_states.append(new_state)
   new_states = (tuple(new_states) if self._state_is_tuple else
                 array_ops.concat_v2(new_states, 1))
   return tuple(outputs), new_states
Example #28
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
  """Run a softmax layer over all the time steps of an input sequence.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: output depth
    scope: optional scope name
    name: optional name for output tensor
    linear_name: name for linear (pre-softmax) output

  Returns:
    A tensor of size (length, batch_size, noutput).

  """
  length, _, ninputs = _shape(inputs)
  inputs_u = array_ops.unstack(inputs)
  output_u = []
  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
    initial_b = constant_op.constant(0.1, shape=[noutput])
    w = variables.model_variable("weights", initializer=initial_w)
    b = variables.model_variable("biases", initializer=initial_b)
    for i in xrange(length):
      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                         [inputs_u[i]]):
        # TODO(tmb) consider using slim.fully_connected(...,
        # activation_fn=tf.nn.softmax)
        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
        output = nn_ops.softmax(linear)
        output_u += [output]
    outputs = array_ops.stack(output_u, name=name)
  return outputs
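Ignoring variable creation, the loop above applies one shared affine map followed by a softmax at every time step. The same computation, vectorized over time in NumPy:

import numpy as np

def sequence_softmax_np(inputs, w, b):
    # inputs: (length, batch_size, ninputs); w: (ninputs, noutput); b: (noutput,)
    logits = inputs @ w + b
    exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return exp / exp.sum(axis=-1, keepdims=True)   # (length, batch_size, noutput)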
Example #29
def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
  """Creates fully connected deep neural network subgraph.

  Args:
    tensor_in: tensor or placeholder for input features.
    hidden_units: list of counts of hidden units in each layer.
    activation: activation function between layers. Can be None.
    dropout: if not None, will add a dropout layer with given probability.

  Returns:
    A tensor which would be a deep neural network.
  """
  with vs.variable_scope('dnn'):
    for i, n_units in enumerate(hidden_units):
      with vs.variable_scope('layer%d' % i):
        # Weight initializer was set to None to replicate the behavior of
        # rnn_cell.linear. Using fully_connected's default initializer gets
        # slightly worse quality results on unit tests.
        tensor_in = layers.legacy_fully_connected(
            tensor_in,
            n_units,
            weight_init=None,
            weight_collections=['dnn_weights'],
            bias_collections=['dnn_biases'])
        if activation is not None:
          tensor_in = activation(tensor_in)
        if dropout is not None:
          is_training = array_ops_.squeeze(ops.get_collection('IS_TRAINING'))
          tensor_in = control_flow_ops.cond(
              is_training,
              lambda: dropout_ops.dropout(tensor_in, prob=(1.0 - dropout)),
              lambda: tensor_in)
    return tensor_in
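Dropout and variable collections aside, the subgraph built above is a stack of affine layers with an optional activation in between. A NumPy sketch under that reading (weights[i] and biases[i] are stand-ins for the parameters created in 'dnn/layer%d'):

import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def dnn_np(x, weights, biases, activation=relu):
    for w, b in zip(weights, biases):
        x = x @ w + b
        if activation is not None:
            x = activation(x)
    return x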
Example #30
def dnn_autoencoder(
    tensor_in, hidden_units, activation=nn.relu, add_noise=None, dropout=None,
    scope=None):
  """Creates fully connected autoencoder subgraph.

  Args:
    tensor_in: tensor or placeholder for input features.
    hidden_units: list of counts of hidden units in each layer.
    activation: activation function used to map inner latent layer onto
                reconstruction layer.
    add_noise: a function that adds noise to tensor_in,
           e.g. def add_noise(x):
                    return(x + np.random.normal(0, 0.1, (len(x), len(x[0]))))
    dropout: if not None, will add a dropout layer with given
             probability.
    scope: the variable scope for this op.

  Returns:
    Tensors for encoder and decoder.
  """
  with vs.variable_op_scope([tensor_in], scope, "autoencoder"):
    if add_noise is not None:
      tensor_in = add_noise(tensor_in)
    with vs.variable_scope("encoder"):
      # build DNN encoder
      encoder = dnn_ops.dnn(
          tensor_in, hidden_units, activation=activation, dropout=dropout)
    with vs.variable_scope("decoder"):
      # reverse hidden_units and built DNN decoder
      decoder = dnn_ops.dnn(
          encoder, hidden_units[::-1], activation=activation, dropout=dropout)
    return encoder, decoder
Example #31
def bidirectional_dynamic_rnn(cell_fw,
                              cell_bw,
                              inputs,
                              sequence_length=None,
                              initial_state_fw=None,
                              initial_state_bw=None,
                              dtype=None,
                              parallel_iterations=None,
                              swap_memory=False,
                              time_major=False,
                              scope=None):
    """Creates a dynamic version of bidirectional recurrent neural network.

  Similar to the unidirectional case above (rnn) but takes input and builds
  independent forward and backward RNNs. The input_size of forward and
  backward cell must match. The initial state for both directions is zero by
  default (but can be set optionally) and no intermediate states are ever
  returned -- the network is fully unrolled for the given (passed in)
  length(s) of the sequence(s) or completely unrolled if length(s) is not
  given.

  Args:
    cell_fw: An instance of RNNCell, to be used for forward direction.
    cell_bw: An instance of RNNCell, to be used for backward direction.
    inputs: The RNN inputs.
      If time_major == False (default), this must be a tensor of shape:
        `[batch_size, max_time, input_size]`.
      If time_major == True, this must be a tensor of shape:
        `[max_time, batch_size, input_size]`.
    sequence_length: An int32/int64 vector, size `[batch_size]`,
      containing the actual lengths for each of the sequences.
    initial_state_fw: (optional) An initial state for the forward RNN.
      This must be a tensor of appropriate type and shape
      `[batch_size, cell_fw.state_size]`.
      If `cell_fw.state_size` is a tuple, this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
    initial_state_bw: (optional) Same as for `initial_state_fw`, but using
      the corresponding properties of `cell_bw`.
    dtype: (optional) The data type for the initial states and expected output.
      Required if initial_states are not provided or RNN states have a
      heterogeneous dtype.
    parallel_iterations: (Default: 32).  The number of iterations to run in
      parallel.  Those operations which do not have any temporal dependency
      and can be run in parallel, will be.  This parameter trades off
      time for space.  Values >> 1 use more memory but take less time,
      while smaller values use less memory but computations take longer.
    swap_memory: Transparently swap the tensors produced in forward inference
      but needed for back prop from GPU to CPU.  This allows training RNNs
      which would typically not fit on a single GPU, with very minimal (or no)
      performance penalty.
    time_major: The shape format of the `inputs` and `outputs` Tensors.
      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
      Using `time_major = True` is a bit more efficient because it avoids
      transposes at the beginning and end of the RNN calculation.  However,
      most TensorFlow data is batch-major, so by default this function
      accepts input and emits output in batch-major form.
    scope: VariableScope for the created subgraph; defaults to
      "bidirectional_rnn"

  Returns:
    A tuple (outputs, output_states) where:
      outputs: A tuple (output_fw, output_bw) containing the forward and
        the backward rnn output `Tensor`.
        If time_major == False (default),
          output_fw will be a `Tensor` shaped:
          `[batch_size, max_time, cell_fw.output_size]`
          and output_bw will be a `Tensor` shaped:
          `[batch_size, max_time, cell_bw.output_size]`.
        If time_major == True,
          output_fw will be a `Tensor` shaped:
          `[max_time, batch_size, cell_fw.output_size]`
          and output_bw will be a `Tensor` shaped:
          `[max_time, batch_size, cell_bw.output_size]`.
        It returns a tuple instead of a single concatenated `Tensor`, unlike
        in the `bidirectional_rnn`. If the concatenated one is preferred,
        the forward and backward outputs can be concatenated as
        `tf.concat_v2(outputs, 2)`.
      output_states: A tuple (output_state_fw, output_state_bw) containing
        the forward and the backward final states of bidirectional rnn.

  Raises:
    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
  """

    # pylint: disable=protected-access
    if not isinstance(cell_fw, rnn_cell_impl._RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, rnn_cell_impl._RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    # pylint: enable=protected-access

    with vs.variable_scope(scope or "bidirectional_rnn"):
        # Forward direction
        with vs.variable_scope("fw") as fw_scope:
            output_fw, output_state_fw = dynamic_rnn(
                cell=cell_fw,
                inputs=inputs,
                sequence_length=sequence_length,
                initial_state=initial_state_fw,
                dtype=dtype,
                parallel_iterations=parallel_iterations,
                swap_memory=swap_memory,
                time_major=time_major,
                scope=fw_scope)

        # Backward direction
        if not time_major:
            time_dim = 1
            batch_dim = 0
        else:
            time_dim = 0
            batch_dim = 1

        with vs.variable_scope("bw") as bw_scope:
            inputs_reverse = array_ops.reverse_sequence(
                input=inputs,
                seq_lengths=sequence_length,
                seq_dim=time_dim,
                batch_dim=batch_dim)
            tmp, output_state_bw = dynamic_rnn(
                cell=cell_bw,
                inputs=inputs_reverse,
                sequence_length=sequence_length,
                initial_state=initial_state_bw,
                dtype=dtype,
                parallel_iterations=parallel_iterations,
                swap_memory=swap_memory,
                time_major=time_major,
                scope=bw_scope)

    output_bw = array_ops.reverse_sequence(input=tmp,
                                           seq_lengths=sequence_length,
                                           seq_dim=time_dim,
                                           batch_dim=batch_dim)

    outputs = (output_fw, output_bw)
    output_states = (output_state_fw, output_state_bw)

    return (outputs, output_states)
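The backward half above is implemented as "reverse the inputs, run a forward RNN, reverse the outputs back". A minimal NumPy sketch of that pattern with a generic step function and equal sequence lengths (the per-example lengths that reverse_sequence handles in the code are omitted):

import numpy as np

def dynamic_rnn_np(step, xs, h0):
    # xs is time-major: (max_time, batch, depth).
    h, outputs = h0, []
    for x_t in xs:
        h = step(x_t, h)
        outputs.append(h)
    return np.stack(outputs), h

def bidirectional_rnn_np(step_fw, step_bw, xs, h0_fw, h0_bw):
    out_fw, state_fw = dynamic_rnn_np(step_fw, xs, h0_fw)
    out_bw_reversed, state_bw = dynamic_rnn_np(step_bw, xs[::-1], h0_bw)
    # Flip the backward outputs back into the original time order.
    return (out_fw, out_bw_reversed[::-1]), (state_fw, state_bw)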
Example #32
    def encode(self,
               inputs,
               sequence_lengths,
               masks=None,
               encoder_state_input=None):
        """
        In a generalized encode function, you pass your inputs, masks,
        and an initial hidden state into this function.

        :param inputs: Symbolic representations of your input
        :param sequence_lengths: a tuple (passage_lengths, question_lengths) holding
            the sequence length of each passage and question in the batch
        :param masks: this is to make sure tf.nn.dynamic_rnn doesn't iterate
                      through masked steps
        :param encoder_state_input: (Optional) pass this as initial hidden state
                                    to tf.nn.dynamic_rnn to build conditional representations
        :return: an encoded representation of your input.
                 It can be context-level representation, word-level representation,
                 or both.
        """

        # Inputs is tuple
        # Inputs = (passages_batch, questions_batch)
        passages, questions = inputs
        # Sequence lengths is tuple
        # Sequence_lengths = (passage_lengths, question_lengths)
        passage_sequence_lengths, question_sequence_lengths = sequence_lengths

        # We assume passages_batch is (None, max_length_passage, embedding dim) and represents the word embedding of the passages
        # We assume questions_batch is (None, max_length_question, embedding dim) and represents the word embedding of the questions
        # Each index in the second dimension represents the word at that index
        # TODO: add mask if we want to use the final state. step-by step is probably chill
        # See: https://piazza.com/class/iw9g8b9yxp46s8?cid=2153

        # Our model is the following:
        #   run a bidirectional LSTM over the passage. Concatenate forward and backward vectors at each word/time-step
        #   run a bidirectional LSTM over the question. Concatenate forward and backward vectors for the last word
        #   for each time-step in passage, concatenate state vector with the vector above

        # Generate bi-lstm for passage
        with vs.variable_scope("Passage-Bi-LSTM"):
            # First pass, we just want to run a bidirectional LSTM over each passage in the batch
            # Create forward direction cell
            with vs.variable_scope('forward'):
                p_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)
            # Create backward cell
            with vs.variable_scope('backward'):
                p_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)

            # Create bidirectional LSTM

            p_outputs, p_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=p_lstm_fw_cell, cell_bw = p_lstm_bw_cell, \
                inputs = passages,  dtype=tf.float64, scope="Passage-Bi-LSTM", sequence_length=passage_sequence_lengths)
            # Concatenate the output_fw and output_bw at each time-step for each input in batch
            # Outputs[0] corresponds to the forward output state at each time step
            # Outputs[1] corresponds to the backward output state at each time step
            p_concat_outputs = tf.concat(2, [p_outputs[0], p_outputs[1]])

        # Generate bi-lstm for question
        with vs.variable_scope("question-Bi-LSTM"):
            # First pass, we just want to run a bidirectional LSTM over each question in the batch
            # Create forward direction cell
            with vs.variable_scope('forward'):
                q_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)
            # Create backward cell
            with vs.variable_scope('backward'):
                q_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)
            # Create bidirectional LSTM
            q_outputs, q_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=q_lstm_fw_cell, cell_bw = q_lstm_bw_cell, \
                inputs = questions,  dtype=tf.float64, scope="question-Bi-LSTM", sequence_length=question_sequence_lengths)

            # Only concat the forward state for the last time step and backward state for first time step
            # Outputs[0] corresponds to the forward output state at each time step
            # Outputs[1] corresponds to the backward output state at each time step
            final_word_question = tf.concat(
                1, [q_outputs[0][:, -1, :], q_outputs[1][:, 0, :]])

        # For each word/time-step, we now concatenate with the bi-lstm representation of the last word in the associated question
        # TODO: double check this is what we want to do
        # First, we need to expand the dimension of final_word_question i.e. add a dimension in the middle for each time step
        final_word_question = tf.expand_dims(final_word_question, 1)
        # Now we tile the middle dimension once for each word in the passage
        passage_length = passages.get_shape()[1]

        max_passage_len = tf.shape(passages)[1]
        final_word_question = tf.tile(final_word_question,
                                      multiples=[1, max_passage_len, 1])

        # Now we concatenate. We want each vector for each word/time-step to get the same vector concatenated
        final_concat = tf.concat(2, [p_concat_outputs, final_word_question])

        # We return the concatenated bidirectional LSTM output for each word in the passage i.e. each time step
        # Should return (batch_size, max_length_passage, 4*hidden_size) (assuming all hidden sizes same)
        return final_concat
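The expand_dims/tile/concat sequence at the end of encode attaches the same question summary to every passage position. In NumPy the equivalent is a broadcast followed by a concat:

import numpy as np

def attach_question_summary(passage_states, question_summary):
    # passage_states: (batch, passage_len, 2 * hidden)
    # question_summary: (batch, 2 * hidden)
    batch, passage_len, _ = passage_states.shape
    tiled = np.broadcast_to(question_summary[:, None, :],
                            (batch, passage_len, question_summary.shape[-1]))
    return np.concatenate([passage_states, tiled], axis=2)  # (batch, passage_len, 4 * hidden)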
Example #33
    def decode(self, knowledge_rep, sequence_lengths):
        """
        takes in a knowledge representation
        and output a probability estimation over
        all paragraph tokens on which token should be
        the start of the answer span, and which should be
        the end of the answer span.

        :param knowledge_rep: a Tensor of size (batch_size, max_length_passage, knowledge_size)
        :param sequence_lengths: a tuple (passage_lengths, question_lengths) for each example in the batch
        :return:
        """

        # Basic Prediction Layer
        # override output_size for now, since this is only the softmax layer
        # We assume knowledge_rep is (batch_size, max_length_passage, XXx)
        # We convert to (batch_size, max_length_passage, 2) where we output probabilities for being in the answer or not
        self.output_size = 2
        outputs = []

        passage_sequence_lengths, question_sequence_lengths = sequence_lengths

        # Run Knowledge rep through bi-directional LSTM
        with tf.variable_scope("Decode-Bi-LSTM"):
            batch_size, max_length, knowledge_size = knowledge_rep.get_shape(
            ).as_list()

            # Create forward cell
            with vs.variable_scope('forward'):
                d_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(knowledge_size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)
            # Create backward cell
            with vs.variable_scope('backward'):
                d_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(knowledge_size,
                                                         forget_bias=1.0,
                                                         state_is_tuple=True)

            # Create bi-directional LSTM
            d_outputs, d_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=d_lstm_fw_cell, cell_bw = d_lstm_bw_cell, \
                inputs=knowledge_rep, dtype=tf.float64, scope="Decode-Bi-LSTM", sequence_length=passage_sequence_lengths)

            d_outputs_concat = tf.concat(2, [d_outputs[0], d_outputs[1]])

        # compute predictions as y' = softmax(xU + b)
        with tf.variable_scope("Decode-Prediction"):

            # Create weight matrix
            U = tf.get_variable(
                "U", [knowledge_size * 2, self.output_size],
                dtype=tf.float64,
                initializer=tf.contrib.layers.xavier_initializer())

            # Create bias vector
            b = tf.get_variable("b", [1, self.output_size],
                                dtype=tf.float64,
                                initializer=tf.constant_initializer(0.0))

        max_len_passage = tf.shape(knowledge_rep)[1]

        # Since max_len_passage is dynamically computed, we cannot iterate over
        # every time step in the tensor d_outputs_concat in order to compute the prediction.
        # Instead, we reshape the tensor into a 2D matrix so that we can compute
        # predictions for all time steps with one matrix multiplication.
        # NOTE: Assuming tf.reshape unrolls dimensions in the exact opposite order
        # it rolls dimensions (initial tests on numpy seem to indicate this)
        d_outputs_reshaped = tf.reshape(d_outputs_concat,
                                        [-1, knowledge_size * 2])

        outputs = tf.matmul(d_outputs_reshaped, U) + b

        # Our outputs are of size (batch_size*max_len_passage, output_size), we needed
        # to reshape them back so they are grouped by timestep
        outputs = tf.reshape(outputs, [-1, max_len_passage, self.output_size])
        return outputs
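The reshape reasoning in the comments above can be checked directly: flattening (batch, time, features) to (batch * time, features), projecting once, and reshaping back keeps rows grouped by example and ordered by time step. A small NumPy check:

import numpy as np

batch, time_steps, feat, out = 2, 3, 4, 2
states = np.random.rand(batch, time_steps, feat)
u = np.random.rand(feat, out)
b = np.random.rand(1, out)

flat = states.reshape(-1, feat)                            # (batch * time_steps, feat)
logits = (flat @ u + b).reshape(batch, time_steps, out)
# Identical to applying the projection separately at each time step.
assert np.allclose(logits[1, 2], states[1, 2] @ u + b)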
Example #34
    def _line_sep(self,
                  args,
                  output_size,
                  bias,
                  bias_initializer=None,
                  kernel_initializer=None):
        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")
        if not nest.is_sequence(args):
            args = [args]

        # Calculate the total size of arguments on dimension 1.
        total_arg_size = 0
        shapes = [a.get_shape() for a in args]
        for shape in shapes:
            if shape.ndims != 2:
                raise ValueError("linear is expecting 2D arguments: %s" %
                                 shapes)
            if shape[1].value is None:
                raise ValueError("linear expects shape[1] to \
	                            be provided for shape %s, "
                                 "but saw %s" % (shape, shape[1]))
            else:
                total_arg_size += shape[1].value

        dtype = [a.dtype for a in args][0]

        # Now the computation.
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope) as outer_scope:

            [x, h] = args

            x_size = x.get_shape().as_list()[1]
            W_xh = tf.get_variable('W_xh', [x_size, output_size],
                                   initializer=kernel_initializer)
            W_hh = tf.get_variable('W_hh', [int(output_size / 4), output_size],
                                   initializer=kernel_initializer)

            #x = tf.Print(x,[tf.reduce_mean(x)], str(scope)+'x: ')
            #h = tf.Print(h,[tf.reduce_mean(h)], str(scope)+'h: ')

            #W_xh = tf.Print(W_xh,[tf.reduce_mean(W_xh)], str(scope)+'W_xh: ')
            #W_hh = tf.Print(W_hh,[tf.reduce_mean(W_hh)], str(scope)+'W_hh: ')

            cn_xh = self.cosine_norm(x, W_xh, 'cn_xh')  # one hot vector
            cn_hh = self.cosine_norm(h, W_hh, 'cn_hh')

            #cn_xh = tf.Print(cn_xh,[tf.reduce_mean(cn_xh)], str(scope)+'cn_xh: ')
            #cn_hh = tf.Print(cn_hh,[tf.reduce_mean(cn_hh)], str(scope)+'cn_hh: ')

            res = cn_xh + cn_hh

            if not bias:
                return res
            with vs.variable_scope(outer_scope) as inner_scope:
                inner_scope.set_partitioner(None)
                if bias_initializer is None:
                    bias_initializer = init_ops.constant_initializer(
                        0.0, dtype=dtype)
                biases = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size],
                                         dtype=dtype,
                                         initializer=bias_initializer)
            return nn_ops.bias_add(res, biases)
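`self.cosine_norm` is referenced above but not defined in this excerpt. A minimal sketch of what such a helper typically computes (an assumption following the cosine-normalization idea of replacing the raw dot product with cosine similarity; the real implementation may differ):

```python
import tensorflow as tf

def cosine_norm(x, w, name, epsilon=1e-6):
    """Cosine similarity between each row of `x` and each column of `w`."""
    with tf.variable_scope(name):
        dot = tf.matmul(x, w)                                             # (batch, out)
        x_norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=1, keep_dims=True) + epsilon)
        w_norm = tf.sqrt(tf.reduce_sum(tf.square(w), axis=0, keep_dims=True) + epsilon)
        return dot / (x_norm * w_norm)
```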
Example #35
0
    def testAddVariable(self):
        obj = NonLayerTrackable()
        with self.assertRaisesRegex(ValueError, "do not specify shape"):
            trackable_utils.add_variable(obj,
                                         name="shape_specified_twice",
                                         shape=[],
                                         initializer=1)
        constant_initializer = trackable_utils.add_variable(
            obj, name="constant_initializer", initializer=1)
        with variable_scope.variable_scope("some_variable_scope"):
            ones_initializer = trackable_utils.add_variable(
                obj,
                name="ones_initializer",
                shape=[2],
                initializer=init_ops.ones_initializer(dtype=dtypes.float32))
        bare_initializer = trackable_utils.add_variable(
            obj,
            name="bare_initializer",
            shape=[2, 2],
            dtype=dtypes.float64,
            initializer=init_ops.zeros_initializer)

        # Even in graph mode, there are no naming conflicts between objects, only
        # naming conflicts within an object.
        other_duplicate = resource_variable_ops.ResourceVariable(
            name="duplicate", initial_value=1.)
        duplicate = trackable_utils.add_variable(obj,
                                                 name="duplicate",
                                                 shape=[])
        with self.assertRaisesRegex(ValueError,
                                    "'duplicate'.*already declared"):
            trackable_utils.add_variable(obj, name="duplicate", shape=[])

        self.evaluate(trackable_utils.gather_initializers(obj))
        self.assertEqual("constant_initializer:0", constant_initializer.name)
        self.assertEqual(1, self.evaluate(constant_initializer))
        self.assertEqual("some_variable_scope/ones_initializer:0",
                         ones_initializer.name)
        self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
        self.assertAllEqual([[0., 0.], [0., 0.]],
                            self.evaluate(bare_initializer))
        self.assertEqual("a_variable:0", obj.a_variable.name)
        self.assertEqual("duplicate:0", other_duplicate.name)
        if context.executing_eagerly():
            # When executing eagerly, there's no uniquification of variable names. The
            # checkpoint name will be the same.
            self.assertEqual("duplicate:0", duplicate.name)
        else:
            # The .name attribute may be globally influenced, but the checkpoint name
            # won't be (tested below).
            self.assertEqual("duplicate_1:0", duplicate.name)
        named_variables, _, _ = (
            graph_view.ObjectGraphView(obj).serialize_object_graph())
        expected_checkpoint_names = (
            "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
            "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
            "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
            "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
            "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        )
        six.assertCountEqual(self, expected_checkpoint_names,
                             [v.name for v in named_variables])
Example #36
0
def raw_rnn(cell,
            loop_fn,
            parallel_iterations=None,
            swap_memory=False,
            scope=None):
    """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.

  **NOTE: This method is still in testing, and the API may change.**

  This function is a more primitive version of `dynamic_rnn` that provides
  more direct access to the inputs each iteration.  It also provides more
  control over when to start and finish reading the sequence, and
  what to emit for the output.

  For example, it can be used to implement the dynamic decoder of a seq2seq
  model.

  Instead of working with `Tensor` objects, most operations work with
  `TensorArray` objects directly.

  The operation of `raw_rnn`, in pseudo-code, is basically the following:

  ```python
  time = tf.constant(0, dtype=tf.int32)
  (finished, next_input, initial_state, _, loop_state) = loop_fn(
      time=time, cell_output=None, cell_state=None, loop_state=None)
  emit_ta = TensorArray(dynamic_size=True, dtype=initial_state.dtype)
  state = initial_state
  while not all(finished):
    (output, cell_state) = cell(next_input, state)
    (next_finished, next_input, next_state, emit, loop_state) = loop_fn(
        time=time + 1, cell_output=output, cell_state=cell_state,
        loop_state=loop_state)
    # Emit zeros and copy forward state for minibatch entries that are finished.
    state = tf.where(finished, state, next_state)
    emit = tf.where(finished, tf.zeros_like(emit), emit)
    emit_ta = emit_ta.write(time, emit)
    # If any new minibatch entries are marked as finished, mark these.
    finished = tf.logical_or(finished, next_finished)
    time += 1
  return (emit_ta, state, loop_state)
  ```

  with the additional properties that output and state may be (possibly nested)
  tuples, as determined by `cell.output_size` and `cell.state_size`, and
  as a result the final `state` and `emit_ta` may themselves be tuples.

  A simple implementation of `dynamic_rnn` via `raw_rnn` looks like this:

  ```python
  inputs = tf.placeholder(shape=(max_time, batch_size, input_depth),
                          dtype=tf.float32)
  sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32)
  inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
  inputs_ta = inputs_ta.unstack(inputs)

  cell = tf.contrib.rnn.LSTMCell(num_units)

  def loop_fn(time, cell_output, cell_state, loop_state):
    emit_output = cell_output  # == None for time == 0
    if cell_output is None:  # time == 0
      next_cell_state = cell.zero_state(batch_size, tf.float32)
    else:
      next_cell_state = cell_state
    elements_finished = (time >= sequence_length)
    finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(
        finished,
        lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
        lambda: inputs_ta.read(time))
    next_loop_state = None
    return (elements_finished, next_input, next_cell_state,
            emit_output, next_loop_state)

  outputs_ta, final_state, _ = raw_rnn(cell, loop_fn)
  outputs = outputs_ta.stack()
  ```

  Args:
    cell: An instance of RNNCell.
    loop_fn: A callable that takes inputs
      `(time, cell_output, cell_state, loop_state)`
      and returns the tuple
      `(finished, next_input, next_cell_state, emit_output, next_loop_state)`.
      Here `time` is an int32 scalar `Tensor`, `cell_output` is a
      `Tensor` or (possibly nested) tuple of tensors as determined by
      `cell.output_size`, and `cell_state` is a `Tensor`
      or (possibly nested) tuple of tensors, as determined by the `loop_fn`
      on its first call (and should match `cell.state_size`).
      The outputs are: `finished`, a boolean `Tensor` of
      shape `[batch_size]`, `next_input`: the next input to feed to `cell`,
      `next_cell_state`: the next state to feed to `cell`,
      and `emit_output`: the output to store for this iteration.

      Note that `emit_output` should be a `Tensor` or (possibly nested)
      tuple of tensors with shapes and structure matching `cell.output_size`
      and `cell_output` above.  The parameter `cell_state` and output
      `next_cell_state` may be either a single or (possibly nested) tuple
      of tensors.  The parameter `loop_state` and
      output `next_loop_state` may be either a single or (possibly nested) tuple
      of `Tensor` and `TensorArray` objects.  This last parameter
      may be ignored by `loop_fn` and the return value may be `None`.  If it
      is not `None`, then the `loop_state` will be propagated through the RNN
      loop, for use purely by `loop_fn` to keep track of its own state.
      The `next_loop_state` parameter returned may be `None`.

      The first call to `loop_fn` will be `time = 0`, `cell_output = None`,
      `cell_state = None`, and `loop_state = None`.  For this call:
      The `next_cell_state` value should be the value with which to initialize
      the cell's state.  It may be a final state from a previous RNN or it
      may be the output of `cell.zero_state()`.  It should be a
      (possibly nested) tuple structure of tensors.
      If `cell.state_size` is an integer, this must be
      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
      If `cell.state_size` is a `TensorShape`, this must be a `Tensor` of
      appropriate type and shape `[batch_size] + cell.state_size`.
      If `cell.state_size` is a (possibly nested) tuple of ints or
      `TensorShape`, this will be a tuple having the corresponding shapes.
      The `emit_output` value may be  either `None` or a (possibly nested)
      tuple structure of tensors, e.g.,
      `(tf.zeros(shape_0, dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`.
      If this first `emit_output` return value is `None`,
      then the `emit_ta` result of `raw_rnn` will have the same structure and
      dtypes as `cell.output_size`.  Otherwise `emit_ta` will have the same
      structure, shapes (prepended with a `batch_size` dimension), and dtypes
      as `emit_output`.  The actual values returned for `emit_output` at this
      initializing call are ignored.  Note, this emit structure must be
      consistent across all time steps.

    parallel_iterations: (Default: 32).  The number of iterations to run in
      parallel.  Those operations which do not have any temporal dependency
      and can be run in parallel, will be.  This parameter trades off
      time for space.  Values >> 1 use more memory but take less time,
      while smaller values use less memory but computations take longer.
    swap_memory: Transparently swap the tensors produced in forward inference
      but needed for back prop from GPU to CPU.  This allows training RNNs
      which would typically not fit on a single GPU, with very minimal (or no)
      performance penalty.
    scope: VariableScope for the created subgraph; defaults to "rnn".

  Returns:
    A tuple `(emit_ta, final_state, final_loop_state)` where:

    `emit_ta`: The RNN output `TensorArray`.
       If `loop_fn` returns a (possibly nested) set of Tensors for
       `emit_output` during initialization, (inputs `time = 0`,
       `cell_output = None`, and `loop_state = None`), then `emit_ta` will
       have the same structure, dtypes, and shapes as `emit_output` instead.
       If `loop_fn` returns `emit_output = None` during this call,
       the structure of `cell.output_size` is used:
       If `cell.output_size` is a (possibly nested) tuple of integers
       or `TensorShape` objects, then `emit_ta` will be a tuple having the
       same structure as `cell.output_size`, containing TensorArrays whose
       elements' shapes correspond to the shape data in `cell.output_size`.

    `final_state`: The final cell state.  If `cell.state_size` is an int, this
      will be shaped `[batch_size, cell.state_size]`.  If it is a
      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
      be a tuple having the corresponding shapes.

    `final_loop_state`: The final loop state as returned by `loop_fn`.

  Raises:
    TypeError: If `cell` is not an instance of RNNCell, or `loop_fn` is not
      a `callable`.
  """

    # pylint: disable=protected-access
    if not isinstance(cell, rnn_cell_impl._RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    # pylint: enable=protected-access
    if not callable(loop_fn):
        raise TypeError("loop_fn must be a callable")

    parallel_iterations = parallel_iterations or 32

    # Create a new scope in which the caching device is either
    # determined by the parent scope, or is set to place the cached
    # Variable using the same placement as for the rest of the RNN.
    with vs.variable_scope(scope or "rnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        time = constant_op.constant(0, dtype=dtypes.int32)
        (elements_finished, next_input,
         initial_state, emit_structure, init_loop_state) = loop_fn(
             time, None, None,
             None)  # time, cell_output, cell_state, loop_state
        flat_input = nest.flatten(next_input)

        # Need a surrogate loop state for the while_loop if none is available.
        loop_state = (init_loop_state if init_loop_state is not None else
                      constant_op.constant(0, dtype=dtypes.int32))

        input_shape = [input_.get_shape() for input_ in flat_input]
        static_batch_size = input_shape[0][0]

        for input_shape_i in input_shape:
            # Static verification that batch sizes all match
            static_batch_size.merge_with(input_shape_i[0])

        batch_size = static_batch_size.value
        if batch_size is None:
            batch_size = array_ops.shape(flat_input[0])[0]

        nest.assert_same_structure(initial_state, cell.state_size)
        state = initial_state
        flat_state = nest.flatten(state)
        flat_state = [ops.convert_to_tensor(s) for s in flat_state]
        state = nest.pack_sequence_as(structure=state,
                                      flat_sequence=flat_state)

        if emit_structure is not None:
            flat_emit_structure = nest.flatten(emit_structure)
            flat_emit_size = [emit.get_shape() for emit in flat_emit_structure]
            flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure]
        else:
            emit_structure = cell.output_size
            flat_emit_size = nest.flatten(emit_structure)
            flat_emit_dtypes = [flat_state[0].dtype] * len(flat_emit_size)

        flat_emit_ta = [
            tensor_array_ops.TensorArray(dtype=dtype_i,
                                         dynamic_size=True,
                                         size=0,
                                         name="rnn_output_%d" % i)
            for i, dtype_i in enumerate(flat_emit_dtypes)
        ]
        emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                        flat_sequence=flat_emit_ta)
        flat_zero_emit = [
            array_ops.zeros(
                _state_size_with_prefix(size_i, prefix=[batch_size]), dtype_i)
            for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)
        ]
        zero_emit = nest.pack_sequence_as(structure=emit_structure,
                                          flat_sequence=flat_zero_emit)

        def condition(unused_time, elements_finished, *_):
            return math_ops.logical_not(math_ops.reduce_all(elements_finished))

        def body(time, elements_finished, current_input, emit_ta, state,
                 loop_state):
            """Internal while loop body for raw_rnn.

      Args:
        time: time scalar.
        elements_finished: batch-size vector.
        current_input: possibly nested tuple of input tensors.
        emit_ta: possibly nested tuple of output TensorArrays.
        state: possibly nested tuple of state tensors.
        loop_state: possibly nested tuple of loop state tensors.

      Returns:
        Tuple having the same size as Args but with updated values.
      """
            (next_output, cell_state) = cell(current_input, state)

            nest.assert_same_structure(state, cell_state)
            nest.assert_same_structure(cell.output_size, next_output)

            next_time = time + 1
            (next_finished, next_input, next_state, emit_output,
             next_loop_state) = loop_fn(next_time, next_output, cell_state,
                                        loop_state)

            nest.assert_same_structure(state, next_state)
            nest.assert_same_structure(current_input, next_input)
            nest.assert_same_structure(emit_ta, emit_output)

            # If loop_fn returns None for next_loop_state, just reuse the
            # previous one.
            loop_state = loop_state if next_loop_state is None else next_loop_state

            def _copy_some_through(current, candidate):
                """Copy some tensors through via array_ops.where."""
                current_flat = nest.flatten(current)
                candidate_flat = nest.flatten(candidate)
                # pylint: disable=g-long-lambda,cell-var-from-loop
                result_flat = [
                    _on_device(lambda: array_ops.where(elements_finished,
                                                       current_i, candidate_i),
                               device=candidate_i.op.device)
                    for (current_i,
                         candidate_i) in zip(current_flat, candidate_flat)
                ]
                # pylint: enable=g-long-lambda,cell-var-from-loop
                return nest.pack_sequence_as(structure=current,
                                             flat_sequence=result_flat)

            emit_output = _copy_some_through(zero_emit, emit_output)
            next_state = _copy_some_through(state, next_state)

            emit_output_flat = nest.flatten(emit_output)
            emit_ta_flat = nest.flatten(emit_ta)

            elements_finished = math_ops.logical_or(elements_finished,
                                                    next_finished)

            emit_ta_flat = [
                ta.write(time, emit)
                for (ta, emit) in zip(emit_ta_flat, emit_output_flat)
            ]

            emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                            flat_sequence=emit_ta_flat)

            return (next_time, elements_finished, next_input, emit_ta,
                    next_state, loop_state)

        returned = control_flow_ops.while_loop(
            condition,
            body,
            loop_vars=[
                time, elements_finished, next_input, emit_ta, state, loop_state
            ],
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory)

        (emit_ta, final_state, final_loop_state) = returned[-3:]

        if init_loop_state is None:
            final_loop_state = None

        return (emit_ta, final_state, final_loop_state)
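The `_copy_some_through` helper above implements the copy-through pattern for finished minibatch entries: keep the previously emitted value or state where `elements_finished` is True, and take this step's candidate otherwise. A small numpy analogue (illustrative only):

```python
import numpy as np

finished = np.array([True, False, True])               # per-example "done" flags
current = np.array([[1., 1.], [2., 2.], [3., 3.]])     # values carried forward
candidate = np.array([[9., 9.], [8., 8.], [7., 7.]])   # values from this step

# array_ops.where with a rank-1 condition selects whole rows; broadcasting the
# flags over the feature axis reproduces that behaviour in numpy.
result = np.where(finished[:, None], current, candidate)
# rows 0 and 2 keep their old values, row 1 takes the new candidate
```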
Example #37
0
def dynamic_rnn(cell,
                inputs,
                sequence_length=None,
                initial_state=None,
                dtype=None,
                parallel_iterations=None,
                swap_memory=False,
                time_major=False,
                scope=None):
    """Creates a recurrent neural network specified by RNNCell `cell`.

  This function is functionally identical to the function `rnn` above, but
  performs fully dynamic unrolling of `inputs`.

  Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for
  each frame.  Instead, `inputs` may be a single `Tensor` where
  the maximum time is either the first or second dimension (see the parameter
  `time_major`).  Alternatively, it may be a (possibly nested) tuple of
  Tensors, each of them having matching batch and time dimensions.
  The corresponding output is either a single `Tensor` having the same number
  of time steps and batch size, or a (possibly nested) tuple of such tensors,
  matching the nested structure of `cell.output_size`.

  The parameter `sequence_length` is optional and is used to copy-through state
  and zero-out outputs when past a batch element's sequence length. So it's more
  for correctness than performance, unlike in rnn().

  Args:
    cell: An instance of RNNCell.
    inputs: The RNN inputs.

      If `time_major == False` (default), this must be a `Tensor` of shape:
        `[batch_size, max_time, ...]`, or a nested tuple of such
        elements.

      If `time_major == True`, this must be a `Tensor` of shape:
        `[max_time, batch_size, ...]`, or a nested tuple of such
        elements.

      This may also be a (possibly nested) tuple of Tensors satisfying
      this property.  The first two dimensions must match across all the inputs,
      but otherwise the ranks and other shape components may differ.
      In this case, input to `cell` at each time-step will replicate the
      structure of these tuples, except for the time dimension (from which the
      time is taken).

      The input to `cell` at each time step will be a `Tensor` or (possibly
      nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
    initial_state: (optional) An initial state for the RNN.
      If `cell.state_size` is an integer, this must be
      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
      If `cell.state_size` is a tuple, this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    dtype: (optional) The data type for the initial state and expected output.
      Required if initial_state is not provided or RNN state has a heterogeneous
      dtype.
    parallel_iterations: (Default: 32).  The number of iterations to run in
      parallel.  Those operations which do not have any temporal dependency
      and can be run in parallel, will be.  This parameter trades off
      time for space.  Values >> 1 use more memory but take less time,
      while smaller values use less memory but computations take longer.
    swap_memory: Transparently swap the tensors produced in forward inference
      but needed for back prop from GPU to CPU.  This allows training RNNs
      which would typically not fit on a single GPU, with very minimal (or no)
      performance penalty.
    time_major: The shape format of the `inputs` and `outputs` Tensors.
      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
      Using `time_major = True` is a bit more efficient because it avoids
      transposes at the beginning and end of the RNN calculation.  However,
      most TensorFlow data is batch-major, so by default this function
      accepts input and emits output in batch-major form.
    scope: VariableScope for the created subgraph; defaults to "rnn".

  Returns:
    A pair (outputs, state) where:

      outputs: The RNN output `Tensor`.

        If time_major == False (default), this will be a `Tensor` shaped:
          `[batch_size, max_time, cell.output_size]`.

        If time_major == True, this will be a `Tensor` shaped:
          `[max_time, batch_size, cell.output_size]`.

        Note, if `cell.output_size` is a (possibly nested) tuple of integers
        or `TensorShape` objects, then `outputs` will be a tuple having the
        same structure as `cell.output_size`, containing Tensors having shapes
        corresponding to the shape data in `cell.output_size`.

      state: The final state.  If `cell.state_size` is an int, this
        will be shaped `[batch_size, cell.state_size]`.  If it is a
        `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
        If it is a (possibly nested) tuple of ints or `TensorShape`, this will
        be a tuple having the corresponding shapes.

  Raises:
    TypeError: If `cell` is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list.
  """

    # pylint: disable=protected-access
    if not isinstance(cell, rnn_cell_impl._RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    # pylint: enable=protected-access

    # By default, time_major==False and inputs are batch-major: shaped
    #   [batch, time, depth]
    # For internal calculations, we transpose to [time, batch, depth]
    flat_input = nest.flatten(inputs)

    if not time_major:
        # (B,T,D) => (T,B,D)
        flat_input = tuple(
            array_ops.transpose(input_, [1, 0, 2]) for input_ in flat_input)

    parallel_iterations = parallel_iterations or 32
    if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
        if sequence_length.get_shape().ndims not in (None, 1):
            raise ValueError(
                "sequence_length must be a vector of length batch_size, "
                "but saw shape: %s" % sequence_length.get_shape())
        sequence_length = array_ops.identity(  # Just to find it in the graph.
            sequence_length,
            name="sequence_length")

    # Create a new scope in which the caching device is either
    # determined by the parent scope, or is set to place the cached
    # Variable using the same placement as for the rest of the RNN.
    with vs.variable_scope(scope or "rnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)
        input_shape = tuple(array_ops.shape(input_) for input_ in flat_input)
        batch_size = input_shape[0][1]

        for input_ in input_shape:
            if input_[1].get_shape() != batch_size.get_shape():
                raise ValueError("All inputs should have the same batch size")

        if initial_state is not None:
            state = initial_state
        else:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)

        def _assert_has_shape(x, shape):
            x_shape = array_ops.shape(x)
            packed_shape = array_ops.stack(shape)
            return control_flow_ops.Assert(
                math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
                    "Expected shape for Tensor %s is " % x.name, packed_shape,
                    " but saw shape: ", x_shape
                ])

        if sequence_length is not None:
            # Perform some shape validation
            with ops.control_dependencies(
                [_assert_has_shape(sequence_length, [batch_size])]):
                sequence_length = array_ops.identity(sequence_length,
                                                     name="CheckSeqLen")

        inputs = nest.pack_sequence_as(structure=inputs,
                                       flat_sequence=flat_input)

        (outputs, final_state) = _dynamic_rnn_loop(
            cell,
            inputs,
            state,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            sequence_length=sequence_length,
            dtype=dtype)

        # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
        # If we are performing batch-major calculations, transpose output back
        # to shape [batch, time, depth]
        if not time_major:
            # (T,B,D) => (B,T,D)
            flat_output = nest.flatten(outputs)
            flat_output = [
                array_ops.transpose(output, [1, 0, 2])
                for output in flat_output
            ]
            outputs = nest.pack_sequence_as(structure=outputs,
                                            flat_sequence=flat_output)

        return (outputs, final_state)
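A minimal, hedged usage sketch of the batch-major path documented above, written against the stock `tf.nn.dynamic_rnn`, which shares this interface (placeholder names are illustrative): per-example `sequence_length` zeroes outputs past each sequence's end and copies the final state through.

```python
import tensorflow as tf

batch_size, max_time, depth, num_units = 4, 10, 8, 16
inputs = tf.placeholder(tf.float32, [batch_size, max_time, depth])
seq_len = tf.placeholder(tf.int32, [batch_size])

cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
outputs, final_state = tf.nn.dynamic_rnn(
    cell, inputs, sequence_length=seq_len, dtype=tf.float32)
# outputs: [batch_size, max_time, num_units]; final_state: an LSTMStateTuple.
```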
Example #38
0
    def decode(self, knowledge_rep, masks, initial_state=(None, None)):
        """
        takes in a knowledge representation
        and output a probability estimation over
        all paragraph tokens on which token should be
        the start of the answer span, and which should be
        the end of the answer span.

        :param knowledge_rep: it is a representation of the paragraph and question,
                              decided by how you choose to implement the encoder
        :return:
        """

        with vs.variable_scope("decoder"):
            #initial_state=(None,None)
            with vs.variable_scope("answer_start"):
                cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size)
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=self.dropout)

                start_states, start_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell,
                    cell,
                    knowledge_rep,
                    sequence_length=masks,
                    initial_state_fw=initial_state[0],
                    initial_state_bw=initial_state[1],
                    dtype=tf.float32)
                start_states = start_states[0] + start_states[1]

                #start_states, start_final_state = tf.nn.dynamic_rnn(cell, knowledge_rep, sequence_length=masks, dtype=tf.float32)
                start_states_reshaped = tf.reshape(start_states,
                                                   [-1, self.hidden_size])
                start_probs = tf.nn.rnn_cell._linear(start_states_reshaped,
                                                     output_size=1,
                                                     bias=True)
                start_probs = tf.reshape(start_probs, [-1, self.output_size])

            with vs.variable_scope("answer_end"):
                cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size)
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=self.dropout)
                end_states, end_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell,
                    cell,
                    knowledge_rep,
                    sequence_length=masks,
                    initial_state_fw=start_final_state[0],
                    initial_state_bw=start_final_state[1],
                    dtype=tf.float32)

                end_states = end_states[0] + end_states[1]
                #end_states, end_final_state = tf.nn.dynamic_rnn(cell, knowledge_rep,initial_state=start_final_state, sequence_length=masks, dtype=tf.float32)
                end_states_reshaped = tf.reshape(end_states,
                                                 [-1, self.hidden_size])
                end_probs = tf.nn.rnn_cell._linear(end_states_reshaped,
                                                   output_size=1,
                                                   bias=True)
                end_probs = tf.reshape(end_probs, [-1, self.output_size])

            # Masking
            bool_masks = tf.cast(
                tf.sequence_mask(masks, maxlen=self.output_size), tf.float32)
            add_mask = (-1e30 * (1.0 - bool_masks))
            #add_mask = tf.log(bool_masks)
            start_probs = tf.add(start_probs, add_mask)
            end_probs = tf.add(end_probs, add_mask)

        return start_probs, end_probs
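The masking step above adds a large negative constant to positions beyond each passage's length so that a subsequent softmax assigns them effectively zero probability. A small numpy sketch of that trick (names are illustrative):

```python
import numpy as np

def masked_softmax(logits, lengths, max_len):
    # 1.0 for valid positions, 0.0 for padding.
    mask = (np.arange(max_len)[None, :] < np.asarray(lengths)[:, None]).astype(np.float64)
    masked = logits + (-1e30) * (1.0 - mask)
    e = np.exp(masked - masked.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

probs = masked_softmax(np.random.randn(2, 5), lengths=[3, 5], max_len=5)
# probs[0, 3:] is ~0 because those positions were masked out.
```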
Example #39
0
def _deepfm_model_fn(features,
                     labels,
                     mode,
                     head,
                     fm_first_feature_columns=None,
                     fm_second_feature_columns=None,
                     embedding_size=None,
                     field_size=None,
                     linear_optimizer='Ftrl',
                     dnn_feature_columns=None,
                     dnn_optimizer='Adagrad',
                     dnn_hidden_units=None,
                     dnn_activation_fn=nn.relu,
                     dnn_dropout=None,
                     input_layer_partitioner=None,
                     config=None):
    """DNN and FM combined model_fn.

  Args:
    features: dict of `Tensor`.
    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype
      `int32` or `int64` in the range `[0, n_classes)`.
    mode: Defines whether this is training, evaluation or prediction.
      See `ModeKeys`.
    head: A `Head` instance.
    fm_first_feature_columns: An iterable containing order-1 feature columns used
      by the fm model.
    fm_second_feature_columns: An iterable containing order-2 feature columns used
      by the fm model.
    embedding_size: Input field vectors can have different sizes, but their
      embeddings all share this common size.
    field_size: The number of order-2 feature columns.
    linear_optimizer: string, `Optimizer` object, or callable that defines the
      optimizer to use for training the FM model. Defaults to the Ftrl
      optimizer.
    dnn_feature_columns: An iterable containing all the feature columns used by
      the DNN model.
    dnn_optimizer: string, `Optimizer` object, or callable that defines the
      optimizer to use for training the DNN model. Defaults to the Adagrad
      optimizer.
    dnn_hidden_units: List of hidden units per DNN layer.
    dnn_activation_fn: Activation function applied to each DNN layer. If `None`,
      will use `tf.nn.relu`.
    dnn_dropout: When not `None`, the probability we will drop out a given DNN
      coordinate.
    input_layer_partitioner: Partitioner for input layer.
    config: `RunConfig` object to configure the runtime settings.

  Returns:
    An `EstimatorSpec` instance.

  Raises:
    ValueError: If `fm_first_feature_columns`, `fm_second_feature_columns`, and
      `dnn_feature_columns` are all empty at the same time, or `input_layer_partitioner`
      is missing, or features has the wrong type.
  """
    if not isinstance(features, dict):
        raise ValueError('features should be a dictionary of `Tensor`s. '
                         'Given type: {}'.format(type(features)))
    if not fm_first_feature_columns and not dnn_feature_columns and not fm_second_feature_columns:
        raise ValueError(
            'Either fm_first_feature_columns or dnn_feature_columns or fm_second_feature_columns must be defined.'
        )

    num_ps_replicas = config.num_ps_replicas if config else 0
    input_layer_partitioner = input_layer_partitioner or (
        partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas, min_slice_size=64 << 20))

    # Build DNN Logits.
    dnn_parent_scope = 'dnn'

    if not dnn_feature_columns:
        dnn_logits = None
    else:
        dnn_optimizer = optimizers.get_optimizer_instance(
            dnn_optimizer, learning_rate=_DNN_LEARNING_RATE)
        _check_no_sync_replicas_optimizer(dnn_optimizer)
        if not dnn_hidden_units:
            raise ValueError(
                'dnn_hidden_units must be defined when dnn_feature_columns is '
                'specified.')
        dnn_partitioner = (partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas))
        with variable_scope.variable_scope(dnn_parent_scope,
                                           values=tuple(
                                               six.itervalues(features)),
                                           partitioner=dnn_partitioner):

            dnn_logit_fn = dnn._dnn_logit_fn_builder(
                units=head.logits_dimension,
                hidden_units=dnn_hidden_units,
                feature_columns=dnn_feature_columns,
                activation_fn=dnn_activation_fn,
                dropout=dnn_dropout,
                input_layer_partitioner=input_layer_partitioner)
            dnn_logits = dnn_logit_fn(features=features, mode=mode)

    # Build FM Logits.
    fm_parent_scope = 'fm'

    def cal_fm_first_logits():
        logit_fn = linear._linear_logit_fn_builder(
            units=head.logits_dimension,
            feature_columns=fm_first_feature_columns)
        fm_first_logits = logit_fn(features=features)
        _add_layer_summary(fm_first_logits, scope.name)
        return fm_first_logits

    def cal_fm_second_logits():
        embeddings = tf.feature_column.input_layer(
            features=features, feature_columns=fm_second_feature_columns)
        embeddings = tf.reshape(embeddings,
                                shape=[-1, field_size, embedding_size])
        sum_square = tf.square(tf.reduce_sum(embeddings, 1))
        square_sum = tf.reduce_sum(tf.square(embeddings), 1)
        fm_second_logits = 0.5 * tf.reduce_sum(
            tf.subtract(sum_square, square_sum), 1, keep_dims=True)
        _add_layer_summary(fm_second_logits, scope.name)
        return fm_second_logits

    if not fm_first_feature_columns and not fm_second_feature_columns:
        fm_first_logits = None
        fm_second_logits = None
    else:
        linear_optimizer = optimizers.get_optimizer_instance(
            linear_optimizer,
            learning_rate=_fm_learning_rate(
                len(fm_first_feature_columns) +
                len(fm_second_feature_columns)))
        _check_no_sync_replicas_optimizer(linear_optimizer)
        with variable_scope.variable_scope(
                fm_parent_scope,
                values=tuple(six.itervalues(features)),
                partitioner=input_layer_partitioner) as scope:
            if not fm_first_feature_columns:
                fm_first_logits = None
                fm_second_logits = cal_fm_second_logits()
            elif not fm_second_feature_columns:
                fm_second_logits = None
                fm_first_logits = cal_fm_first_logits()
            else:
                fm_first_logits = cal_fm_first_logits()
                fm_second_logits = cal_fm_second_logits()

    def add_logits(logits, to_add_logits):
        if logits is None:
            return to_add_logits
        else:
            return logits + to_add_logits if to_add_logits is not None else logits

    # Combine logits and build full model.
    logits = None
    logits = add_logits(logits, dnn_logits)
    logits = add_logits(logits, fm_second_logits)
    logits = add_logits(logits, fm_first_logits)

    def _train_op_fn(loss):
        """Returns the op to optimize the loss."""
        train_ops = []
        global_step = training_util.get_global_step()
        if dnn_logits is not None:
            train_ops.append(
                dnn_optimizer.minimize(loss,
                                       var_list=ops.get_collection(
                                           ops.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=dnn_parent_scope)))
        if fm_first_logits is not None or fm_second_logits is not None:
            train_ops.append(
                linear_optimizer.minimize(
                    loss,
                    var_list=ops.get_collection(
                        ops.GraphKeys.TRAINABLE_VARIABLES,
                        scope=fm_parent_scope)))
        train_op = control_flow_ops.group(*train_ops)
        with ops.control_dependencies([train_op]):
            return distribute_lib.increment_var(global_step)

    return head.create_estimator_spec(features=features,
                                      mode=mode,
                                      labels=labels,
                                      train_op_fn=_train_op_fn,
                                      logits=logits)
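`cal_fm_second_logits` above uses the standard FM identity: the sum of pairwise embedding interactions equals half the difference between the square of the sum and the sum of the squares, summed over the embedding dimension. A quick numpy check of that identity (shapes are illustrative):

```python
import numpy as np
from itertools import combinations

field_size, embedding_size = 6, 4
v = np.random.randn(field_size, embedding_size)  # one embedding per field

pairwise = sum(np.dot(v[i], v[j]) for i, j in combinations(range(field_size), 2))
identity = 0.5 * np.sum(np.square(v.sum(axis=0)) - np.square(v).sum(axis=0))

assert np.isclose(pairwise, identity)
```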
Example #40
0
    def _create_definition_if_needed_impl(self):
        """This is not what you want, see _create_definition_if_needed."""
        if self._definition is not None or self._c_func is not None:
            return

        # Create the func_def object.
        temp_graph = _FuncGraph(capture_by_value=self._capture_by_value)
        with temp_graph.as_default():
            # List of placeholders for the function_def.
            inputs = []
            for (argname, argtype) in self._args:
                argholder = array_ops.placeholder(argtype, name=argname)
                inputs.append(argholder)
            # Call func and gather the output tensors.
            with vs.variable_scope("", custom_getter=temp_graph.getvar):
                outputs = self._func(*inputs)

            # There is no way of distinguishing between a function not returning
            # anything and a function returning None in Python.
            # We need to allow the former and ideally want to forbid the latter as
            # it is most likely user error.
            # TODO(iga): Consider adding a @NoOutput decorator on top of @Defun to
            # allow users to explicitly mark the function as not returning anything.
            # For now, we allow a single None return and interpret it as a function
            # with no output.
            if outputs is None:
                outputs = []
            else:
                # If func only returned one value, make it a tuple.
                if not isinstance(outputs, (list, tuple)):
                    outputs = (outputs, )
                if any([_ is None for _ in outputs]):
                    raise ValueError("Function can not return None.")
            # Ensures each output is a Tensor in the function graph.
            outputs = [ops.convert_to_tensor(t) for t in outputs]
            outputs = [
                temp_graph.capture(t) if t.graph is not temp_graph else t
                for t in outputs
            ]
        self._extra_inputs = temp_graph.extra_inputs
        inputs.extend(temp_graph.extra_args)
        # pylint: disable=protected-access
        self._sub_functions = temp_graph._functions
        # pylint: enable=protected-access

        # Extra kwargs are treated as attrs on the function def.
        if self._func_name:
            base_func_name = self._func_name
        else:
            base_func_name = _get_func_name(self._func)
            if self._grad_func:
                base_func_name += ("_%s" % self._grad_func.name)
        kwargs_attr = _parse_kwargs_as_attrs(base_func_name,
                                             **self._extra_kwargs)

        if not temp_graph._c_graph:  # pylint: disable=protected-access
            # Build the FunctionDef
            self._definition = graph_to_function_def.graph_to_function_def(
                temp_graph,
                temp_graph.get_operations(),
                inputs,
                outputs,
                out_names=self._out_names)

            for k in kwargs_attr:
                self._definition.attr[k].CopyFrom(kwargs_attr[k])

            # Hash the definition and its dependencies.
            self._hash_str = self._create_hash_str(
                self._definition.signature.input_arg,
                self._definition.signature.output_arg,
                self._definition.node_def)

            # Finally, we decide the function name to use.  If not specified,
            # make up something which is almost certainly unique (but deterministic).
            if not self._func_name:
                self._func_name = "_".join([base_func_name, self._hash_str])
            self._definition.signature.name = self._func_name
            if self._func.__doc__:
                self._definition.signature.description = self._func.__doc__

            self._op_def = self._definition.signature
        else:  # C API is enabled
            output_names = ([compat.as_bytes(x) for x in self._out_names]
                            if self._out_names else [])
            description = self._func.__doc__ or None
            # pylint: disable=protected-access
            c_func = c_api.TF_GraphToFunction_wrapper(
                temp_graph._c_graph,
                base_func_name,
                self._func_name is None,  # append_hash_to_fn_name
                None,  # opers
                [t._as_tf_output() for t in inputs],
                [t._as_tf_output() for t in outputs],
                output_names,
                None,  # opts
                description)
            self._c_func = c_api_util.ScopedTFFunction(c_func)
            # pylint: enable=protected-access
            self._set_c_attrs(kwargs_attr)

            # Set cached fields: _op_def and _func_name (if not already set)
            self._op_def = self.definition.signature
            if self._func_name:
                assert self._func_name == self._op_def.name
            else:
                self._func_name = compat.as_str(self._op_def.name)
Example #41
0
def pointer_decoder(decoder_inputs,
                    initial_state,
                    attention_states,
                    ori_encoder_inputs,
                    cell,
                    feed_prev=False,
                    dtype=dtypes.float32,
                    scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "pointer_decoder".
    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element of
        decoder_inputs or, when feed_prev is True, derived from the previous
        output) as follows. First, we run the cell
        on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      states: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                        [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)

        attns.set_shape([None, attn_size])
        inps = []
        for i in range(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.stack(ori_encoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(
                    inp *
                    tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, ordered internally

            # Merge input and previous attentions into one vector of the right size.
            x = core_rnn_cell_impl._linear([inp, attns], cell.output_size,
                                           True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
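The attention in the docstring above is the additive form score_j = v^T tanh(W h_j + U q), followed by a softmax over input positions. A minimal numpy sketch for a single example (names are illustrative):

```python
import numpy as np

attn_length, attn_size, vec_size = 7, 16, 16
h = np.random.randn(attn_length, attn_size)   # encoder states (attention_states)
q = np.random.randn(attn_size)                # decoder query (new_state)
W = np.random.randn(attn_size, vec_size)
U = np.random.randn(attn_size, vec_size)
v = np.random.randn(vec_size)

scores = np.tanh(h @ W + q @ U) @ v           # (attn_length,)
attn = np.exp(scores - scores.max())
attn /= attn.sum()                            # pointer distribution over inputs
```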
def l2_normalization(inputs,
                     scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     data_format='NHWC',
                     trainable=True,
                     scope=None):
    """Implement L2 normalization on every feature (i.e. spatial normalization).
    Should be extended in some near future to other dimensions, providing a more
    flexible normalization framework.
    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
      scaling: whether or not to add a post scaling operation along the dimensions
        which have been normalized.
      scale_initializer: An initializer for the weights.
      reuse: whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the variables or
        a dictionary containing a different list of collection per variable.
      outputs_collections: collection to add the outputs.
      data_format:  NHWC or NCHW data format.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.
    Returns:
      A `Tensor` representing the output of the operation.
    """

    with variable_scope.variable_scope(scope,
                                       'L2Normalization', [inputs],
                                       reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            # norm_dim = tf.range(1, inputs_rank-1)
            norm_dim = tf.range(inputs_rank - 1, inputs_rank)
            params_shape = inputs_shape[-1:]
        elif data_format == 'NCHW':
            # norm_dim = tf.range(2, inputs_rank)
            norm_dim = tf.range(1, 2)
            params_shape = (inputs_shape[1])

        # Normalize along spatial dimensions.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            if data_format == 'NHWC':
                outputs = tf.multiply(outputs, scale)
            elif data_format == 'NCHW':
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
                outputs = tf.multiply(outputs, scale)
                # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1))

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
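A small numpy sketch of the NHWC case above (an approximation of `nn.l2_normalize` plus the optional learned scale): each channel vector is divided by its L2 norm and then rescaled per channel.

```python
import numpy as np

x = np.random.randn(2, 4, 4, 8)                                      # NHWC input
norm = np.sqrt(np.sum(np.square(x), axis=-1, keepdims=True) + 1e-12)
y = x / norm                                                         # unit-norm channel vectors
gamma = np.ones(8)                                                   # learned per-channel scale
y_scaled = y * gamma
```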
Example #43
0
def streaming_tp_fp_arrays(num_gbboxes, tp, fp, scores,
                           remove_zero_scores=True,
                           metrics_collections=None,
                           updates_collections=None,
                           name=None):
    """Streaming computation of True and False Positive arrays. This metrics
    also keeps track of scores and number of grountruth objects.
    """
    # Input dictionaries: dict outputs as streaming metrics.
    if isinstance(scores, dict) or isinstance(fp, dict):
        d_values = {}
        d_update_ops = {}
        for c in num_gbboxes.keys():
            scope = 'streaming_tp_fp_%s' % c
            v, up = streaming_tp_fp_arrays(num_gbboxes[c], tp[c], fp[c], scores[c],
                                           remove_zero_scores,
                                           metrics_collections,
                                           updates_collections,
                                           name=scope)
            d_values[c] = v
            d_update_ops[c] = up
        return d_values, d_update_ops

    # Input Tensors...
    with variable_scope.variable_scope(name, 'streaming_tp_fp',
                                       [num_gbboxes, tp, fp, scores]):
        num_gbboxes = math_ops.to_int64(num_gbboxes)
        scores = math_ops.to_float(scores)
        stype = tf.bool
        tp = tf.cast(tp, stype)
        fp = tf.cast(fp, stype)
        # Reshape TP and FP tensors and clean away 0 class values.
        scores = tf.reshape(scores, [-1])
        tp = tf.reshape(tp, [-1])
        fp = tf.reshape(fp, [-1])
        # Remove TP and FP both false.
        mask = tf.logical_or(tp, fp)
        if remove_zero_scores:
            rm_threshold = 1e-4
            mask = tf.logical_and(mask, tf.greater(scores, rm_threshold))
            scores = tf.boolean_mask(scores, mask)
            tp = tf.boolean_mask(tp, mask)
            fp = tf.boolean_mask(fp, mask)

        # Local variables accumulating information over batches.
        v_nobjects = _create_local('v_num_gbboxes', shape=[], dtype=tf.int64)
        v_ndetections = _create_local('v_num_detections', shape=[], dtype=tf.int32)
        v_scores = _create_local('v_scores', shape=[0, ])
        v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
        v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)

        # Update operations.
        nobjects_op = state_ops.assign_add(v_nobjects,
                                           tf.reduce_sum(num_gbboxes))
        ndetections_op = state_ops.assign_add(v_ndetections,
                                              tf.size(scores, out_type=tf.int32))
        scores_op = state_ops.assign(v_scores, tf.concat([v_scores, scores], axis=0),
                                     validate_shape=False)
        tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp], axis=0),
                                 validate_shape=False)
        fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp], axis=0),
                                 validate_shape=False)

        # Value and update ops.
        val = (v_nobjects, v_ndetections, v_tp, v_fp, v_scores)
        with ops.control_dependencies([nobjects_op, ndetections_op,
                                       scores_op, tp_op, fp_op]):
            update_op = (nobjects_op, ndetections_op, tp_op, fp_op, scores_op)

        if metrics_collections:
            ops.add_to_collections(metrics_collections, val)
        if updates_collections:
            ops.add_to_collections(updates_collections, update_op)
        return val, update_op
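Like the TF metrics API, the function above returns a (value, update_op) pair backed by local variables: update_op is run once per batch and the value is read at the end. A minimal sketch of that idiom using tf.metrics.mean as a stand-in metric (not the function above):

import numpy as np
import tensorflow as tf

scores = tf.placeholder(tf.float32, shape=[None])
mean_value, mean_update = tf.metrics.mean(scores)

with tf.Session() as sess:
    # Streaming metrics accumulate in local variables, so initialize them too.
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    for batch in [np.array([0.2, 0.8]), np.array([0.5])]:
        sess.run(mean_update, feed_dict={scores: batch})
    print(sess.run(mean_value))  # mean over every batch seen so far: 0.5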
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Generator for v1 ResNet models.

  This function generates a family of ResNet v1 models. See the resnet_v1_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether batch_norm layers are in training mode.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
    with variable_scope.variable_scope(scope,
                                       'resnet_v1', [inputs],
                                       reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
            [layers.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with arg_scope([layers.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple of 4.'
                            )
                        output_stride /= 4
                    net = resnet_utils.conv2d_same(net,
                                                   64,
                                                   7,
                                                   stride=2,
                                                   scope='conv1')
                    net = layers_lib.max_pool2d(net, [3, 3],
                                                stride=2,
                                                scope='pool1')
                net = resnet_utils.stack_blocks_dense(net, blocks,
                                                      output_stride)
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(net, [1, 2],
                                               name='pool5',
                                               keep_dims=True)
                if num_classes is not None:
                    net = layers.conv2d(net,
                                        num_classes, [1, 1],
                                        activation_fn=None,
                                        normalizer_fn=None,
                                        scope='logits')
                # Convert end_points_collection into a dictionary of end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers_lib.softmax(
                        net, scope='predictions')
                return net, end_points
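As a quick check of the output-size formula quoted in the docstring above, here is a small sketch that evaluates [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] for the two input sizes mentioned there:

def dense_output_shape(height, width, output_stride):
    # Spatial shape of the last feature map when the input dimensions are
    # multiples of output_stride plus 1, as described in the docstring.
    return ((height - 1) // output_stride + 1,
            (width - 1) // output_stride + 1)

print(dense_output_shape(321, 321, 16))  # (21, 21)
print(dense_output_shape(225, 225, 32))  # (8, 8), matching the docstring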
Example #45
0
	def __init__(self, config, mode, forward_only, cell_mode=None, no_previous=False, max_cell_length=None):
		super(DBRNNModel, self).__init__(config, mode, cell_mode=cell_mode, no_previous=no_previous,
										max_cell_length=max_cell_length)
		self.cell_fw = self.cell
		self.cell_bw = self.cell
		output_projection_forward = None
		output_projection_backward = None
		softmax_loss_function_forward = None
		softmax_loss_function_backward = None

		# forward output brnn sampled output projection
		with vs.variable_scope('forward_output_linear'):
			# sampled softmax
			if self.num_samples and self.num_samples < self.num_output_symbols:
				w_forward = tf.get_variable("Forward_proj_w", [self.cell_units, self.num_output_symbols])
				w_t_forward = tf.transpose(w_forward)
				b_forward = tf.get_variable("Forward_proj_b", [self.num_output_symbols])
				output_projection_forward = (w_forward, b_forward)
				def sampled_loss_forward(labels, inputs):
					labels = tf.reshape(labels, [-1, 1])
					local_w_t = tf.cast(w_t_forward, tf.float32)
					local_b = tf.cast(b_forward, tf.float32)
					local_inputs = tf.cast(inputs, tf.float32)
					return tf.nn.sampled_softmax_loss(
							weights=local_w_t,
							biases=local_b,
							labels=labels,
							inputs=local_inputs,
							num_sampled=self.num_samples,
							num_classes=self.num_output_symbols)
				softmax_loss_function_forward = sampled_loss_forward

		# backward output brnn sampled output projection
		with vs.variable_scope('backward_output_linear'):
			# sampled softmax
			if self.num_samples and self.num_samples < self.num_output_symbols:
				w_backward = tf.get_variable("Backward_proj_w", [self.cell_units, self.num_output_symbols])
				w_t_backward = tf.transpose(w_backward)
				b_backward = tf.get_variable("Backward_proj_b", [self.num_output_symbols])
				output_projection_backward = (w_backward, b_backward)
				def sampled_loss_backward(labels, inputs):
					labels = tf.reshape(labels, [-1, 1])
					local_w_t = tf.cast(w_t_backward, tf.float32)
					local_b = tf.cast(b_backward, tf.float32)
					local_inputs = tf.cast(inputs, tf.float32)
					return tf.nn.sampled_softmax_loss(
							weights=local_w_t,
							biases=local_b,
							labels=labels,
							inputs=local_inputs,
							num_sampled=self.num_samples,
							num_classes=self.num_output_symbols)
				softmax_loss_function_backward = sampled_loss_backward

		with vs.variable_scope('Dependent_BRNN_Model'):
			# make sampled softmax
			output_projection = None
			softmax_loss_function = None
			if self.num_samples and self.num_samples < self.num_output_symbols:
				w = tf.get_variable("proj_w", [self.cell_fw.output_size+self.cell_bw.output_size, self.num_output_symbols])
				w_t = tf.transpose(w)
				b = tf.get_variable("proj_b", [self.num_output_symbols])
				output_projection = (w, b)
				def sampled_loss(labels, inputs):
					labels = tf.reshape(labels, [-1, 1])
					local_w_t = tf.cast(w_t, tf.float32)
					local_b = tf.cast(b, tf.float32)
					local_inputs = tf.cast(inputs, tf.float32)
					return tf.nn.sampled_softmax_loss(
						weights=local_w_t,
						biases=local_b,
						labels=labels,
						inputs=local_inputs,
						num_sampled=self.num_samples,
						num_classes=self.num_output_symbols)
				softmax_loss_function = sampled_loss

			self.brnn_outputs, self.state = model_utils.dependent_brnn(
				self.inputs, self.cell_fw, self.cell_bw,
				num_input_symbols=self.num_input_symbols,
				num_output_symbols=self.num_output_symbols,
				embedding_size=self.embedding_size,
				output_projection_fw=output_projection_forward,
				output_projection_bw=output_projection_backward,
				not_shared=self.not_shared)

			self.losses_fw = model_utils.sequence_loss(self.brnn_outputs[0],
        										self.targets,
        										self.weights,
        										softmax_loss_function=softmax_loss_function_forward)
			self.losses_bw = model_utils.sequence_loss(self.brnn_outputs[1],
        										self.targets,
        										self.weights,
        										softmax_loss_function=softmax_loss_function_backward)
			# Combine the output
			self.outputs = []
			for time_step in xrange(len(self.brnn_outputs[0])):
				with vs.variable_scope(
          				vs.get_variable_scope(), reuse=True if time_step > 0 else None):
					self.outputs.append(model_utils.linear(array_ops.concat([self.brnn_outputs[0][time_step], self.brnn_outputs[1][time_step]], -1), 
														self.num_output_symbols,scope='output_projection'))
			self.losses = model_utils.sequence_loss(self.outputs,
        										self.targets,
        										self.weights,
        										softmax_loss_function=softmax_loss_function)

		# Gradients and SGD update operation for training the model.
		all_params = tf.compat.v1.trainable_variables()
		params_fw = [p for p in all_params if p.name.find('input_brnn')!=-1 or p.name.find('output_brnn/FW')!=-1]
		params_bw = [p for p in all_params if p.name.find('input_brnn')!=-1 or p.name.find('output_brnn/BW')!=-1]
		# shared, provides better performance
		params = all_params
		# not shared
		#params = [p for p in all_params if p not in params_fw and p not in params_bw]
		if not forward_only:
			self.gradient_norms = []
			self.updates = []
			#opt = tf.train.AdamOptimizer(self.learning_rate)
			opt = tf.compat.v1.train.AdagradOptimizer(self.learning_rate)
			gradients = tf.gradients(self.losses, params)
			gradients_fw = tf.gradients(self.losses_fw, params_fw)
			gradients_bw = tf.gradients(self.losses_bw, params_bw)
			clipped_gradients, norm = tf.clip_by_global_norm(gradients, 
													self.max_gradient_norm)
			clipped_gradients_fw, norm_fw = tf.clip_by_global_norm(gradients_fw, 
													self.max_gradient_norm)
			clipped_gradients_bw, norm_bw = tf.clip_by_global_norm(gradients_bw, 
													self.max_gradient_norm)
			self.gradient_norms.append(norm)
			self.gradient_norms.append(norm_fw)
			self.gradient_norms.append(norm_bw)
			self.updates.append(opt.apply_gradients( 
				zip(clipped_gradients, params)))
			self.updates.append(opt.apply_gradients( 
				zip(clipped_gradients_fw, params_fw)))
			self.updates.append(opt.apply_gradients( 
				zip(clipped_gradients_bw, params_bw), global_step=self.global_step))
		self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=self.max_checkpoints_to_keep)
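The three clip_by_global_norm calls above rescale each gradient list jointly when its global norm exceeds max_gradient_norm. A small sketch of that behaviour with hand-picked constants:

import tensorflow as tf

g1 = tf.constant([3.0, 4.0])   # contributes norm 5
g2 = tf.constant([0.0, 0.0])
clipped, global_norm = tf.clip_by_global_norm([g1, g2], clip_norm=2.5)

with tf.Session() as sess:
    print(sess.run(global_norm))   # 5.0
    print(sess.run(clipped[0]))    # [1.5 2. ] -- rescaled by 2.5 / 5.0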
Example #46
0
def streaming_precision_recall_arrays(n_gbboxes, rclasses, rscores,
                                      tp_tensor, fp_tensor,
                                      remove_zero_labels=True,
                                      metrics_collections=None,
                                      updates_collections=None,
                                      name=None):
    """Streaming computation of precision / recall arrays. This metrics
    keeps tracks of boolean True positives and False positives arrays.
    """
    with variable_scope.variable_scope(name, 'stream_precision_recall',
                                       [n_gbboxes, rclasses, tp_tensor, fp_tensor]):
        n_gbboxes = math_ops.to_int64(n_gbboxes)
        rclasses = math_ops.to_int64(rclasses)
        rscores = math_ops.to_float(rscores)

        stype = tf.int32
        tp_tensor = tf.cast(tp_tensor, stype)
        fp_tensor = tf.cast(fp_tensor, stype)

        # Reshape TP and FP tensors and clean away 0 class values.
        rclasses = tf.reshape(rclasses, [-1])
        rscores = tf.reshape(rscores, [-1])
        tp_tensor = tf.reshape(tp_tensor, [-1])
        fp_tensor = tf.reshape(fp_tensor, [-1])
        if remove_zero_labels:
            mask = tf.greater(rclasses, 0)
            rclasses = tf.boolean_mask(rclasses, mask)
            rscores = tf.boolean_mask(rscores, mask)
            tp_tensor = tf.boolean_mask(tp_tensor, mask)
            fp_tensor = tf.boolean_mask(fp_tensor, mask)

        # Local variables accumulating information over batches.
        v_nobjects = _create_local('v_nobjects', shape=[], dtype=tf.int64)
        v_ndetections = _create_local('v_ndetections', shape=[], dtype=tf.int32)
        v_scores = _create_local('v_scores', shape=[0, ])
        v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
        v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)

        # Update operations.
        nobjects_op = state_ops.assign_add(v_nobjects,
                                           tf.reduce_sum(n_gbboxes))
        ndetections_op = state_ops.assign_add(v_ndetections,
                                              tf.size(rscores, out_type=tf.int32))
        scores_op = state_ops.assign(v_scores, tf.concat([v_scores, rscores], axis=0),
                                     validate_shape=False)
        tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp_tensor], axis=0),
                                 validate_shape=False)
        fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp_tensor], axis=0),
                                 validate_shape=False)

        # Precision and recall computations.
        # r = _precision_recall(nobjects_op, scores_op, tp_op, fp_op, 'value')
        r = _precision_recall(v_nobjects, v_ndetections, v_scores,
                              v_tp, v_fp, 'value')

        with ops.control_dependencies([nobjects_op, ndetections_op,
                                       scores_op, tp_op, fp_op]):
            update_op = _precision_recall(nobjects_op, ndetections_op,
                                          scores_op, tp_op, fp_op, 'update_op')

            # update_op = tf.Print(update_op,
            #                      [tf.reduce_sum(tf.cast(mask, tf.int64)),
            #                       tf.reduce_sum(tf.cast(mask2, tf.int64)),
            #                       tf.reduce_min(rscores),
            #                       tf.reduce_sum(n_gbboxes)],
            #                      'Metric: ')
            # Some debugging stuff!
            # update_op = tf.Print(update_op,
            #                      [tf.shape(tp_op),
            #                       tf.reduce_sum(tf.cast(tp_op, tf.int64), axis=0)],
            #                      'TP and FP shape: ')
            # update_op[0] = tf.Print(update_op,
            #                      [nobjects_op],
            #                      '# Groundtruth bboxes: ')
            # update_op = tf.Print(update_op,
            #                      [update_op[0][0],
            #                       update_op[0][-1],
            #                       tf.reduce_min(update_op[0]),
            #                       tf.reduce_max(update_op[0]),
            #                       tf.reduce_min(update_op[1]),
            #                       tf.reduce_max(update_op[1])],
            #                      'Precision and recall :')

        if metrics_collections:
            ops.add_to_collections(metrics_collections, r)
        if updates_collections:
            ops.add_to_collections(updates_collections, update_op)
        return r, update_op
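The _precision_recall helper used above is not shown here; the usual way such a helper turns the accumulated counters into precision/recall arrays is to sort detections by score and take cumulative TP/FP sums. A plain-NumPy sketch of that convention (an assumption about _precision_recall, not its actual code):

import numpy as np

def precision_recall_arrays(n_gbboxes, scores, tp, fp):
    # Hypothetical equivalent of _precision_recall: sort detections by
    # descending score, then accumulate true/false positive counts.
    order = np.argsort(-scores)
    tp_cum = np.cumsum(tp[order].astype(np.float64))
    fp_cum = np.cumsum(fp[order].astype(np.float64))
    recall = tp_cum / max(n_gbboxes, 1)
    precision = tp_cum / np.maximum(tp_cum + fp_cum, 1e-8)
    return precision, recall

prec, rec = precision_recall_arrays(
    n_gbboxes=3,
    scores=np.array([0.9, 0.8, 0.3]),
    tp=np.array([1, 0, 1]),
    fp=np.array([0, 1, 0]))
print(prec)  # [1.    0.5   0.667]
print(rec)   # [0.333 0.333 0.667]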
def attention_RNN(encoder_outputs, 
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
  if use_attention:
    print ('Use the attention RNN model')
    if num_heads < 1:
      raise ValueError("With less than 1 heads, use a non-attention decoder.")
  
    with variable_scope.variable_scope(scope or "attention_RNN"):
      output_size = encoder_outputs[0].get_shape()[1].value
      top_states = [array_ops.reshape(e, [-1, 1, output_size])
                  for e in encoder_outputs]
      attention_states = array_ops.concat(top_states, 1)
      if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                       % attention_states.get_shape())
  
      batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value
  
      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a,
                                             [attention_vec_size]))
  
      def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        attn_weights = []
        ds = []  # Results of attention reads will be stored here.
        for i in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell_impl._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return attn_weights, ds
  
      batch_attn_size = array_ops.stack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]
      for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])
  
      # loop through the encoder_outputs
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      for i in xrange(len(encoder_outputs)):
        if i > 0:
          variable_scope.get_variable_scope().reuse_variables()
        if i == 0:
          with variable_scope.variable_scope("Initial_Decoder_Attention"):
            initial_state = rnn_cell_impl._linear(encoder_state, output_size, True)
          attn_weights, ds = attention(initial_state)
        else:
          attn_weights, ds = attention(encoder_outputs[i])
        output = array_ops.concat([ds[0], encoder_outputs[i]], 1) # NOTE: here we temporarily assume num_head = 1
        with variable_scope.variable_scope("AttnRnnOutputProjection"):
          logit = rnn_cell_impl._linear(output, num_decoder_symbols, True)
        attention_encoder_outputs.append(logit) # NOTE: here we temporarily assume num_head = 1
        sequence_attention_weights.append(attn_weights[0]) # NOTE: here we temporarily assume num_head = 1
  else:
    print ('Use the NON attention RNN model')
    with variable_scope.variable_scope(scope or "non-attention_RNN"):
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      
      # copy over logits once out of sequence_length
      if encoder_outputs[0].get_shape().ndims != 1:
        (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
      else:
        fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]

      if fixed_batch_size.value: 
        batch_size = fixed_batch_size.value
      else:
        batch_size = array_ops.shape(encoder_outputs[0])[0]
      if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length is not None:  # Prepare variables
        zero_logit = array_ops.zeros(
            array_ops.pack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
        zero_logit.set_shape(
            tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)
    
      for time, input_ in enumerate(encoder_outputs):
        if time > 0: variable_scope.get_variable_scope().reuse_variables()
        # pylint: disable=cell-var-from-loop
        # call_cell = lambda: cell(input_, state)
        generate_logit = lambda: rnn_cell_impl._linear(encoder_outputs[time], num_decoder_symbols, True)
        # pylint: enable=cell-var-from-loop
        if sequence_length is not None:
          logit = _step(
              time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit)
        else:
          logit = generate_logit()
        attention_encoder_outputs.append(logit)   
        
  return attention_encoder_outputs, sequence_attention_weights
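The attention score used above is the additive form v^T tanh(W1 h_i + W2 q), computed there with a 1x1 convolution over the reshaped encoder states. A minimal single-head sketch of the same scoring with plain matrix ops (the shapes and variable names below are illustrative only):

import numpy as np
import tensorflow as tf

batch, time, size = 2, 5, 8
encoder = tf.placeholder(tf.float32, [batch, time, size])
query = tf.placeholder(tf.float32, [batch, size])

W1 = tf.get_variable("W1", [size, size])
W2 = tf.get_variable("W2", [size, size])
v = tf.get_variable("v", [size])

keys = tf.tensordot(encoder, W1, axes=[[2], [0]])           # [batch, time, size]
q_proj = tf.expand_dims(tf.matmul(query, W2), 1)            # [batch, 1, size]
scores = tf.reduce_sum(v * tf.tanh(keys + q_proj), axis=2)  # [batch, time]
weights = tf.nn.softmax(scores)
context = tf.reduce_sum(tf.expand_dims(weights, 2) * encoder, axis=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    w, c = sess.run([weights, context],
                    feed_dict={encoder: np.random.rand(batch, time, size),
                               query: np.random.rand(batch, size)})
    print(w.shape, c.shape)  # (2, 5) (2, 8)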
Example #48
0
def roll_attention_decoder(decoder_inputs,
                           initial_state,
                           encoder_states,
                           enc_padding_mask,
                           cell,
                           initial_state_attention=False,
                           pointer_gen=True):
    """
    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      encoder_states: 3D Tensor [batch_size x attn_length x attn_size].
      enc_padding_mask: 2D Tensor [batch_size x attn_length] containing 1s and 0s; indicates which of the encoder locations are padding (0) or a real token (1).
      cell: rnn_cell.RNNCell defining the cell function and size.
      initial_state_attention:
        Note that this attention decoder passes each decoder input through a
        linear layer with the previous step's context vector to get a modified
        version of the input. If initial_state_attention is False, on the first
        decoder step the "previous context vector" is just a zero vector. If
        initial_state_attention is True, we use initial_state to (re)calculate
        the previous step's context vector. We set this to False for train/eval
        mode (because we call attention_decoder once for all decoder steps) and
        True for decode mode (because we call attention_decoder once for each
        decoder step).
      pointer_gen: boolean. If True, calculate the generation probability p_gen for each decoder step.

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x cell.output_size]. The output vectors.
      state: The final state of the decoder. A tensor shape [batch_size x cell.state_size].
      attn_dists: A list containing tensors of shape (batch_size,attn_length).
        The attention distributions for each decoder step.
      p_gens: List of scalars. The values of p_gen for each decoder step. Empty list if pointer_gen=False.
    """
    with variable_scope.variable_scope("attention_decoder") as scope:
        batch_size = encoder_states.get_shape(
        )[0].value  # if this line fails, it's because the batch size isn't defined
        attn_size = encoder_states.get_shape(
        )[2].value  # if this line fails, it's because the attention length isn't defined

        # Reshape encoder_states (need to insert a dim)
        encoder_states = tf.expand_dims(
            encoder_states,
            axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features
        W_h = variable_scope.get_variable(
            "W_h", [1, 1, attn_size, attention_vec_size])
        encoder_features = nn_ops.conv2d(
            encoder_states, W_h, [1, 1, 1, 1],
            "SAME")  # shape (batch_size,attn_length,1,attention_vec_size)

        # Get the weight vectors v and w_c (w_c is for coverage)
        v = variable_scope.get_variable("v", [attention_vec_size])

        def attention(decoder_state):
            """Calculate the context vector and attention distribution from the decoder state.

            Args:
              decoder_state: state of the decoder

            Returns:
              context_vector: weighted sum of encoder_states
              attn_dist: attention distribution
            """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_dist = nn_ops.softmax(
                        e)  # take softmax. shape (batch_size, attn_length)
                    attn_dist *= enc_padding_mask  # apply mask
                    masked_sums = tf.reduce_sum(attn_dist,
                                                axis=1)  # shape (batch_size)
                    return attn_dist / tf.reshape(masked_sums,
                                                  [-1, 1])  # re-normalize

                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                e = math_ops.reduce_sum(
                    v * math_ops.tanh(encoder_features + decoder_features),
                    [2, 3])  # calculate e

                # Calculate attention distribution
                attn_dist = masked_attention(e)

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist

        def run(ind):
            i = 0
            inp = tf.gather(decoder_inputs, ind)
            ith_state = tf.unstack(tf.gather(initial_state, ind))
            context_vector = array_ops.zeros([batch_size, attn_size])
            context_vector.set_shape([
                None, attn_size
            ])  # Ensure the second shape of attention vectors is set.
            if initial_state_attention:  # true in decode mode
                # Re-calculate the context vector from the previous step so
                # that we can pass it through a linear layer with this step's
                # input to get a modified version of the input.
                context_vector, _ = attention(ith_state)

            # tf.logging.info("Adding attention_decoder timestep %i of %i", i, len(decoder_inputs))
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Merge input and previous attentions into one vector x of the same size as inp
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            # Run the decoder RNN cell. cell_output = decoder state
            cell_output, state = cell(x, ith_state)

            # state_list.append(state)

            # Run the attention mechanism.
            #if initial_state_attention:  # always true in decode mode
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(), reuse=True
            ):  # you need this because you've already run the initial attention(...) call
                context_vector, attn_dist = attention(state)

            # Calculate p_gen
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    p_gen = linear([context_vector, state.c, state.h, x], 1,
                                   True)  # a scalar
                    p_gen = tf.sigmoid(p_gen)
                    # p_gens.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer
            # This is V[s_t, h*_t] + b in the paper
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            # outputs.append(output)
            return tf.stack(state), attn_dist, p_gen, output

        states, attn_dists, p_gens, outputs = tf.map_fn(
            run,
            tf.range(len(decoder_inputs)),
            dtype=(tf.float32, tf.float32, tf.float32, tf.float32))

        # for i, inp in enumerate(decoder_inputs):
        #     state, attn_dist, p_gen, output = run(i, inp)
        #     state_list.append(state)
        #     attn_dists.append(attn_dist)
        #     p_gens.append(p_gen)
        #     outputs.append(output)
        # state = state_list

        return tf.unstack(outputs), [
            tf.contrib.rnn.LSTMStateTuple(elem[0], elem[1])
            for elem in tf.unstack(states)
        ], tf.unstack(attn_dists), tf.unstack(p_gens)
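The p_gen computed above is typically used outside this decoder to mix the vocabulary distribution with the attention (copy) distribution; that mixing is not shown in this snippet. A small NumPy sketch of the usual pointer-generator convention (an assumption, not code from this project):

import numpy as np

def final_distribution(p_gen, vocab_dist, copy_dist):
    # Pointer-generator mixing: p_gen weights the softmax vocabulary
    # distribution, (1 - p_gen) weights the attention distribution after it
    # has been scattered onto vocabulary ids.
    return p_gen * vocab_dist + (1.0 - p_gen) * copy_dist

vocab = np.array([0.7, 0.2, 0.1])
copy = np.array([0.0, 0.5, 0.5])
print(final_distribution(0.8, vocab, copy))  # [0.56 0.26 0.18], still sums to 1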
Example #49
0
    def __init__(self,
                 linear_size,
                 num_layers,
                 residual,
                 batch_norm,
                 max_norm,
                 batch_size,
                 learning_rate,
                 summaries_dir,
                 predict_14=False,
                 dtype=tf.float32):
        """Creates the linear + relu model

    Args
      linear_size: integer. number of units in each layer of the model
      num_layers: integer. number of bilinear blocks in the model
      residual: boolean. Whether to add residual connections
      batch_norm: boolean. Whether to use batch normalization
      max_norm: boolean. Whether to clip weights to a norm of 1
      batch_size: integer. The size of the batches used during training
      learning_rate: float. Learning rate to start with
      summaries_dir: String. Directory where to log progress
      predict_14: boolean. Whether to predict 14 instead of 17 joints
      dtype: the data type to use to store internal variables
    """

        # There are in total 17 joints in H3.6M and 16 in MPII (and therefore in stacked
        # hourglass detections). We settled with 16 joints in 2d just to make models
        # compatible (e.g. you can train on ground truth 2d and test on SH detections).
        # This does not seem to have an effect on prediction performance.
        self.HUMAN_2D_SIZE = 16 * 2

        # In 3d all the predictions are zero-centered around the root (hip) joint, so
        # we actually predict only 16 joints. The error is still computed over 17 joints,
        # because if one uses, e.g. Procrustes alignment, there is still error in the
        # hip to account for!
        # There is also an option to predict only 14 joints, which makes our results
        # directly comparable to those in https://arxiv.org/pdf/1611.09010.pdf
        self.HUMAN_3D_SIZE = 14 * 3 if predict_14 else 16 * 3

        self.input_size = self.HUMAN_2D_SIZE
        self.output_size = self.HUMAN_3D_SIZE

        self.isTraining = tf.placeholder(tf.bool, name="isTrainingflag")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # Summary writers for train and test runs
        self.train_writer = tf.summary.FileWriter(
            os.path.join(summaries_dir, 'train'))
        self.test_writer = tf.summary.FileWriter(
            os.path.join(summaries_dir, 'test'))

        self.linear_size = linear_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype,
                                         name="learning_rate")
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        decay_steps = 100000  # empirical
        decay_rate = 0.96  # empirical
        self.learning_rate = tf.train.exponential_decay(
            self.learning_rate, self.global_step, decay_steps, decay_rate)

        # === Transform the inputs ===
        with vs.variable_scope("inputs"):

            # in=2d poses, out=3d poses
            enc_in = tf.placeholder(dtype,
                                    shape=[None, self.input_size],
                                    name="enc_in")
            dec_out = tf.placeholder(dtype,
                                     shape=[None, self.output_size],
                                     name="dec_out")

            self.encoder_inputs = enc_in
            self.decoder_outputs = dec_out

        # === Create the linear + relu combos ===
        with vs.variable_scope("linear_model"):

            # === First layer, brings dimensionality up to linear_size ===
            w1 = tf.get_variable(name="w1",
                                 initializer=kaiming,
                                 shape=[self.HUMAN_2D_SIZE, linear_size],
                                 dtype=dtype)
            b1 = tf.get_variable(name="b1",
                                 initializer=kaiming,
                                 shape=[linear_size],
                                 dtype=dtype)
            w1 = tf.clip_by_norm(w1, 1) if max_norm else w1
            y3 = tf.matmul(enc_in, w1) + b1

            if batch_norm:
                y3 = tf.layers.batch_normalization(y3,
                                                   training=self.isTraining,
                                                   name="batch_normalization")
            y3 = tf.nn.relu(y3)
            y3 = tf.nn.dropout(y3, self.dropout_keep_prob)

            # === Create multiple bi-linear layers ===
            for idx in range(num_layers):
                y3 = self.two_linear(y3, linear_size, residual,
                                     self.dropout_keep_prob, max_norm,
                                     batch_norm, dtype, idx)

            # === Last linear layer has HUMAN_3D_SIZE in output ===
            w4 = tf.get_variable(name="w4",
                                 initializer=kaiming,
                                 shape=[linear_size, self.HUMAN_3D_SIZE],
                                 dtype=dtype)
            b4 = tf.get_variable(name="b4",
                                 initializer=kaiming,
                                 shape=[self.HUMAN_3D_SIZE],
                                 dtype=dtype)
            w4 = tf.clip_by_norm(w4, 1) if max_norm else w4
            y = tf.matmul(y3, w4) + b4
            # === End linear model ===

        # Store the outputs here
        self.outputs = y
        self.loss = tf.reduce_mean(tf.square(y - dec_out))
        self.loss_summary = tf.summary.scalar('loss/loss', self.loss)

        # To keep track of the loss in mm
        self.err_mm = tf.placeholder(tf.float32, name="error_mm")
        self.err_mm_summary = tf.summary.scalar("loss/error_mm", self.err_mm)

        # Gradients and update operation for training the model.
        opt = tf.train.AdamOptimizer(self.learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):

            # Update all the trainable parameters
            gradients = opt.compute_gradients(self.loss)
            self.gradients = [[] if i is None else i for i in gradients]
            self.updates = opt.apply_gradients(gradients,
                                               global_step=self.global_step)

        # Keep track of the learning rate
        self.learning_rate_summary = tf.summary.scalar(
            'learning_rate/learning_rate', self.learning_rate)

        # To save the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
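The schedule built with tf.train.exponential_decay above multiplies the learning rate by decay_rate once every decay_steps global steps. A tiny sketch of that behaviour with the same decay constants (the starting rate here is illustrative):

import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.001, global_step,
                                decay_steps=100000, decay_rate=0.96)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(lr))                        # 0.001 at step 0
    sess.run(tf.assign(global_step, 100000))
    print(sess.run(lr))                        # 0.00096 at step 100000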
Example #50
0
	def __init__(self, config, mode, forward_only, feed_previous, cell_mode=None, no_previous=False, max_cell_length=None):
		super(BASIC_RNNModel, self).__init__(config, mode, cell_mode=cell_mode, no_previous=no_previous, max_cell_length=max_cell_length)

		# make sampled softmax
		output_projection = None
		softmax_loss_function = None
		if self.num_samples and self.num_samples < self.num_output_symbols:
			w = tf.get_variable("proj_w", [self.cell_units, self.num_output_symbols])
			w_t = tf.transpose(w)
			b = tf.get_variable("proj_b", [self.num_output_symbols])
			output_projection = (w, b)
			def sampled_loss(labels, inputs):
				labels = tf.reshape(labels, [-1, 1])
				local_w_t = tf.cast(w_t, tf.float32)
				local_b = tf.cast(b, tf.float32)
				local_inputs = tf.cast(inputs, tf.float32)
				return tf.nn.sampled_softmax_loss(
					weights=local_w_t,
					biases=local_b,
					labels=labels,
					inputs=local_inputs,
					num_sampled=self.num_samples,
					num_classes=self.num_output_symbols)
			softmax_loss_function = sampled_loss

		# one to one learning task
		def Net(inputs,feed_previous,initial_state=None):
			return model_utils.basic_rnn(
				inputs, self.cell,
				num_input_symbols=self.num_input_symbols,
				num_output_symbols=self.num_output_symbols,
				embedding_size=self.embedding_size,
				output_projection=output_projection,
				feed_previous=feed_previous,
				initial_state=initial_state,
				not_shared=self.not_shared)

		with vs.variable_scope('SRN_Model'):
			self.outputs, self.losses, self.state = model_utils.make_model(
					self.inputs, self.targets, self.weights,
					lambda x,y: Net(x,
								feed_previous=feed_previous,
								initial_state=y),
					softmax_loss_function=softmax_loss_function,
					initial_state=self.initial_state)
			
			if forward_only:
				if output_projection is not None:
					for b,output in enumerate(self.outputs):
						self.outputs[b] = tf.matmul(output, output_projection[0]) + output_projection[1]

		# Gradients and SGD update operation for training the model.
		params = tf.compat.v1.trainable_variables()
		if not forward_only:
			self.gradient_norms = []
			self.updates = []
			#opt = tf.train.AdamOptimizer(self.learning_rate)
			opt = tf.compat.v1.train.AdagradOptimizer(self.learning_rate)
			#opt = tf.train.GradientDescentOptimizer(self.learning_rate)
			gradients = tf.gradients(self.losses, params)
			clipped_gradients, norm = tf.clip_by_global_norm(gradients, 
													self.max_gradient_norm)
			self.gradient_norms.append(norm)
			self.updates.append(opt.apply_gradients( 
				zip(clipped_gradients, params), global_step=self.global_step))
		self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=self.max_checkpoints_to_keep)
Example #51
0
    def build_graph(self, question_hiddens, question_hiddens_mask,
                    context_hiddens, context_hiddens_mask):
        #(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        #     q_j,            q_j mask,       c_i        , c_i mask
        """
        context_hiddens attend to question_hiddens.
        For each context_hiddens, return an attention distribution and an attention output vector.

        Inputs:
          question_hiddens: Tensor shape (batch_size, question_len, 2h).
          question_hiddens_mask: Tensor shape (batch_size, question_len).
            1s where there's real input, 0s where there's padding
          context_hiddens: Tensor shape (batch_size, context_len, 2h)
          context_hiddens_mask: Tensor shape (batch_size, context_len).

        Outputs:
          attn_dist: Tensor shape (batch_size, context_len, question_len).
            For each context_hiddens, the distribution should sum to 1,
            and should be 0 in the value locations that correspond to padding.
          output: Tensor shape (batch_size, context_len, hidden_size).
            This is the attention output; the weighted sum of the question_hiddens
            (using the attention distribution as weights).
        """
        with vs.variable_scope("BiDAFAttn"):

            # *******************************
            # *** Build similarity matrix ***
            # *******************************

            with vs.variable_scope("Similarity_matrix"):

                W_sim1 = tf.get_variable(
                    "W_sim1_cn", shape=[self.context_hiddens_vec_size,
                                        1])  # shape (2h, 1)
                W_sim2 = tf.get_variable(
                    "W_sim2_qn", shape=[self.context_hiddens_vec_size, 1])
                W_sim3 = tf.get_variable(
                    "W_sim3_cq", shape=[self.context_hiddens_vec_size, 1])

                question_len = question_hiddens.get_shape().as_list()[1]
                context_len = context_hiddens.get_shape().as_list()[1]

                # sub-matrix (W_sim1 . context_repeat) // (?,N,M) = (?, context_len, question_len)
                W_sim1_context = tf.tensordot(
                    W_sim1,
                    context_hiddens,
                    axes=[[0], [2]],
                    name="W_sim1_dot_cn")  # (1, ?, context_len)
                W_sim1_context = tf.reshape(
                    W_sim1_context,
                    [-1, context_len, 1])  # (?, context_len, 1)
                W_sim1_context_repeat = tf.tile(
                    W_sim1_context,
                    tf.constant([1, 1, question_len
                                 ]))  # (?, context_len, question_len)

                # sub-matrix (W_sim2 . question_repeat) // (?,N,M) = (?, context_len, question_len)
                W_sim2_question = tf.tensordot(
                    W_sim2,
                    question_hiddens,
                    axes=[[0], [2]],
                    name="W_sim2_dot_qn")  # (1, ?, question_len)
                W_sim2_question = tf.reshape(
                    W_sim2_question,
                    [-1, question_len, 1])  # (?, question_len, 1)
                W_sim2_question_repeat = tf.tile(
                    W_sim2_question,
                    tf.constant([1, context_len,
                                 1]))  # (?, context_len*question_len, 1)
                W_sim2_question_repeat = tf.reshape(
                    W_sim2_question_repeat,
                    [-1, context_len, question_len
                     ])  # (?, context_len, question_len)

                # sub-matrix (W_sim3 . context_hiddens o question_hiddens) // (?,N,M) = (?, context_len, question_len)
                W_sim3_times_context = tf.multiply(tf.tile(
                    tf.transpose(W_sim3), tf.constant([context_len, 1])),
                                                   context_hiddens,
                                                   name="W_sim3_o_cn")
                W_sim3_context_question = tf.matmul(
                    W_sim3_times_context,
                    tf.transpose(question_hiddens, perm=[0, 2, 1]),
                    name="W_sim3_x_qn")  # (?, context_len, question_len)

                sim_matrix = tf.add_n(
                    [
                        W_sim1_context_repeat, W_sim2_question_repeat,
                        W_sim3_context_question
                    ],
                    name="sim_matrix")  # shape (?, context_len, question_len)

            # ****************************************
            # *** Calculate attention distribution ***
            # ****************************************

            # *** C2Q Attention ***
            with vs.variable_scope("C2Q_Attention"):
                c2q_attn_logits = sim_matrix  # shape (batch_size, context_len, question_len)
                c2q_attn_logits_mask = tf.expand_dims(
                    question_hiddens_mask,
                    1)  # shape (batch_size, 1, question_len)
                _, c2q_attn_dist = masked_softmax(
                    c2q_attn_logits, c2q_attn_logits_mask, 2
                )  # shape (batch_size, context_len, question_len). take softmax over question_hiddens

                # Use attention distribution to take weighted sum of question_hiddens
                c2q_output = tf.matmul(
                    c2q_attn_dist,
                    question_hiddens)  # shape (batch_size, context_len, 2h)

                # Apply dropout
                c2q_output = tf.nn.dropout(c2q_output, self.keep_prob)

            # *** Q2C Attention ***
            with vs.variable_scope("Q2C_Attention"):
                # m_i
                q2c_attn_logits = tf.reduce_max(
                    sim_matrix, axis=2,
                    keep_dims=True)  # shape (batch_size, context_len, 1)
                q2c_attn_logits_mask = tf.expand_dims(
                    context_hiddens_mask,
                    2)  # shape (batch_size, context_len, 1)
                # beta
                _, q2c_attn_dist = masked_softmax(
                    q2c_attn_logits, q2c_attn_logits_mask, 1
                )  # shape (batch_size, context_len, 1). take softmax over question_hiddens
                q2c_output = tf.reduce_sum(tf.multiply(q2c_attn_dist,
                                                       context_hiddens),
                                           axis=1)  # shape (batch_size, 2h)
                q2c_output = tf.expand_dims(
                    q2c_output, axis=1)  # shape (batch_size, 1, 2h)
                # Apply dropout
                q2c_output = tf.nn.dropout(
                    q2c_output, self.keep_prob)  # shape (batch_size, 1, 2h)

            return c2q_attn_dist, c2q_output, q2c_attn_dist, q2c_output
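The masked_softmax helper used above is not shown in this snippet; it is commonly implemented by adding a very large negative value wherever the mask is 0 before taking the softmax, so padded positions receive near-zero probability. A hypothetical reconstruction matching the call sites above:

import tensorflow as tf

def masked_softmax(logits, mask, dim):
    # Assumed helper: positions where mask == 0 get a very large negative
    # logit, so they contribute ~0 probability after the softmax.
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)
    masked_logits = tf.add(logits, exp_mask)
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist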
Example #52
0
    def testInitFromPartitionVar(self):
        checkpoint_dir = self.get_temp_dir()
        with self.test_session() as session:
            v1 = _create_partition_checkpoints(session, checkpoint_dir)

        # New graph and session.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as session:
                with variable_scope.variable_scope("some_scope"):
                    my1 = variable_scope.get_variable(
                        name="my1",
                        shape=[100, 100],
                        initializer=init_ops.zeros_initializer(),
                        partitioner=partitioned_variables.
                        min_max_variable_partitioner(max_partitions=5,
                                                     axis=0,
                                                     min_slice_size=8 << 10))
                    my1_var_list = my1._get_variable_list()
                # Create another variable with different partitions than the variable in
                # the checkpoint.
                with variable_scope.variable_scope("some_other_scope"):
                    my2 = variable_scope.get_variable(
                        name="var1",
                        shape=[100, 100],
                        initializer=init_ops.zeros_initializer(),
                        partitioner=partitioned_variables.
                        min_max_variable_partitioner(max_partitions=5,
                                                     axis=0,
                                                     min_slice_size=16 << 10))
                    my2_var_list = my2._get_variable_list()

                checkpoint_utils.init_from_checkpoint(
                    checkpoint_dir, {
                        "scope/var1": "some_scope/my1",
                        "scope/": "some_other_scope/"
                    })

                session.run(variables.global_variables_initializer())
                my1_values = session.run(my1_var_list)
                self.assertAllEqual(my1_values, v1)
                my2_values = session.run(my2_var_list)
                # Verify we created a different number of partitions.
                self.assertNotEquals(len(my2_values), len(v1))
                # Verify the values were correctly initialized in spite of
                # different partitions.
                full_my2_values = np.concatenate(my2_values, axis=0)
                full_v1_values = np.concatenate(v1, axis=0)
                self.assertAllEqual(full_my2_values, full_v1_values)

        # New graph and session.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as session:
                with variable_scope.variable_scope("some_scope"):
                    my1 = variable_scope.get_variable(
                        name="my1",
                        shape=[100, 100],
                        initializer=init_ops.truncated_normal_initializer(0.5),
                        partitioner=partitioned_variables.
                        min_max_variable_partitioner(max_partitions=5,
                                                     axis=0,
                                                     min_slice_size=8 << 10))
                    my1_var_list = my1._get_variable_list()

                checkpoint_utils.init_from_checkpoint(
                    checkpoint_dir, {
                        "scope/var1": my1_var_list,
                    })

                session.run(variables.global_variables_initializer())
                my1_values = session.run(my1_var_list)
                self.assertAllEqual(my1_values, v1)
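The same variable remapping is available through the public TF 1.x API as tf.train.init_from_checkpoint. A minimal sketch, assuming a checkpoint at the hypothetical path /tmp/ckpt that contains a variable named scope/var1:

import tensorflow as tf

with tf.Graph().as_default():
    with tf.variable_scope("some_scope"):
        my1 = tf.get_variable("my1", shape=[100, 100],
                              initializer=tf.zeros_initializer())
    # Map the checkpoint variable onto the freshly created one.
    tf.train.init_from_checkpoint("/tmp/ckpt",
                                  {"scope/var1": "some_scope/my1"})
    with tf.Session() as sess:
        # The initializer now pulls the checkpointed values instead of zeros.
        sess.run(tf.global_variables_initializer())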
Example #53
0
def rnn(cell,
        inputs,
        initial_state=None,
        dtype=None,
        sequence_length=None,
        scope=None):
    """Creates a recurrent neural network specified by RNNCell "cell".

  The simplest form of RNN network generated is:
    state = cell.zero_state(...)
    outputs = []
    states = []
    for input_ in inputs:
      output, state = cell(input_, state)
      outputs.append(output)
      states.append(state)
    return (outputs, states)

  However, a few other options are available:

  An initial state can be provided.
  If sequence_length is provided, dynamic calculation is performed.

  Dynamic calculation returns, at time t:
    (t >= max(sequence_length)
        ? (zeros(output_shape), zeros(state_shape))
        : cell(input, state)

  Thus saving computational time when unrolling past the max sequence length.

  Args:
    cell: An instance of RNNCell.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, cell.input_size].
    initial_state: (optional) An initial state for the RNN.  This must be
      a tensor of appropriate type and shape [batch_size x cell.state_size].
    dtype: (optional) The data type for the initial state.  Required if
      initial_state is not provided.
    sequence_length: An int64 vector (tensor) size [batch_size].
    scope: VariableScope for the created subgraph; defaults to "RNN".

  Returns:
    A pair (outputs, states) where:
      outputs is a length T list of outputs (one for each input)
      states is a length T list of states (one state following each input)

  Raises:
    TypeError: If "cell" is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list.
  """

    if not isinstance(cell, rnn_cell.RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    outputs = []
    states = []
    with vs.variable_scope(scope or "RNN"):
        batch_size = array_ops.shape(inputs[0])[0]
        if initial_state is not None:
            state = initial_state
        else:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)

        if sequence_length:  # Prepare variables
            zero_output_state = (array_ops.zeros(
                array_ops.pack([batch_size, cell.output_size]),
                inputs[0].dtype),
                                 array_ops.zeros(
                                     array_ops.pack(
                                         [batch_size, cell.state_size]),
                                     state.dtype))
            max_sequence_length = math_ops.reduce_max(sequence_length)

        for time, input_ in enumerate(inputs):
            if time > 0: vs.get_variable_scope().reuse_variables()

            # pylint: disable=cell-var-from-loop
            def output_state():
                return cell(input_, state)

            # pylint: enable=cell-var-from-loop
            if sequence_length:
                (output,
                 state) = control_flow_ops.cond(time >= max_sequence_length,
                                                lambda: zero_output_state,
                                                output_state)
            else:
                (output, state) = output_state()

            outputs.append(output)
            states.append(state)

        return (outputs, states)
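The public TF 1.x equivalent of the static unroll above is tf.nn.static_rnn. A minimal sketch, assuming a plain BasicRNNCell and illustrative shapes:

import numpy as np
import tensorflow as tf

T, batch, input_size, num_units = 4, 2, 3, 5
cell = tf.nn.rnn_cell.BasicRNNCell(num_units)
inputs = [tf.placeholder(tf.float32, [None, input_size]) for _ in range(T)]
# Like the function above, static_rnn unrolls the cell once per input step.
outputs, final_state = tf.nn.static_rnn(cell, inputs, dtype=tf.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {p: np.random.rand(batch, input_size) for p in inputs}
    outs = sess.run(outputs, feed_dict=feed)
    print(len(outs), outs[0].shape)  # 4 (2, 5)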
Example #54
0
    def two_linear(self, xin, linear_size, residual, dropout_keep_prob,
                   max_norm, batch_norm, dtype, idx):
        """
    Make a bi-linear block with optional residual connection

    Args
      xin: the batch that enters the block
      linear_size: integer. The size of the linear units
      residual: boolean. Whether to add a residual connection
      dropout_keep_prob: float in [0,1]. Keep probability for dropout
        (this value is passed directly to tf.nn.dropout)
      max_norm: boolean. Whether to clip each weight matrix to unit L2 norm
      batch_norm: boolean. Whether to apply batch normalization
      dtype: type of the weights. Usually tf.float32
      idx: integer. Number of layer (for naming/scoping)
    Returns
      y: the batch after it leaves the block
    """

        with vs.variable_scope("two_linear_" + str(idx)) as scope:

            input_size = int(xin.get_shape()[1])

            # Linear 1
            w2 = tf.get_variable(name="w2_" + str(idx),
                                 initializer=kaiming,
                                 shape=[input_size, linear_size],
                                 dtype=dtype)
            b2 = tf.get_variable(name="b2_" + str(idx),
                                 initializer=kaiming,
                                 shape=[linear_size],
                                 dtype=dtype)
            w2 = tf.clip_by_norm(w2, 1) if max_norm else w2
            y = tf.matmul(xin, w2) + b2
            if batch_norm:
                y = tf.layers.batch_normalization(y,
                                                  training=self.isTraining,
                                                  name="batch_normalization1" +
                                                  str(idx))

            y = tf.nn.relu(y)
            y = tf.nn.dropout(y, dropout_keep_prob)

            # Linear 2
            w3 = tf.get_variable(name="w3_" + str(idx),
                                 initializer=kaiming,
                                 shape=[linear_size, linear_size],
                                 dtype=dtype)
            b3 = tf.get_variable(name="b3_" + str(idx),
                                 initializer=kaiming,
                                 shape=[linear_size],
                                 dtype=dtype)
            w3 = tf.clip_by_norm(w3, 1) if max_norm else w3
            y = tf.matmul(y, w3) + b3

            if batch_norm:
                y = tf.layers.batch_normalization(y,
                                                  training=self.isTraining,
                                                  name="batch_normalization2" +
                                                  str(idx))

            y = tf.nn.relu(y)
            y = tf.nn.dropout(y, dropout_keep_prob)

            # Residual every 2 blocks
            y = (xin + y) if residual else y

        return y
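
The kaiming initializer passed to tf.get_variable above is not defined in this snippet; a plausible definition (an assumption, matching a He/Kaiming-style normal initializer scaled by the fan-in) would be:

import tensorflow as tf

def kaiming(shape, dtype=tf.float32, partition_info=None):
    """He-normal-style initializer: samples scaled by sqrt(2 / fan_in)."""
    del partition_info  # unused, but part of the initializer call signature
    return tf.truncated_normal(shape, dtype=dtype) * tf.sqrt(2.0 / float(shape[0]))
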
Example #55
0
    def build_graph(self, values, values_mask):
        """
        Self-attention over the values: each value position attends to all
        value positions, so the values act as both keys and values and there
        is no separate `keys` input.

        Inputs:
          values: Tensor shape (batch_size, num_values, value_vec_size).
          values_mask: Tensor shape (batch_size, num_values).
            1s where there's real input, 0s where there's padding

        Outputs:
          self_attn_dist: Tensor shape (batch_size, num_values, num_values).
            For each position, the distribution sums to 1 and is 0 in the
            value locations that correspond to padding.
          output: Tensor shape (batch_size, num_values, value_vec_size).
            The attention output; the weighted sum of the values
            (using the attention distribution as weights).
        """
        with vs.variable_scope("DotAttn"):

            if self.advanced_dot_attn:
                v1 = tf.layers.dense(values,
                                     self.value_vec_size,
                                     activation=tf.nn.relu,
                                     use_bias=False,
                                     name="W1")
                v2 = tf.layers.dense(values,
                                     self.value_vec_size,
                                     activation=tf.nn.relu,
                                     use_bias=False,
                                     name="W2")
            else:
                v1 = tf.layers.dense(values,
                                     self.value_vec_size,
                                     use_bias=False,
                                     name="W1")
                v2 = tf.layers.dense(values,
                                     self.value_vec_size,
                                     use_bias=False,
                                     name="W2")

            if self.advanced_dot_attn:
                self_attn_logits = tf.matmul(
                    v1,
                    tf.transpose(v2, [0, 2, 1]) / np.sqrt(self.value_vec_size))
            else:
                self_attn_logits = tf.matmul(v1, tf.transpose(v2, [0, 2, 1]))

            self_attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # (batch_size, 1, num_values)
            _, self_attn_dist = masked_softmax(
                self_attn_logits, self_attn_logits_mask,
                2)  # (batch_size, num_values, num_values)

            # Use attention distribution to take weighted sum of values
            output = tf.matmul(
                self_attn_dist,
                values)  # shape (batch_size, num_values, value_vec_size)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

            return self_attn_dist, output
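
The masked_softmax helper used here (and in the later attention examples) is not shown; a typical implementation consistent with how it is called, given here as an assumption, adds a large negative number to the padded positions before the softmax:

import tensorflow as tf

def masked_softmax(logits, mask, dim):
    """Softmax over `dim`, forcing ~zero probability where mask == 0."""
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # large negative at pads
    masked_logits = tf.add(logits, exp_mask)                # mask broadcasts over logits
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist
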
Example #56
0
  def call(self, inputs, state):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.
      - The forget-gate pre-activation `f` (this variant returns it in addition
        to the standard output and state).

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
      if self._num_unit_shards is not None:
        unit_scope.set_partitioner(
            partitioned_variables.fixed_size_partitioner(
                self._num_unit_shards))
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True)
      i, j, f, o = array_ops.split(
          value=lstm_matrix, num_or_size_splits=4, axis=1)
      # Diagonal connections
      if self._use_peepholes:
        with vs.variable_scope(unit_scope) as projection_scope:
          if self._num_unit_shards is not None:
            projection_scope.set_partitioner(None)
          w_f_diag = vs.get_variable(
              "w_f_diag", shape=[self._num_units], dtype=dtype)
          w_i_diag = vs.get_variable(
              "w_i_diag", shape=[self._num_units], dtype=dtype)
          w_o_diag = vs.get_variable(
              "w_o_diag", shape=[self._num_units], dtype=dtype)

      if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
             sigmoid(i + w_i_diag * c_prev) * self._activation(j))
      else:
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
             self._activation(j))

      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type
      if self._use_peepholes:
        m = sigmoid(o + w_o_diag * c) * self._activation(c)
      else:
        m = sigmoid(o) * self._activation(c)

      if self._num_proj is not None:
        with vs.variable_scope("projection") as proj_scope:
          if self._num_proj_shards is not None:
            proj_scope.set_partitioner(
                partitioned_variables.fixed_size_partitioner(
                    self._num_proj_shards))
          m = _linear(m, self._num_proj, bias=False)

        if self._proj_clip is not None:
          # pylint: disable=invalid-unary-operand-type
          m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
          # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state, f
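
Because this modified call() returns the forget-gate pre-activation f alongside the usual (output, state) pair, it cannot be dropped into tf.nn.dynamic_rnn unchanged. A manual unroll in the spirit of the rnn() function earlier on this page could consume it; the names cell, inputs_list and batch_size below are assumed to be provided by the surrounding model code:

import tensorflow as tf

# `cell`, `inputs_list` and `batch_size` are assumed to exist; `inputs_list`
# is a list of [batch, input_size] tensors, one per time step.
outputs, forget_gates = [], []
state = cell.zero_state(batch_size, tf.float32)
with tf.variable_scope("manual_unroll"):
    for t, x_t in enumerate(inputs_list):
        if t > 0:
            tf.get_variable_scope().reuse_variables()
        m, state, f = cell(x_t, state)
        outputs.append(m)
        forget_gates.append(tf.sigmoid(f))  # gate activations, e.g. for inspection
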
Example #57
0
    def build_graph(self, questions, questions_mask, contexts, contexts_mask):

        with vs.variable_scope("AnswerPointerLayerStart"):

            ###### start answer pooling ######

            Vrq = tf.get_variable(
                "v_answer_pooling",
                shape=[1, self.value_vec_size],
                initializer=tf.contrib.layers.xavier_initializer())

            questions_input_lens = tf.reduce_sum(questions_mask,
                                                 reduction_indices=1)

            with vs.variable_scope("RNNLayer1"):
                (self.fw_out1,
                 self.bw_out1), _ = tf.nn.bidirectional_dynamic_rnn(
                     self.rnn_cell_fw1,
                     self.rnn_cell_bw1,
                     questions,
                     questions_input_lens,
                     dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 4)
            questions_out1 = tf.concat([self.fw_out1, self.bw_out1], 2)

            with vs.variable_scope("RNNLayer2"):
                (self.fw_out2,
                 self.bw_out2), _ = tf.nn.bidirectional_dynamic_rnn(
                     self.rnn_cell_fw2,
                     self.rnn_cell_bw2,
                     questions_out1,
                     questions_input_lens,
                     dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 8)
            questions_out2 = tf.concat([self.fw_out2, self.bw_out2], 2)

            with vs.variable_scope("RNNLayer3"):
                (self.fw_out3,
                 self.bw_out3), _ = tf.nn.bidirectional_dynamic_rnn(
                     self.rnn_cell_fw3,
                     self.rnn_cell_bw3,
                     questions_out2,
                     questions_input_lens,
                     dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 16)
            questions_out3 = tf.concat([self.fw_out3, self.bw_out3], 2)

            # (1, value_vec_size)
            k = tf.layers.dense(Vrq,
                                self.value_vec_size,
                                activation=tf.nn.relu,
                                use_bias=False,
                                name="Wvrq")
            # (batch_size, question_len, value_vec_size)
            v = tf.layers.dense(questions_out3,
                                self.value_vec_size,
                                activation=tf.nn.relu,
                                use_bias=False,
                                name="Wv")

            # (1, 1, value_vec_size)
            expanded_k = tf.expand_dims(k, 0)
            # (batch_size, question_len, value_vec_size)
            attn_logits_temp = tf.nn.tanh(expanded_k + v)
            # (batch_size, question_len, 1)
            attn_logits_projected = tf.layers.dense(attn_logits_temp,
                                                    1,
                                                    use_bias=False)
            # (batch_size, 1, question_len)
            attn_logits = tf.transpose(attn_logits_projected, [0, 2, 1])

            attn_logits_mask = tf.expand_dims(
                questions_mask, 1)  # shape (batch_size, 1, question_len)
            _, attn_dist = masked_softmax(
                attn_logits, attn_logits_mask,
                2)  # shape (batch_size, 1, question_len)

            # (batch_size, 1, value_vec_size)
            rQ = tf.matmul(attn_dist, v)

            ###### end answer pooling ######

            # (batch_size, 1, value_vec_size)
            k1 = tf.layers.dense(rQ,
                                 self.value_vec_size,
                                 activation=tf.nn.relu,
                                 use_bias=False,
                                 name="Wrq")
            #print "k1 shape: " + str(k1.get_shape())
            # (batch_size, context_len, value_vec_size)
            v1 = tf.layers.dense(contexts,
                                 self.value_vec_size,
                                 activation=tf.nn.relu,
                                 use_bias=False,
                                 name="Wp")
            #print "v1 shape: " + str(v1.get_shape())

            # (batch_size, context_len, value_vec_size)
            attn_logits1_temp = tf.tanh(k1 + v1)
            #print "attn_logits1_temp shape: " + str(attn_logits1_temp.get_shape())
            # (batch_size, context_len, 1)
            attn_logits1_projected = tf.layers.dense(attn_logits1_temp,
                                                     1,
                                                     use_bias=False)
            #print "attn_logits1_projected shape: " + str(attn_logits1_projected.get_shape())
            # (batch_size, context_len)
            squeezed_attn_logits1 = tf.squeeze(attn_logits1_projected,
                                               axis=[2])
            #print "squeezed_attn_logits1 shape: " + str(squeezed_attn_logits1.get_shape())
            # (batch_size, context_len)
            masked_logits1, prob_dist = masked_softmax(squeezed_attn_logits1,
                                                       contexts_mask, 1)

            return rQ, masked_logits1, prob_dist
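
A hedged sketch of how the masked start logits returned above are typically turned into a training loss; the call site, the name start_logits (the second returned value), and the ans_start_labels placeholder holding gold start indices are assumptions, not part of the original example:

import tensorflow as tf

# `pointer_layer`, `questions`, `questions_mask`, `contexts`, `contexts_mask`
# are assumed to exist in the surrounding model.
rQ, start_logits, start_dist = pointer_layer.build_graph(
    questions, questions_mask, contexts, contexts_mask)
ans_start_labels = tf.placeholder(tf.int32, shape=[None])  # assumed gold starts
loss_start = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=start_logits, labels=ans_start_labels))
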
Example #58
0
def bidirectional_rnn(cell_fw,
                      cell_bw,
                      inputs,
                      initial_state_fw=None,
                      initial_state_bw=None,
                      dtype=None,
                      sequence_length=None,
                      scope=None):
    """Creates a bidirectional recurrent neural network.

  Similar to the unidirectional case above (rnn) but takes input and builds
  independent forward and backward RNNs with the final forward and backward
  outputs depth-concatenated, such that the output will have the format
  [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of
  forward and backward cell must match. The initial state for both directions
  is zero by default (but can be set optionally) and no intermediate states are
  ever returned -- the network is fully unrolled for the given (passed in)
  length(s) of the sequence(s) or completely unrolled if length(s) is not given.

  Args:
    cell_fw: An instance of RNNCell, to be used for forward direction.
    cell_bw: An instance of RNNCell, to be used for backward direction.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, cell.input_size].
    initial_state_fw: (optional) An initial state for the forward RNN.
      This must be a tensor of appropriate type and shape
      [batch_size x cell.state_size].
    initial_state_bw: (optional) Same as for initial_state_fw.
    dtype: (optional) The data type for the initial state.  Required if either
      of the initial states are not provided.
    sequence_length: (optional) An int64 vector (tensor) of size [batch_size],
      containing the actual lengths for each of the sequences.
    scope: VariableScope for the created subgraph; defaults to "BiRNN"

  Returns:
    outputs: A length T list of outputs (one for each input), each of which is
      the depth-concatenation of the corresponding forward and backward
      outputs.

  Raises:
    TypeError: If "cell_fw" or "cell_bw" is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list.
  """

    if not isinstance(cell_fw, rnn_cell.RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, rnn_cell.RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    name = scope or "BiRNN"
    # Forward direction
    with vs.variable_scope(name + "_FW"):
        output_fw, _ = rnn(cell_fw, inputs, initial_state_fw, dtype)
    # Backward direction
    with vs.variable_scope(name + "_BW"):
        tmp, _ = rnn(cell_bw, _reverse_seq(inputs, sequence_length),
                     initial_state_bw, dtype)
    output_bw = _reverse_seq(tmp, sequence_length)
    # Concat each of the forward/backward outputs
    outputs = [
        array_ops.concat(1, [fw, bw]) for fw, bw in zip(output_fw, output_bw)
    ]

    return outputs
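
An illustrative call of bidirectional_rnn() above; the cell sizes, step count and feature size are assumptions, and whether tf.nn.rnn_cell.GRUCell satisfies the function's isinstance checks depends on the (old) TensorFlow version in use:

import tensorflow as tf

num_steps, input_dim, num_units = 20, 50, 128
cell_fw = tf.nn.rnn_cell.GRUCell(num_units)
cell_bw = tf.nn.rnn_cell.GRUCell(num_units)
inputs = [tf.placeholder(tf.float32, [None, input_dim]) for _ in range(num_steps)]
outputs = bidirectional_rnn(cell_fw, cell_bw, inputs, dtype=tf.float32)
# outputs: length-20 list, each tensor of shape [batch, 2 * num_units].
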
Example #59
0
    def _set_scope(self, scope=None):
        if self._scope is None:
            if not self._first_parent:
                first_parent = self._first_parent
            else:
                first_parent = self._first_parent()
            if first_parent is None:
                # If we were never added to another Network, or that Network has been
                # garbage collected before being called, then we're a top-level Network.
                self._finalize_name(
                    # Use False to make sure the value sticks and we don't inherit a
                    # parent if we're added to a network later.
                    parent_network=False)
            if scope is not None:
                raise ValueError(
                    "Networks may not be created with explicit scopes.")
            if first_parent:
                first_parent._set_scope()
                parent_scope = first_parent._scope
            else:
                parent_scope = self._default_parent_variable_scope
            with variable_scope.variable_scope(parent_scope) as parent_vs:
                expected_scope_name = parent_vs.name + "/" + self._name
                if expected_scope_name in self._variable_scope_counts_on_init:
                    raise ValueError((
                        "A Network named '%s' already exists (or a variable_scope was "
                        "created with this name). Names must be unique.") %
                                     (self._name, ))
                # Make sure variables with this prefix will be unique.
                with variable_scope.variable_scope(
                        None, use_resource=True,
                        default_name=self._name) as scope:
                    self._scope = scope
                    scope_name = scope.name
                    suffix_start = scope_name.rfind("/") + 1
                    # rfind is -1 if there is no slash in the string, in which case the
                    # suffix starts at the beginning of the string (there is no prefix).
                    scope_suffix = scope_name[suffix_start:]
                    scope_prefix = scope_name[:suffix_start]
                    if scope_suffix != self._name:
                        raise ValueError((
                            "A Network named '%s' already exists (or a variable_scope was "
                            "created with this name). Names must be unique.") %
                                         (self._name, ))
                    if (first_parent
                            and scope_prefix[:-1] != first_parent.scope_name):
                        raise ValueError((
                            "Network variable names must match a nesting of sub-Network "
                            "names. Expected prefix '%s' from parent network, but got "
                            "'%s' when attempting to create a variable_scope for Network "
                            "'%s'. Likely an explicit variable_scope was inserted into "
                            "the nesting.") % (first_parent.scope_name,
                                               scope_prefix[:-1], self._name))
                    elif not first_parent and scope_prefix:
                        # For the case when this Network is not nested inside any other
                        # Network, but is in a variable_scope. This Network's name takes on
                        # the full variable scope prefix.
                        self._name = scope_name

            for non_network_sublayer in self._non_network_sublayers:
                self._set_scope_for_nonnetwork_sublayer(non_network_sublayer)
Example #60
0
    def build_graph(self, values, values_mask, keys):
        """
        Keys attend to values.
        For each key, return an attention distribution and an attention output vector.

        Inputs:
          values: Tensor shape (batch_size, num_values, value_vec_size).
          values_mask: Tensor shape (batch_size, num_values).
            1s where there's real input, 0s where there's padding
          keys: Tensor shape (batch_size, num_keys, value_vec_size)

        Outputs:
          attn_dist: Tensor shape (batch_size, num_keys, num_values).
            For each key, the distribution should sum to 1,
            and should be 0 in the value locations that correspond to padding.
          output: Tensor shape (batch_size, num_keys, hidden_size).
            This is the attention output; the weighted sum of the values
            (using the attention distribution as weights).
        """
        with vs.variable_scope("BasicAttn"):

            if self.advanced_basic_attn:
                k = tf.layers.dense(keys,
                                    self.key_vec_size,
                                    activation=tf.nn.relu,
                                    use_bias=False,
                                    name="Wk")
                v = tf.layers.dense(values,
                                    self.value_vec_size,
                                    activation=tf.nn.relu,
                                    use_bias=False,
                                    name="Wv")
            else:
                k = keys
                v = values

            # Calculate attention distribution
            values_t = tf.transpose(
                v, perm=[0, 2, 1])  # (batch_size, value_vec_size, num_values)

            if self.advanced_basic_attn:
                attn_logits = tf.matmul(
                    k, values_t / np.sqrt(self.value_vec_size))
            else:
                attn_logits = tf.matmul(
                    k, values_t)  # shape (batch_size, num_keys, num_values)

            attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # shape (batch_size, 1, num_values)
            _, attn_dist = masked_softmax(
                attn_logits, attn_logits_mask, 2
            )  # shape (batch_size, num_keys, num_values). take softmax over values

            # Use attention distribution to take weighted sum of values
            output = tf.matmul(
                attn_dist,
                values)  # shape (batch_size, num_keys, value_vec_size)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

            return attn_dist, output
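
A minimal usage sketch for the BasicAttn layer above; the constructor signature, the keep probability, and the input tensors are assumptions based on how the attributes are used in build_graph():

import tensorflow as tf

# `keys`, `values` and `values_mask` are assumed tensors with the shapes
# documented in build_graph(); the constructor arguments are also assumptions.
attn_layer = BasicAttn(keep_prob=0.8, key_vec_size=200, value_vec_size=200)
attn_dist, attn_output = attn_layer.build_graph(values, values_mask, keys)
# A common next step is to concatenate the attention output onto the keys:
blended_reps = tf.concat([keys, attn_output], axis=2)
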