Example #1
 def _LossFunc():
   first_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
       array_ops.zeros([28, 100, 28]))
   second_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
       array_ops.zeros([28, 100, 100]))
   return (math_ops.reduce_sum(first_output) +
           math_ops.reduce_sum(second_output))
Example #2
  def testMultiRNNCellWithStateTuple(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        m_bad = array_ops.zeros([1, 4])
        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))

        # Test incorrectness of state
        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
          core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(2) for _ in range(2)],
              state_is_tuple=True)(x, m_bad)

        _, ml = core_rnn_cell_impl.MultiRNNCell(
            [core_rnn_cell_impl.GRUCell(2) for _ in range(2)],
            state_is_tuple=True)(x, m_good)

        sess.run([variables.global_variables_initializer()])
        res = sess.run(ml, {
            x.name: np.array([[1., 1.]]),
            m_good[0].name: np.array([[0.1, 0.1]]),
            m_good[1].name: np.array([[0.1, 0.1]])
        })

        # The numbers in results were not calculated; this is just a
        # smoke test.  However, these numbers should match those of
        # the test testMultiRNNCell.
        self.assertAllClose(res[0], [[0.175991, 0.175991]])
        self.assertAllClose(res[1], [[0.13248, 0.13248]])
Example #3
 def testCoupledInputForgetGateLSTMCell(self):
   with self.test_session() as sess:
     num_units = 2
     state_size = num_units * 2
     batch_size = 3
     input_size = 4
     expected_output = np.array(
         [[0.121753, 0.121753],
          [0.103349, 0.103349],
          [0.100178, 0.100178]],
         dtype=np.float32)
     expected_state = np.array(
         [[0.137523, 0.137523, 0.121753, 0.121753],
          [0.105450, 0.105450, 0.103349, 0.103349],
          [0.100742, 0.100742, 0.100178, 0.100178]],
         dtype=np.float32)
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([batch_size, input_size])
       m = array_ops.zeros([batch_size, state_size])
       output, state = rnn_cell.CoupledInputForgetGateLSTMCell(
           num_units=num_units, forget_bias=1.0)(x, m)
       sess.run([variables.global_variables_initializer()])
       res = sess.run([output, state], {
           x.name:
               np.array([[1., 1., 1., 1.],
                         [2., 2., 2., 2.],
                         [3., 3., 3., 3.]]),
           m.name:
               0.1 * np.ones((batch_size, state_size))
       })
       # This is a smoke test: Only making sure expected values didn't change.
       self.assertEqual(len(res), 2)
       self.assertAllClose(res[0], expected_output)
       self.assertAllClose(res[1], expected_state)
Example #4
  def testDifferentShapesGraph(self):
    # Tests that a single kernel instance presented with multiple input shapes
    # does not crash with graph execution.
    with ops.device("gpu:0"):
      layer = cudnn_rnn.CudnnGRU(1, 100)
      layer(array_ops.zeros([28, 100, 100]))

      def _Cond(index, accumulation):
        del accumulation  # unused
        return math_ops.less(index, 4)

      def _Body(index, accumulation):
        layer_input = accumulation[:, :, 10 * (1 + index % 2):]
        output, _ = layer(layer_input)
        return index + 1, accumulation + output

      original_input = array_ops.zeros([28, 100, 100])
      _, accumulation = control_flow_ops.while_loop(_Cond, _Body,
                                                    [0, original_input])
      grad, = gradients.gradients(
          math_ops.reduce_sum(accumulation), (original_input,))
    init_op = variables.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      accumulation_eval, grad_eval = sess.run((accumulation, grad))
      self.assertAllEqual([28, 100, 100], accumulation_eval.shape)
      self.assertAllEqual([28, 100, 100], grad_eval.shape)
Example #5
  def testBasicLSTMCellWithDropout(self):

    def _is_close(x, y, digits=4):
      # Use the absolute difference so the check is symmetric in x and y.
      return abs(x - y) < 10**(-digits)

    def _is_close_in(x, items, digits=4):
      for i in items:
        if _is_close(x, i, digits):
          return True
      return False

    keep_prob = 0.5
    c_high = 2.9998924946
    c_low = 0.999983298578
    h_low = 0.761552567265
    h_high = 0.995008519604
    num_units = 5
    allowed_low = [2, 3]

    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "other", initializer=init_ops.constant_initializer(1)):
        x = array_ops.zeros([1, 5])
        c = array_ops.zeros([1, 5])
        h = array_ops.zeros([1, 5])
        state = core_rnn_cell_impl.LSTMStateTuple(c, h)
        cell = rnn_cell.LayerNormBasicLSTMCell(
            num_units, layer_norm=False, dropout_keep_prob=keep_prob)

        g, s = cell(x, state)
        sess.run([variables.global_variables_initializer()])
        res = sess.run([g, s], {
            x.name: np.ones([1, 5]),
            c.name: np.ones([1, 5]),
            h.name: np.ones([1, 5]),
        })

        # Since the returned tensors are of size [1,n]
        # get the first component right now.
        actual_h = res[0][0]
        actual_state_c = res[1].c[0]
        actual_state_h = res[1].h[0]

        # For each item in `c` (the cell inner state) check that
        # it is equal to one of the allowed values `c_high` (not
        # dropped out) or `c_low` (dropped out) and verify that the
        # corresponding item in `h` (the cell activation) is coherent.
        # Count the dropped activations and check that their number is
        # coherent with the dropout probability.
        dropped_count = 0
        self.assertTrue((actual_h == actual_state_h).all())
        for citem, hitem in zip(actual_state_c, actual_state_h):
          self.assertTrue(_is_close_in(citem, [c_low, c_high]))
          if _is_close(citem, c_low):
            self.assertTrue(_is_close(hitem, h_low))
            dropped_count += 1
          elif _is_close(citem, c_high):
            self.assertTrue(_is_close(hitem, h_high))
        self.assertIn(dropped_count, allowed_low)
Example #6
  def zero_state(self, batch_size, dtype):
    """Return zero-filled state tensor(s).

    Args:
      batch_size: int, float, or unit Tensor representing the batch size.
      dtype: the data type to use for the state.

    Returns:
      If `state_size` is an int, then the return value is a `2-D` tensor of
      shape `[batch_size x state_size]` filled with zeros.

      If `state_size` is a nested list or tuple, then the return value is
      a nested list or tuple (of the same structure) of `2-D` tensors with
      the shapes `[batch_size x s]` for each s in `state_size`.
    """
    state_size = self.state_size
    if _is_sequence(state_size):
      state_size_flat = _unpacked_state(state_size)
      zeros_flat = [
          array_ops.zeros(array_ops.pack([batch_size, s]), dtype=dtype)
          for s in state_size_flat]
      for s, z in zip(state_size_flat, zeros_flat):
        z.set_shape([None, s])
      zeros = _packed_state(structure=state_size, state=zeros_flat)
    else:
      zeros = array_ops.zeros(
          array_ops.pack([batch_size, state_size]), dtype=dtype)
      zeros.set_shape([None, state_size])

    return zeros
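A minimal sketch (not part of the original example) of the same zero_state idea using the public TF 2.x API, assuming a nested state_size; tf.nest performs the flatten/repack steps that the private helpers above handle.

import tensorflow as tf

def zero_state_sketch(state_size, batch_size, dtype=tf.float32):
  # Flatten the (possibly nested) state_size, build a [batch_size, s] zeros
  # tensor for each flat entry, then restore the original nesting.
  flat_sizes = tf.nest.flatten(state_size)
  flat_zeros = [tf.zeros([batch_size, s], dtype=dtype) for s in flat_sizes]
  return tf.nest.pack_sequence_as(state_size, flat_zeros)

# Example: an LSTM-like state of two (c, h) pairs of width 8, batch size 4.
state = zero_state_sketch(((8, 8), (8, 8)), batch_size=4)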
Example #7
 def testDtype(self):
   with self.test_session():
     d = array_ops.fill([2, 3], 12., name="fill")
     self.assertEqual(d.get_shape(), [2, 3])
     # Test default type for both constant size and dynamic size
     z = array_ops.zeros([2, 3])
     self.assertEqual(z.dtype, dtypes_lib.float32)
     self.assertEqual([2, 3], z.get_shape())
     self.assertAllEqual(z.eval(), np.zeros([2, 3]))
     z = array_ops.zeros(array_ops.shape(d))
     self.assertEqual(z.dtype, dtypes_lib.float32)
     self.assertEqual([2, 3], z.get_shape())
     self.assertAllEqual(z.eval(), np.zeros([2, 3]))
     # Test explicit type control
     for dtype in [
         dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
         dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8,
         dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64,
         dtypes_lib.bool, dtypes_lib.string
     ]:
       z = array_ops.zeros([2, 3], dtype=dtype)
       self.assertEqual(z.dtype, dtype)
       self.assertEqual([2, 3], z.get_shape())
       z_value = z.eval()
       self.assertFalse(np.any(z_value))
       self.assertEqual((2, 3), z_value.shape)
       z = array_ops.zeros(array_ops.shape(d), dtype=dtype)
       self.assertEqual(z.dtype, dtype)
       self.assertEqual([2, 3], z.get_shape())
       z_value = z.eval()
       self.assertFalse(np.any(z_value))
       self.assertEqual((2, 3), z_value.shape)
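A short sketch of the same default-dtype behavior with the public API: tf.zeros defaults to float32 for both a constant shape and a dynamically computed one, and the dtype argument overrides it. The tensors below are illustrative.

import tensorflow as tf

d = tf.fill([2, 3], 12.0)
z_static = tf.zeros([2, 3])                # constant shape, float32 by default
z_dynamic = tf.zeros(tf.shape(d))          # dynamic shape, also float32
z_int = tf.zeros([2, 3], dtype=tf.int32)   # explicit dtype override
assert z_static.dtype == tf.float32
assert z_dynamic.dtype == tf.float32
assert z_int.dtype == tf.int32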
Example #8
  def _TestPostActivationBypassQuantized(self, is_training):
    graph = ops.Graph()
    with graph.as_default():
      batch_size, height, width, depth = 5, 128, 128, 3
      input1 = array_ops.zeros((batch_size, height, width, depth))
      input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
      conv = conv2d(
          input1,
          32, [5, 5],
          stride=2,
          padding='SAME',
          weights_initializer=self._WeightInit(0.09),
          activation_fn=array_ops.identity,
          scope='test/test')
      bypass_tensor = math_ops.add(conv, input2, name='test/add')
      _ = array_ops.identity(bypass_tensor, name='test/output')

      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)

      # Ensure that the bypass node is preceded and followed by
      # FakeQuantWithMinMaxVars operations.
      self.assertTrue('FakeQuantWithMinMaxVars' in
                      [c.type for c in bypass_tensor.consumers()])
      self.assertTrue('FakeQuantWithMinMaxVars' in
                      [i.op.type for i in bypass_tensor.op.inputs])
Example #9
 def testRank3InvalidShape2(self):
   indices = array_ops.zeros([2, 2, 1], dtypes.int32)
   updates = array_ops.zeros([2, 2], dtypes.int32)
   shape = np.array([2, 2, 2])
   with self.assertRaisesWithPredicateMatch(
       ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
     self.scatter_nd(indices, updates, shape)
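For contrast with the invalid shapes above, a minimal valid scatter_nd call (values chosen for illustration): the trailing dimensions of updates must match the inner dimensions of the output shape named in the expected error.

import tensorflow as tf

indices = tf.constant([[0], [2]])          # which rows of the output to write
updates = tf.constant([[1, 1], [2, 2]])    # one row of values per index
shape = tf.constant([4, 2])                # output shape
result = tf.scatter_nd(indices, updates, shape)
# result == [[1, 1], [0, 0], [2, 2], [0, 0]]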
Example #10
 def testBasicLSTMCellWithStateTuple(self):
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 2])
       m0 = array_ops.zeros([1, 4])
       m1 = array_ops.zeros([1, 4])
       cell = rnn_cell_impl.MultiRNNCell(
           [
               rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
               for _ in range(2)
           ],
           state_is_tuple=True)
       g, (out_m0, out_m1) = cell(x, (m0, m1))
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run([g, out_m0, out_m1], {
           x.name: np.array([[1., 1.]]),
           m0.name: 0.1 * np.ones([1, 4]),
           m1.name: 0.1 * np.ones([1, 4])
       })
       self.assertEqual(len(res), 3)
        # The numbers in results were not calculated; this is just a smoke test.
       # Note, however, these values should match the original
       # version having state_is_tuple=False.
       self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
       expected_mem0 = np.array(
           [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
       expected_mem1 = np.array(
           [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
       self.assertAllClose(res[1], expected_mem0)
       self.assertAllClose(res[2], expected_mem1)
Example #11
 def testLSTMCell(self):
   with self.test_session() as sess:
     num_units = 8
     num_proj = 6
     state_size = num_units + num_proj
     batch_size = 3
     input_size = 2
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([batch_size, input_size])
       m = array_ops.zeros([batch_size, state_size])
       cell = rnn_cell_impl.LSTMCell(
           num_units=num_units,
           num_proj=num_proj,
           forget_bias=1.0,
           state_is_tuple=False)
       output, state = cell(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run([output, state], {
           x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
           m.name: 0.1 * np.ones((batch_size, state_size))
       })
       self.assertEqual(len(res), 2)
        # The numbers in results were not calculated; this is mostly just a
       # smoke test.
       self.assertEqual(res[0].shape, (batch_size, num_proj))
       self.assertEqual(res[1].shape, (batch_size, state_size))
       # Different inputs so different outputs and states
       for i in range(1, batch_size):
         self.assertTrue(
             float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
         self.assertTrue(
             float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
Example #12
 def testGRUCell(self):
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 2])
       m = array_ops.zeros([1, 2])
       g, _ = rnn_cell_impl.GRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run(
           [g], {x.name: np.array([[1., 1.]]),
                 m.name: np.array([[0.1, 0.1]])})
       # Smoke test
       self.assertAllClose(res[0], [[0.175991, 0.175991]])
     with variable_scope.variable_scope(
         "other", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros(
           [1, 3])  # Test GRUCell with input_size != num_units.
       m = array_ops.zeros([1, 2])
       g, _ = rnn_cell_impl.GRUCell(2)(x, m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run(
           [g],
           {x.name: np.array([[1., 1., 1.]]),
            m.name: np.array([[0.1, 0.1]])})
       # Smoke test
       self.assertAllClose(res[0], [[0.156736, 0.156736]])
Example #13
  def testBasicLSTMCellStateTupleType(self):
    with self.test_session():
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        m0 = (array_ops.zeros([1, 2]),) * 2
        m1 = (array_ops.zeros([1, 2]),) * 2
        cell = rnn_cell_impl.MultiRNNCell(
            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
            state_is_tuple=True)
        self.assertTrue(isinstance(cell.state_size, tuple))
        self.assertTrue(
            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(
            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))

        # Pass in regular tuples
        _, (out_m0, out_m1) = cell(x, (m0, m1))
        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))

        # Pass in LSTMStateTuples
        variable_scope.get_variable_scope().reuse_variables()
        zero_state = cell.zero_state(1, dtypes.float32)
        self.assertTrue(isinstance(zero_state, tuple))
        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
        _, (out_m0, out_m1) = cell(x, zero_state)
        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
Example #14
  def testGridRNNEdgeCasesNoOutput(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          'root', initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)

        # This cell produces no output
        cell = grid_rnn_cell.GridRNNCell(
            num_units=2,
            num_dims=2,
            input_dims=0,
            output_dims=None,
            non_recurrent_dims=0,
            non_recurrent_fn=nn_ops.relu)
        g, s = cell(x, m)
        self.assertEqual(g, ())
        self.assertEqual(s[0].c.get_shape(), (1, 2))
        self.assertEqual(s[0].h.get_shape(), (1, 2))

        sess.run([variables.global_variables_initializer()])
        res_g, res_s = sess.run([g, s], {
            x: np.array([[1., 1.]]),
            m: ((np.array([[0.1, 0.1]]), np.array([[0.1, 0.1]])),)
        })
        self.assertEqual(res_g, ())
        self.assertEqual(res_s[0].c.shape, (1, 2))
        self.assertEqual(res_s[0].h.shape, (1, 2))
Example #15
  def testGrid2LSTMCellLegacy(self):
    """Test for legacy case (when state_is_tuple=False)."""
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          'root', initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 3])
        m = array_ops.zeros([1, 8])
        cell = grid_rnn_cell.Grid2LSTMCell(
            2, use_peepholes=True, state_is_tuple=False, output_is_tuple=False)
        self.assertEqual(cell.state_size, 8)

        g, s = cell(x, m)
        self.assertEqual(g.get_shape(), (1, 2))
        self.assertEqual(s.get_shape(), (1, 8))

        sess.run([variables.global_variables_initializer()])
        res = sess.run([g, s], {
            x: np.array([[1., 1., 1.]]),
            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
        })
        self.assertEqual(res[0].shape, (1, 2))
        self.assertEqual(res[1].shape, (1, 8))
        self.assertAllClose(res[0], [[0.95686918, 0.95686918]])
        self.assertAllClose(res[1], [[
            2.41515064, 2.41515064, 0.95686918, 0.95686918, 1.38917875,
            1.49043763, 0.83884692, 0.86036491
        ]])
Example #16
  def testGrid2LSTMCellTied(self):
    with self.test_session(use_gpu=False) as sess:
      with variable_scope.variable_scope(
          'root', initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 3])
        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
        cell = grid_rnn_cell.Grid2LSTMCell(2, tied=True, use_peepholes=True)
        self.assertEqual(cell.state_size, ((2, 2), (2, 2)))

        g, s = cell(x, m)
        self.assertEqual(g[0].get_shape(), (1, 2))
        self.assertEqual(s[0].c.get_shape(), (1, 2))
        self.assertEqual(s[0].h.get_shape(), (1, 2))
        self.assertEqual(s[1].c.get_shape(), (1, 2))
        self.assertEqual(s[1].h.get_shape(), (1, 2))

        sess.run([variables.global_variables_initializer()])
        res_g, res_s = sess.run([g, s], {
            x:
                np.array([[1., 1., 1.]]),
            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
        })
        self.assertEqual(res_g[0].shape, (1, 2))
        self.assertEqual(res_s[0].c.shape, (1, 2))
        self.assertEqual(res_s[0].h.shape, (1, 2))
        self.assertEqual(res_s[1].c.shape, (1, 2))
        self.assertEqual(res_s[1].h.shape, (1, 2))

        self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]])
        self.assertAllClose(
            res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]),
                    ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
Example #17
  def testGrid2BasicRNNCellTied(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          'root', initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([2, 2])
        m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2]))
        cell = grid_rnn_cell.Grid2BasicRNNCell(2, tied=True)
        self.assertEqual(cell.state_size, (2, 2))

        g, s = cell(x, m)
        self.assertEqual(g[0].get_shape(), (2, 2))
        self.assertEqual(s[0].get_shape(), (2, 2))
        self.assertEqual(s[1].get_shape(), (2, 2))

        sess.run([variables.global_variables_initializer()])
        res_g, res_s = sess.run([g, s], {
            x:
                np.array([[1., 1.], [2., 2.]]),
            m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1],
                                                              [0.2, 0.2]]))
        })
        self.assertEqual(res_g[0].shape, (2, 2))
        self.assertEqual(res_s[0].shape, (2, 2))
        self.assertEqual(res_s[1].shape, (2, 2))

        self.assertAllClose(res_g, ([[0.94685763, 0.94685763],
                                     [0.99480951, 0.99480951]],))
        self.assertAllClose(
            res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],
                    [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
Example #18
  def _possibly_broadcast_batch_shape(self, x):
    """Return 'x', possibly after broadcasting the leading dimensions."""
    # If we have no batch shape, our batch shape broadcasts with everything!
    if self._batch_shape_arg is None:
      return x

    # Static attempt:
    #   If we determine that no broadcast is necessary, pass x through
    #   If we need a broadcast, add to an array of zeros.
    #
    # special_shape is the shape that, when broadcast with x's shape, will give
    # the correct broadcast_shape.  Note that
    #   We have already verified the second to last dimension of self.shape
    #   matches x's shape in assert_compatible_matrix_dimensions.
    #   Also, the final dimension of 'x' can have any shape.
    #   Therefore, the final two dimensions of special_shape are 1's.
    special_shape = self.batch_shape.concatenate([1, 1])
    bshape = array_ops.broadcast_static_shape(x.get_shape(), special_shape)
    if special_shape.is_fully_defined():
      # bshape.is_fully_defined iff special_shape.is_fully_defined.
      if bshape == x.get_shape():
        return x
      # Use the built in broadcasting of addition.
      zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
      return x + zeros

    # Dynamic broadcast:
    #   Always add to an array of zeros, rather than using a "cond", since a
    #   cond would require copying data from GPU --> CPU.
    special_shape = array_ops.concat((self.batch_shape_dynamic(), [1, 1]), 0)
    zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
    return x + zeros
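The comments above describe broadcasting the leading batch dimensions by adding a zeros tensor whose trailing two dimensions are 1. A standalone sketch of that trick with the public API, using illustrative shapes:

import tensorflow as tf

x = tf.ones([2, 3])                                # operand without batch dims
batch_shape = tf.TensorShape([5])                  # desired leading batch shape
special_shape = batch_shape.concatenate([1, 1]).as_list()   # [5, 1, 1]
zeros = tf.zeros(special_shape)
x_broadcast = x + zeros                            # broadcasts to [5, 2, 3]
assert x_broadcast.shape.as_list() == [5, 2, 3]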
Example #19
 def testLSTMBlockCell(self):
   with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 2])
       m0 = array_ops.zeros([1, 2])
       m1 = array_ops.zeros([1, 2])
       m2 = array_ops.zeros([1, 2])
       m3 = array_ops.zeros([1, 2])
       g, ((out_m0, out_m1), (out_m2, out_m3)) = rnn_cell.MultiRNNCell(
           [lstm_ops.LSTMBlockCell(2)
            for _ in range(2)], state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
       sess.run([variables.global_variables_initializer()])
       res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
           x.name: np.array([[1., 1.]]),
           m0.name: 0.1 * np.ones([1, 2]),
           m1.name: 0.1 * np.ones([1, 2]),
           m2.name: 0.1 * np.ones([1, 2]),
           m3.name: 0.1 * np.ones([1, 2])
       })
       self.assertEqual(len(res), 5)
       self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
       # These numbers are from testBasicLSTMCell and only test c/h.
       self.assertAllClose(res[1], [[0.68967271, 0.68967271]])
       self.assertAllClose(res[2], [[0.44848421, 0.44848421]])
       self.assertAllClose(res[3], [[0.39897051, 0.39897051]])
       self.assertAllClose(res[4], [[0.24024698, 0.24024698]])
Example #20
  def zero_state(self, batch_size, dtype):
    """Return zero-filled state tensor(s).

    Args:
      batch_size: int, float, or unit Tensor representing the batch size.
      dtype: the data type to use for the state.

    Returns:
      If `state_size` is an int or TensorShape, then the return value is an
      `N-D` tensor of shape `[batch_size x state_size]` filled with zeros.

      If `state_size` is a nested list or tuple, then the return value is
      a nested list or tuple (of the same structure) of `2-D` tensors with
      the shapes `[batch_size x s]` for each s in `state_size`.
    """
    state_size = self.state_size
    if nest.is_sequence(state_size):
      state_size_flat = nest.flatten(state_size)
      zeros_flat = [
          array_ops.zeros(
              array_ops.pack(_state_size_with_prefix(s, prefix=[batch_size])),
              dtype=dtype)
          for s in state_size_flat]
      for s, z in zip(state_size_flat, zeros_flat):
        z.set_shape(_state_size_with_prefix(s, prefix=[None]))
      zeros = nest.pack_sequence_as(structure=state_size,
                                    flat_sequence=zeros_flat)
    else:
      zeros_size = _state_size_with_prefix(state_size, prefix=[batch_size])
      zeros = array_ops.zeros(array_ops.pack(zeros_size), dtype=dtype)
      zeros.set_shape(_state_size_with_prefix(state_size, prefix=[None]))

    return zeros
Example #21
  def testMultiplyInverseAgainstExplicit(self):
    with ops.Graph().as_default(), self.test_session() as sess:
      random_seed.set_random_seed(200)
      params = array_ops.zeros((2, 2, 2, 2))
      inputs = array_ops.zeros((2, 2, 2, 2))
      outputs = array_ops.zeros((2, 2, 2, 2))
      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1),
                                'SAME')
      block.register_additional_minibatch(inputs, outputs)
      grads = outputs**2
      damping = 0.  # This test is only valid without damping.
      block.instantiate_factors(([grads],), damping)

      sess.run(state_ops.assign(block._input_factor._cov, _make_psd(8)))
      sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2)))
      sess.run(block._input_factor.make_inverse_update_ops())
      sess.run(block._output_factor.make_inverse_update_ops())

      v_flat = np.arange(16, dtype=np.float32)
      vector = utils.column_to_tensors(params, array_ops.constant(v_flat))
      output = block.multiply_inverse(vector)
      output_flat = sess.run(utils.tensors_to_column(output)).ravel()

      full = sess.run(block.full_fisher_block())
      explicit = np.dot(np.linalg.inv(full + damping * np.eye(16)), v_flat)

      self.assertAllClose(output_flat, explicit)
Example #22
  def testBasicLSTMCellWithStateTuple(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        c0 = array_ops.zeros([1, 2])
        h0 = array_ops.zeros([1, 2])
        state0 = core_rnn_cell_impl.LSTMStateTuple(c0, h0)
        c1 = array_ops.zeros([1, 2])
        h1 = array_ops.zeros([1, 2])
        state1 = core_rnn_cell_impl.LSTMStateTuple(c1, h1)
        cell = core_rnn_cell_impl.MultiRNNCell(
            [rnn_cell.LayerNormBasicLSTMCell(2) for _ in range(2)])
        h, (s0, s1) = cell(x, (state0, state1))
        sess.run([variables.global_variables_initializer()])
        res = sess.run([h, s0, s1], {
            x.name: np.array([[1., 1.]]),
            c0.name: 0.1 * np.asarray([[0, 1]]),
            h0.name: 0.1 * np.asarray([[2, 3]]),
            c1.name: 0.1 * np.asarray([[4, 5]]),
            h1.name: 0.1 * np.asarray([[6, 7]]),
        })

        expected_h = np.array([[-0.38079708, 0.38079708]])
        expected_h0 = np.array([[-0.38079708, 0.38079708]])
        expected_c0 = np.array([[-1.0, 1.0]])
        expected_h1 = np.array([[-0.38079708, 0.38079708]])
        expected_c1 = np.array([[-1.0, 1.0]])

        self.assertEqual(len(res), 3)
        self.assertAllClose(res[0], expected_h, 1e-5)
        self.assertAllClose(res[1].c, expected_c0, 1e-5)
        self.assertAllClose(res[1].h, expected_h0, 1e-5)
        self.assertAllClose(res[2].c, expected_c1, 1e-5)
        self.assertAllClose(res[2].h, expected_h1, 1e-5)
Example #23
  def testBasicRNNCellNotTrainable(self):
    with self.test_session() as sess:

      def not_trainable_getter(getter, *args, **kwargs):
        kwargs["trainable"] = False
        return getter(*args, **kwargs)

      with variable_scope.variable_scope(
          "root",
          initializer=init_ops.constant_initializer(0.5),
          custom_getter=not_trainable_getter):
        x = array_ops.zeros([1, 2])
        m = array_ops.zeros([1, 2])
        cell = rnn_cell_impl.BasicRNNCell(2)
        g, _ = cell(x, m)
        self.assertFalse(cell.trainable_variables)
        self.assertEqual([
            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
        ], [v.name for v in cell.non_trainable_variables])
        sess.run([variables_lib.global_variables_initializer()])
        res = sess.run([g], {
            x.name: np.array([[1., 1.]]),
            m.name: np.array([[0.1, 0.1]])
        })
        self.assertEqual(res[0].shape, (1, 2))
Example #24
 def testBasicLSTMCell(self):
   for dtype in [dtypes.float16, dtypes.float32]:
     np_dtype = dtype.as_numpy_dtype
     with self.test_session(graph=ops.Graph()) as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2], dtype=dtype)
         m = array_ops.zeros([1, 8], dtype=dtype)
         cell = rnn_cell_impl.MultiRNNCell(
             [
                 rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                 for _ in range(2)
             ],
             state_is_tuple=False)
         self.assertEqual(cell.dtype, None)
         g, out_m = cell(x, m)
         # Layer infers the input type.
         self.assertEqual(cell.dtype, dtype.name)
         expected_variable_names = [
             "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
             rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
             rnn_cell_impl._BIAS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
             rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
             "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
             rnn_cell_impl._BIAS_VARIABLE_NAME
         ]
         self.assertEqual(expected_variable_names,
                          [v.name for v in cell.trainable_variables])
         self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run([g, out_m], {
             x.name: np.array([[1., 1.]]),
             m.name: 0.1 * np.ones([1, 8])
         })
         self.assertEqual(len(res), 2)
         variables = variables_lib.global_variables()
         self.assertEqual(expected_variable_names, [v.name for v in variables])
          # The numbers in results were not calculated; this is just a
         # smoke test.
         self.assertAllClose(res[0], np.array(
             [[0.240, 0.240]], dtype=np_dtype), 1e-2)
         expected_mem = np.array(
             [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
             dtype=np_dtype)
         self.assertAllClose(res[1], expected_mem, 1e-2)
       with variable_scope.variable_scope(
           "other", initializer=init_ops.constant_initializer(0.5)):
         # Test BasicLSTMCell with input_size != num_units.
         x = array_ops.zeros([1, 3], dtype=dtype)
         m = array_ops.zeros([1, 4], dtype=dtype)
         g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g, out_m], {
                 x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
                 m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
             })
         self.assertEqual(len(res), 2)
Example #25
  def testResidualWrapperWithSlice(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 5])
        m = array_ops.zeros([1, 3])
        base_cell = rnn_cell_impl.GRUCell(3)
        g, m_new = base_cell(x, m)
        variable_scope.get_variable_scope().reuse_variables()

        def residual_with_slice_fn(inp, out):
          inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
          return inp_sliced + out

        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
            base_cell, residual_with_slice_fn)(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
            [g, g_res, m_new, m_new_res], {
                x: np.array([[1., 1., 1., 1., 1.]]),
                m: np.array([[0.1, 0.1, 0.1]])
            })
        # Residual connections
        self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
        # States are left untouched
        self.assertAllClose(res_m_new, res_m_new_res)
Example #26
  def testClusterSpecPropagationThreeServers2Graphs(self):
    """Boots 3 servers, creates 2 sessions, ensures appropriate operations.

    We create 2 clusterspecs:
     1. server2 as the master, server1 as a worker
     2. server2 as the master, server3 as a worker

    We ensure that variables on the workers are independent.
    """
    server1 = server_lib.Server.create_local_server()
    server2 = server_lib.Server.create_local_server()
    server3 = server_lib.Server.create_local_server()
    cluster_def1 = cluster_pb2.ClusterDef()
    job1 = cluster_def1.job.add()
    job1.name = 'worker1'
    job1.tasks[0] = server2.target[len('grpc://'):]
    job1.tasks[1] = server1.target[len('grpc://'):]

    cluster_def2 = cluster_pb2.ClusterDef()
    job2 = cluster_def2.job.add()
    job2.name = 'worker2'
    job2.tasks[0] = server2.target[len('grpc://'):]
    job2.tasks[1] = server3.target[len('grpc://'):]

    config1 = config_pb2.ConfigProto(cluster_def=cluster_def1)
    config2 = config_pb2.ConfigProto(cluster_def=cluster_def2)

    with ops.Graph().as_default() as g1:
      with ops.device('/job:worker1/task:1'):
        var1 = variables.Variable(array_ops.zeros([2]), name='var1')
        update_op1 = state_ops.assign_add(
            var1, array_ops.ones([2]), name='var1_assign_add')
        init1 = variables.global_variables_initializer()

    with ops.Graph().as_default() as g2:
      with ops.device('/job:worker2/task:1'):
        var2 = variables.Variable(array_ops.zeros([2]), name='var2')
        update_op2 = state_ops.assign_add(
            var2, array_ops.ones([2]), name='var2_assign_add')
        init2 = variables.global_variables_initializer()

    sess1 = session.Session(server2.target, graph=g1, config=config1)
    sess2 = session.Session(server2.target, graph=g2, config=config2)

    init1.run(session=sess1)
    init2.run(session=sess2)

    expected_zeros = np.zeros([2])
    expected_ones = np.ones([2])

    self.assertAllEqual(expected_zeros, sess1.run(var1))
    self.assertAllEqual(expected_zeros, sess2.run(var2))

    self.assertAllEqual(expected_ones, sess1.run(update_op1))
    self.assertAllEqual(expected_ones, sess1.run(var1))
    self.assertAllEqual(expected_zeros, sess2.run(var2))
    self.assertAllEqual(expected_ones, sess2.run(update_op2))
    self.assertAllEqual(expected_ones + expected_ones, sess1.run(update_op1))
    self.assertAllEqual(expected_ones, sess2.run(var2))
    self.assertAllEqual(expected_ones + expected_ones, sess1.run(var1))
Example #27
 def _test_logits_helper(self, mode):
   """Tests that the expected logits are passed to mock head."""
   with ops.Graph().as_default():
     training_util.get_or_create_global_step()
     generator_inputs = {'x': array_ops.zeros([5, 4])}
     real_data = (None if mode == model_fn_lib.ModeKeys.PREDICT else
                  array_ops.zeros([5, 4]))
     generator_scope_name = 'generator'
     head = mock_head(self,
                      expected_generator_inputs=generator_inputs,
                      expected_real_data=real_data,
                      generator_scope_name=generator_scope_name)
     estimator_spec = estimator._gan_model_fn(
         features=generator_inputs,
         labels=real_data,
         mode=mode,
         generator_fn=generator_fn,
         discriminator_fn=discriminator_fn,
         generator_scope_name=generator_scope_name,
         head=head)
     with monitored_session.MonitoredTrainingSession(
         checkpoint_dir=self._model_dir) as sess:
       if mode == model_fn_lib.ModeKeys.TRAIN:
         sess.run(estimator_spec.train_op)
       elif mode == model_fn_lib.ModeKeys.EVAL:
         sess.run(estimator_spec.loss)
       elif mode == model_fn_lib.ModeKeys.PREDICT:
         sess.run(estimator_spec.predictions)
       else:
         self.fail('Invalid mode: {}'.format(mode))
Example #28
 def get_start_state(self):
   # State which matches the format we'll return later. Typically this will not
   # be used by the model directly, but the shapes and dtypes should match so
   # that the serving input_receiver_fn gets placeholder shapes correct.
   return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64),
           array_ops.zeros(
               [self.input_window_size, self.num_features], dtype=self.dtype))
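A hedged sketch of how a start-state template like the one above could drive placeholder creation for a serving input_receiver_fn; the template and shapes below are stand-ins, not the class's real values.

import tensorflow as tf

# Stand-in template mirroring the (times, values) structure returned above.
template = (tf.TensorSpec([10], tf.int64), tf.TensorSpec([10, 3], tf.float32))

with tf.Graph().as_default():
  # Build one placeholder per leaf, matching its dtype and shape.
  placeholders = tf.nest.map_structure(
      lambda spec: tf.compat.v1.placeholder(spec.dtype, spec.shape), template)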
Example #29
  def _matmul(self, x, adjoint=False, adjoint_arg=False):
    if self._assert_proper_shapes:
      x = linalg.adjoint(x) if adjoint_arg else x
      aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x)
      x = control_flow_ops.with_dependencies([aps], x)
    if self.is_square:
      # Note that adjoint has no effect since this matrix is self-adjoint.
      if adjoint_arg:
        output_shape = array_ops.concat([
            array_ops.shape(x)[:-2],
            [array_ops.shape(x)[-1], array_ops.shape(x)[-2]]], axis=0)
      else:
        output_shape = array_ops.shape(x)

      return self._possibly_broadcast_batch_shape(
          array_ops.zeros(shape=output_shape, dtype=x.dtype))

    x_shape = array_ops.shape(x)
    n = self._num_columns if adjoint else self._num_rows
    m = x_shape[-2] if adjoint_arg else x_shape[-1]

    output_shape = array_ops.concat([x_shape[:-2], [n, m]], axis=0)

    zeros = array_ops.zeros(shape=output_shape, dtype=x.dtype)
    return self._possibly_broadcast_batch_shape(zeros)
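The method above just returns zeros with the broadcast output shape. A small sketch of the same behavior through the public tf.linalg.LinearOperatorZeros interface (shapes chosen for illustration):

import tensorflow as tf

op = tf.linalg.LinearOperatorZeros(num_rows=3)   # acts like a 3x3 zero matrix
x = tf.ones([3, 2])
y = op.matmul(x)                                 # all zeros, shape [3, 2]
assert y.shape.as_list() == [3, 2]
assert float(tf.reduce_sum(tf.abs(y))) == 0.0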
Example #30
  def testBlockGRUToGRUCellSingleStep(self):
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
      batch_size = 4
      cell_size = 5
      input_size = 6

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        output = rnn_cell.GRUCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([output], {x: x_value, h: h_value})

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        output = gru_ops.GRUBlockCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([output], {x: x_value, h: h_value})

      self.assertEqual(len(block_res), len(basic_res))
      for block, basic in zip(block_res, basic_res):
        self.assertAllClose(block, basic)
Example #31
def attention_decoder(decoder_inputs,
                      initial_state,
                      encoder_states,
                      rel_scores,
                      cell,
                      initial_state_attention=False,
                      pointer_gen=True,
                      use_coverage=False,
                      prev_coverage=None):
    """
  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    encoder_states: 3D Tensor [batch_size x attn_length x attn_size].
    rel_scores: relevance scores for the encoder states (only referenced by the
      commented-out scaling of the attention scores below).
    cell: rnn_cell.RNNCell defining the cell function and size.
    initial_state_attention:
      Note that this attention decoder passes each decoder input through a
      linear layer with the previous step's context vector to get a modified
      version of the input. If initial_state_attention is False, on the first
      decoder step the "previous context vector" is just a zero vector. If
      initial_state_attention is True, we use initial_state to (re)calculate
      the previous step's context vector. We set this to False for train/eval
      mode (because we call attention_decoder once for all decoder steps) and
      True for decode mode (because we call attention_decoder once for each
      decoder step).
    pointer_gen: boolean. If True, calculate the generation probability p_gen
      for each decoder step.
    use_coverage: boolean. If True, use coverage mechanism.
    prev_coverage:
      If not None, a tensor with shape (batch_size, attn_length). The previous
      step's coverage vector. This is only not None in decode mode when using
      coverage.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of
      shape [batch_size x cell.output_size]. The output vectors.
    state: The final state of the decoder. A tensor of shape
      [batch_size x cell.state_size].
    attn_dists: A list containing tensors of shape (batch_size, attn_length).
      The attention distributions for each decoder step.
    p_gens: List of scalars. The values of p_gen for each decoder step. Empty
      list if pointer_gen=False.
    coverage: Coverage vector on the last step computed. None if
      use_coverage=False.
  """
    with variable_scope.variable_scope("attention_decoder") as scope:
        batch_size = encoder_states.get_shape(
        )[0].value  # if this line fails, it's because the batch size isn't defined
        attn_size = encoder_states.get_shape(
        )[2].value  # if this line fails, it's because the attention length isn't defined

        # Reshape encoder_states (need to insert a dim)
        encoder_states = tf.expand_dims(
            encoder_states,
            axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features
        W_h = variable_scope.get_variable(
            "W_h", [1, 1, attn_size, attention_vec_size])
        encoder_features = nn_ops.conv2d(
            encoder_states, W_h, [1, 1, 1, 1],
            "SAME")  # shape (batch_size,attn_length,1,attention_vec_size)

        # Get the weight vectors v and w_c (w_c is for coverage)
        v = variable_scope.get_variable("v", [attention_vec_size])
        if use_coverage:
            with variable_scope.variable_scope("coverage"):
                w_c = variable_scope.get_variable(
                    "w_c", [1, 1, 1, attention_vec_size])

        if prev_coverage is not None:  # for beam search mode with coverage
            # reshape from (batch_size, attn_length) to (batch_size, attn_len, 1, 1)
            prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3)

        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)

                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,attn_length)
                    #e = tf.multiply(e, rel_scores)
                    # Take softmax of e to get the attention distribution
                    attn_dist = nn_ops.softmax(
                        e)  # shape (batch_size, attn_length)

                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e
                    #e = tf.multiply(e, rel_scores)
                    # Take softmax of e to get the attention distribution
                    attn_dist = nn_ops.softmax(
                        e)  # shape (batch_size, attn_length)

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage

        outputs = []
        attn_dists = []
        p_gens = []
        state = initial_state
        coverage = prev_coverage  # initialize coverage to None or whatever was passed in
        context_vector = array_ops.zeros([batch_size, attn_size])
        context_vector.set_shape([
            None, attn_size
        ])  # Ensure the second shape of attention vectors is set.
        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that we can pass it through a linear layer with this step's input to get a modified version of the input
            context_vector, _, coverage = attention(
                initial_state, coverage
            )  # in decode mode, this is what updates the coverage vector
        for i, inp in enumerate(decoder_inputs):
            tf.logging.info("Adding attention_decoder timestep %i of %i", i,
                            len(decoder_inputs))
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Merge input and previous attentions into one vector x of the same size as inp
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            # Run the decoder RNN cell. cell_output = decoder state
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:  # always true in decode mode
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True
                ):  # you need this because you've already run the initial attention(...) call
                    context_vector, attn_dist, _ = attention(
                        state, coverage)  # don't allow coverage to update
            else:
                context_vector, attn_dist, coverage = attention(
                    state, coverage)
            attn_dists.append(attn_dist)

            # Calculate p_gen
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    p_gen = linear([context_vector, state.c, state.h, x], 1,
                                   True)  # a scalar
                    p_gen = tf.sigmoid(p_gen)
                    p_gens.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer
            # This is V[s_t, h*_t] + b in the paper
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            outputs.append(output)

        # If using coverage, reshape it
        if coverage is not None:
            coverage = array_ops.reshape(coverage, [batch_size, -1])

        return outputs, state, attn_dists, p_gens, coverage
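A minimal standalone sketch of the additive attention score described in the comments above, e = v^T tanh(W_h h_i + W_s s_t), written with plain broadcasting instead of the 1x1 convolution trick; all shapes and tensors here are made up for illustration.

import tensorflow as tf

batch_size, attn_length, attn_size = 2, 6, 8
encoder_features = tf.random.normal([batch_size, attn_length, attn_size])  # W_h h_i
decoder_features = tf.random.normal([batch_size, attn_size])               # W_s s_t
v = tf.random.normal([attn_size])

# Broadcast the decoder features over every encoder position, then reduce with v.
e = tf.reduce_sum(
    v * tf.tanh(encoder_features + decoder_features[:, None, :]), axis=-1)
attn_dist = tf.nn.softmax(e)  # shape (batch_size, attn_length)
# Attention-weighted sum over encoder positions (the projected features stand
# in here for encoder_states).
context_vector = tf.reduce_sum(
    attn_dist[:, :, None] * encoder_features, axis=1)  # (batch_size, attn_size)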
Example #32
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU tf.distribute.Strategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.abc.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN

  current_strategy = model._distribution_strategy
  iteration_value = min(steps_per_epoch,
                        current_strategy.extended.steps_per_run)
  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')

  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy,
                                out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for m in model._get_training_eval_metrics():
    tensor = m.result()
    initial_loop_values[m.name] = array_ops.zeros(tensor.shape, tensor.dtype)

  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  steps_to_run = ([current_strategy.extended.steps_per_run] *
                  (steps_per_epoch //
                   current_strategy.extended.steps_per_run))
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)
  target_steps = len(steps_to_run)

  callbacks._call_begin_hook(mode)

  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)

  for epoch in range(initial_epoch, epochs):
    dist_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step]
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        K.get_session().run(steps_per_run.assign(step_count))
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  model._successful_loop_finish = True
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
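The steps_to_run computation above just chunks steps_per_epoch into groups of steps_per_run plus one smaller remainder group. A minimal sketch of the same arithmetic in plain Python (the helper name is illustrative, not from the original code):

def chunk_steps(steps_per_epoch, steps_per_run):
    # Whole chunks of steps_per_run first, then one smaller remainder chunk.
    chunks = [steps_per_run] * (steps_per_epoch // steps_per_run)
    if steps_per_epoch % steps_per_run:
        chunks.append(steps_per_epoch % steps_per_run)
    return chunks

print(chunk_steps(10, 4))  # [4, 4, 2] -> target_steps == 3 host-side loop iterations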
Example #33
0
 def _prepare_local(self, var_device, var_dtype, apply_state):
     super(Adagrad, self)._prepare_local(var_device, var_dtype, apply_state)
     apply_state[(var_device, var_dtype)].update(
         dict(epsilon=ops.convert_to_tensor_v2(self.epsilon, var_dtype),
              neg_lr_t=-apply_state[(var_device, var_dtype)]['lr_t'],
              zero=array_ops.zeros((), dtype=dtypes.int64)))
Example #34
0
 def __call__(self, x):
     current_sum = array_ops.zeros([], dtype=dtypes.int64)
     for element in self.dataset:
         current_sum += x * element
     return current_sum
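A hedged, self-contained variant of the same pattern using the public tf.* API (the class and dataset below are illustrative assumptions, not part of the original example):

import tensorflow as tf

class DatasetSum(tf.Module):
    def __init__(self):
        super().__init__()
        self.dataset = tf.data.Dataset.range(5)  # elements 0..4, dtype int64

    @tf.function
    def __call__(self, x):
        current_sum = tf.zeros([], dtype=tf.int64)
        for element in self.dataset:
            current_sum += x * element
        return current_sum

print(DatasetSum()(tf.constant(3, dtype=tf.int64)).numpy())  # 3 * (0+1+2+3+4) = 30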
Example #35
0
 def func(x, dtype=None):
     if dtype:
         return array_ops.zeros(shape=x.shape, dtype=dtype)
     else:
         return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
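The helper above simply defaults the dtype to float32 when none is given. A small usage sketch of the same pattern against the public API (the function name is illustrative):

import tensorflow as tf

def zeros_like_shape(x, dtype=None):
    # Same behaviour as func() above: fall back to float32 when dtype is omitted.
    return tf.zeros(shape=x.shape, dtype=dtype if dtype else tf.float32)

x = tf.ones([2, 3], dtype=tf.int32)
print(zeros_like_shape(x).dtype)            # <dtype: 'float32'>
print(zeros_like_shape(x, tf.int64).dtype)  # <dtype: 'int64'>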
Example #36
0
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        inputs = []
        var_list = []
        for x in grads_and_vars:
            inputs.extend(list(x))

        with ops.device(global_step.device):
            self._local_steps = variables.Variable(array_ops.zeros(
                [self._total_num_replicas], dtype=global_step.dtype),
                                                   trainable=False,
                                                   name="local_steps")

        # Check staleness. Note that this has to be ref(), otherwise identity will
        # be accessed and it will be old values.
        local_step = array_ops.slice(self._local_steps.ref(),
                                     array_ops.reshape(self._replica_id,
                                                       (1, )), [1],
                                     name="get_local_step")
        local_step = array_ops.reshape(local_step, ())
        is_stale = math_ops.less(local_step, global_step)

        with ops.op_scope(inputs, None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                with ops.device(var.device):
                    if isinstance(grad, ops.Tensor):
                        gradient_queue = (data_flow_ops.FIFOQueue(
                            self._tokens_per_step * 2,
                            grad.dtype,
                            shapes=var.get_shape(),
                            shared_name=var.name))
                        self._one_element_queue_list.append(
                            (gradient_queue, var.device))
                        train_ops.append(gradient_queue.enqueue([grad]))

                        # Aggregate all gradients
                        gradients = gradient_queue.dequeue_many(
                            self._replicas_to_aggregate)
                        aggregated_grad.append(
                            math_ops.reduce_sum(gradients, [0]))
                    elif grad is None:
                        aggregated_grad.append(None)  # pass-through.
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        aggregated_grad.append(
                            self._aggregate_sparse_grad(grad, var, train_ops))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)

            # Create token queue.
            with ops.device(global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                    -1,
                    global_step.dtype.base_dtype,
                    shapes=(),
                    shared_name="sync_token_q"))
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner. Don't use the real
                # queues because the queue runner doesn't automatically reopen
                # queues on PS devices once they have been closed.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))
            # Clear all the gradients queues in case there are stale gradients.
            clear_queue_ops = []
            with ops.control_dependencies([update_op]):
                for queue, dev in self._one_element_queue_list:
                    with ops.device(dev):
                        stale_grads = queue.dequeue_many(queue.size())
                        clear_queue_ops.append(stale_grads)

                for queue, dev in self._sparse_grad_queues_and_devs:
                    with ops.device(dev):
                        _, stale_indices = queue.dequeue_many(queue.size())
                        clear_queue_ops.append(stale_indices)

            with ops.device(global_step.device):
                self._clean_up_op = control_flow_ops.abort(
                    error_msg="From sync_replicas")

            # According to the staleness, select between the enqueue op (real_grad)
            # or no-op (no_op_grad). Effectively dropping all the stale gradients.
            no_op_grad = lambda: [
                control_flow_ops.no_op(name="no_grad_enqueue")
            ]
            real_grad = lambda: [control_flow_ops.group(*train_ops)]
            final_train_ops = control_flow_ops.cond(is_stale, no_op_grad,
                                                    real_grad)

            with ops.device(global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies([final_train_ops]):
                    token = sync_token_queue.dequeue()
                    train_op = state_ops.scatter_update(
                        self._local_steps, self._replica_id, token)

                with ops.control_dependencies(clear_queue_ops):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    # Note that ref() is used to avoid reading the old value
                    # from the identity op.
                    tokens = array_ops.fill([self._tokens_per_step],
                                            global_step.ref())
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                if self._variable_averages is not None:
                    with ops.control_dependencies([sync_op
                                                   ]), ops.name_scope(""):
                        sync_op = self._variable_averages.apply(
                            self._variables_to_average)

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
                self._gradients_applied = True
                return train_op
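For context, the class above mirrors the stock sync-replicas optimizer. A sketch of how such an optimizer is typically wired up in graph mode, based on the public tf.compat.v1.train.SyncReplicasOptimizer API (shown as comments because it needs a full distributed setup to run):

# opt = tf.compat.v1.train.SyncReplicasOptimizer(
#     tf.compat.v1.train.GradientDescentOptimizer(0.1),
#     replicas_to_aggregate=4, total_num_replicas=4)
# grads_and_vars = opt.compute_gradients(loss)
# train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
# # The chief additionally runs opt.get_chief_queue_runner() and
# # opt.get_init_tokens_op() so replicas can dequeue a sync token each step.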
Example #37
0
 def testClipByNormGradientZeros(self):
     with self.session(use_gpu=True):
         x = array_ops.zeros([3])
         b = clip_ops.clip_by_norm(x, 1.)
         grad, = gradients_impl.gradients(b, x)
         self.assertAllEqual(grad.eval(), [1., 1., 1.])
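The same check can be reproduced eagerly. A minimal sketch assuming TF 2.x: clip_by_norm acts as the identity here because ||x|| = 0 < 1, so the gradient of the summed output is all ones.

import tensorflow as tf

x = tf.zeros([3])
with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.clip_by_norm(x, 1.0)
grad = tape.gradient(y, x)  # gradient of sum(y) with respect to x
print(grad.numpy())         # [1. 1. 1.]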
Example #38
0
def _SignGrad(op, _):
    """Returns 0."""
    x = op.inputs[0]
    return array_ops.zeros(array_ops.shape(x), dtype=x.dtype)
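Since tf.sign is piecewise constant, the registered gradient above is zero everywhere. A quick eager check, assuming TF 2.x:

import tensorflow as tf

x = tf.constant([-2.0, 0.0, 3.0])
with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.sign(x)
print(tape.gradient(y, x).numpy())  # [0. 0. 0.]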
Example #39
0
    def __init__(self,
                 loc,
                 scale,
                 skewness=None,
                 tailweight=None,
                 distribution=None,
                 validate_args=False,
                 allow_nan_stats=True,
                 name="SinhArcsinh"):
        """Construct SinhArcsinh distribution on `(-inf, inf)`.

    Arguments `(loc, scale, skewness, tailweight)` must have broadcastable shape
    (indexing batch dimensions).  They must all have the same `dtype`.

    Args:
      loc: Floating-point `Tensor`.
      scale:  `Tensor` of same `dtype` as `loc`.
      skewness:  Skewness parameter.  Default is `0.0` (no skew).
      tailweight:  Tailweight parameter. Default is `1.0` (unchanged tailweight)
      distribution: `tf.Distribution`-like instance. Distribution that is
        transformed to produce this distribution.
        Default is `tfp.distributions.Normal(0., 1.)`.
        Must be a scalar-batch, scalar-event distribution.  Typically
        `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
        a function of non-trainable parameters. WARNING: If you backprop through
        a `SinhArcsinh` sample and `distribution` is not
        `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
        the gradient will be incorrect!
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.
    """
        parameters = dict(locals())

        with ops.name_scope(name, values=[loc, scale, skewness,
                                          tailweight]) as name:
            loc = ops.convert_to_tensor(loc, name="loc")
            dtype = loc.dtype
            scale = ops.convert_to_tensor(scale, name="scale", dtype=dtype)
            tailweight = 1. if tailweight is None else tailweight
            has_default_skewness = skewness is None
            skewness = 0. if skewness is None else skewness
            tailweight = ops.convert_to_tensor(tailweight,
                                               name="tailweight",
                                               dtype=dtype)
            skewness = ops.convert_to_tensor(skewness,
                                             name="skewness",
                                             dtype=dtype)

            batch_shape = distribution_util.get_broadcast_shape(
                loc, scale, tailweight, skewness)

            # Recall, with Z a random variable,
            #   Y := loc + C * F(Z),
            #   F(Z) := Sinh( (Arcsinh(Z) + skewness) * tailweight )
            #   F_0(Z) := Sinh( Arcsinh(Z) * tailweight )
            #   C := 2 * scale / F_0(2)
            if distribution is None:
                distribution = normal.Normal(loc=array_ops.zeros([],
                                                                 dtype=dtype),
                                             scale=array_ops.ones([],
                                                                  dtype=dtype),
                                             allow_nan_stats=allow_nan_stats)
            else:
                asserts = distribution_util.maybe_check_scalar_distribution(
                    distribution, dtype, validate_args)
                if asserts:
                    loc = control_flow_ops.with_dependencies(asserts, loc)

            # Make the SAS bijector, 'F'.
            f = bijectors.SinhArcsinh(skewness=skewness, tailweight=tailweight)
            if has_default_skewness:
                f_noskew = f
            else:
                f_noskew = bijectors.SinhArcsinh(
                    skewness=skewness.dtype.as_numpy_dtype(0.),
                    tailweight=tailweight)

            # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2))
            c = 2 * scale / f_noskew.forward(
                ops.convert_to_tensor(2, dtype=dtype))
            affine = bijectors.AffineScalar(shift=loc,
                                            scale=c,
                                            validate_args=validate_args)

            bijector = bijectors.Chain([affine, f])

            super(SinhArcsinh, self).__init__(distribution=distribution,
                                              bijector=bijector,
                                              batch_shape=batch_shape,
                                              validate_args=validate_args,
                                              name=name)
        self._parameters = parameters
        self._loc = loc
        self._scale = scale
        self._tailweight = tailweight
        self._skewness = skewness
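The transformation described in the comments above, Y = loc + C * F(Z) with C = 2 * scale / F_0(2), is easy to check numerically. A NumPy sketch with illustrative parameter values:

import numpy as np

loc, scale, skewness, tailweight = 0.0, 1.5, 0.5, 1.2
f = lambda z: np.sinh((np.arcsinh(z) + skewness) * tailweight)   # F(Z)
f0 = lambda z: np.sinh(np.arcsinh(z) * tailweight)               # F_0(Z), no skew
c = 2.0 * scale / f0(2.0)                                        # C
z = np.random.default_rng(0).standard_normal(10000)
y = loc + c * f(z)
print(y.mean(), y.std())  # skewed, heavier-tailed samples built from standard normals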
Example #40
0
  def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass,
                               has_scaling, fused_batch_norm):
    """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*.

    Args:
      relu: Callable that returns an Operation, a factory method for the Relu*.
      relu_op_name: String, name of the Relu* operation.
      with_bypass: Bool, when true there is an extra connection added from
        inputs to just before Relu*.
      has_scaling: Bool, when true the batch norm has scaling.
      fused_batch_norm: Bool, when true the batch norm is fused.
    """
    g = ops.Graph()
    with g.as_default():
      batch_size, height, width = 5, 128, 128
      inputs = array_ops.zeros((batch_size, height, width, 3))
      stride = 1 if with_bypass else 2
      activation_fn = None if with_bypass else relu
      scope = 'test/test2' if with_bypass else 'test'
      node = separable_conv2d(
          inputs,
          None, [5, 5],
          stride=stride,
          depth_multiplier=1.0,
          padding='SAME',
          weights_initializer=self._WeightInit(0.09),
          activation_fn=activation_fn,
          normalizer_fn=batch_norm,
          normalizer_params=self._BatchNormParams(
              scale=has_scaling, fused=fused_batch_norm),
          scope=scope)
      if with_bypass:
        node = math_ops.add(inputs, node, name='test/Add')
        relu(node, name='test/' + relu_op_name)

      fold_batch_norms.FoldBatchNorms(g)

    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
    self.assertEqual(folded_mul.type, 'Mul')
    if fused_batch_norm:
      scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape'
    else:
      scale_reshape_op_name = scope + '/scale_reshape'
    self._AssertInputOpsAre(folded_mul,
                            [scope + '/depthwise_weights/read',
                             scale_reshape_op_name])
    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])

    scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
    self.assertEqual(scale_reshape.type, 'Reshape')
    self._AssertInputOpsAre(scale_reshape, [
        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm),
        scale_reshape_op_name + '/shape'
    ])
    self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold'])

    folded_conv = g.get_operation_by_name(scope + '/depthwise_Fold')
    self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
    self._AssertInputOpsAre(folded_conv,
                            [scope + '/mul_fold', inputs.op.name])
    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])

    folded_add = g.get_operation_by_name(scope + '/add_fold')
    self.assertEqual(folded_add.type, 'Add')
    self._AssertInputOpsAre(folded_add, [
        scope + '/depthwise_Fold',
        self._BathNormBiasName(scope, fused_batch_norm)
    ])
    output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
    self._AssertOutputGoesToOps(folded_add, g, output_op_names)
Example #41
0
 def testSmokeScatterNdBatch2DSliceDim3ShapeRank7(self):
     with self.test_session():
         indices = array_ops.zeros([1, 2, 3], dtype=dtypes.int32)
         values = array_ops.zeros([1, 2, 6, 7, 8, 9])
         shape = [3, 4, 5, 6, 7, 8, 9]
         self.scatter_nd(indices, values, shape).eval()
Example #42
0
 def _ResizeBilinearGrad(op, grads):
     return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]),
                             dtype=op.inputs[0].dtype),
             tf.raw_ops.ResizeBilinearGrad(grads=grads,
                                           original_image=op.inputs[1]))
Example #43
0
    def testLSTMBasicToBlockCellPeeping(self):
        with self.test_session(use_gpu=True) as sess:
            x = array_ops.zeros([1, 2])
            x_values = np.random.randn(1, 2)

            m0_val = 0.1 * np.ones([1, 2])
            m1_val = -0.1 * np.ones([1, 2])
            m2_val = -0.2 * np.ones([1, 2])
            m3_val = 0.2 * np.ones([1, 2])

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                m0 = array_ops.zeros([1, 2])
                m1 = array_ops.zeros([1, 2])
                m2 = array_ops.zeros([1, 2])
                m3 = array_ops.zeros([1, 2])
                g, ((out_m0, out_m1),
                    (out_m2, out_m3)) = rnn_cell.MultiRNNCell(
                        [
                            rnn_cell.LSTMCell(
                                2, use_peepholes=True, state_is_tuple=True)
                            for _ in range(2)
                        ],
                        state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
                sess.run([variables.global_variables_initializer()])
                basic_res = sess.run(
                    [g, out_m0, out_m1, out_m2, out_m3], {
                        x.name: x_values,
                        m0.name: m0_val,
                        m1.name: m1_val,
                        m2.name: m2_val,
                        m3.name: m3_val
                    })

            with variable_scope.variable_scope("block",
                                               initializer=initializer):
                m0 = array_ops.zeros([1, 2])
                m1 = array_ops.zeros([1, 2])
                m2 = array_ops.zeros([1, 2])
                m3 = array_ops.zeros([1, 2])
                g, ((out_m0, out_m1),
                    (out_m2, out_m3)) = rnn_cell.MultiRNNCell(
                        [
                            lstm_ops.LSTMBlockCell(2, use_peephole=True)
                            for _ in range(2)
                        ],
                        state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
                sess.run([variables.global_variables_initializer()])
                block_res = sess.run(
                    [g, out_m0, out_m1, out_m2, out_m3], {
                        x.name: x_values,
                        m0.name: m0_val,
                        m1.name: m1_val,
                        m2.name: m2_val,
                        m3.name: m3_val
                    })

            self.assertEqual(len(basic_res), len(block_res))
            for basic, block in zip(basic_res, block_res):
                self.assertAllClose(basic, block)
Example #44
0
 def scatter_nd(self, indices, updates, shape, input_=None):
     input_ = (input_ if input_ is not None else array_ops.zeros(
         shape, dtype=updates.dtype))
     return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
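A self-contained run of the same scatter pattern using the public API; tf.tensor_scatter_nd_add is used here as a stand-in for the internal non-aliasing add, and TF 2.x eager execution is assumed:

import tensorflow as tf

indices = tf.constant([[0], [2]])
updates = tf.constant([[1., 1., 1.], [2., 2., 2.]])
base = tf.zeros([4, 3])                     # plays the role of the zero `input_` above
out = tf.tensor_scatter_nd_add(base, indices, updates)
print(out.numpy())                          # rows 0 and 2 hold the updates, the rest stay zero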
Example #45
0
 def testZeroLengthDim(self):
     x = array_ops.zeros(shape=(0, 1, 2))
     y = self.evaluate(array_ops.unstack(x, axis=1)[0])
     self.assertEqual(y.shape, (0, 2))
Example #46
0
 def testSmokeScatterNdBatch1DSliceDim2(self):
     with self.test_session():
         indices = array_ops.zeros([0, 2], dtype=dtypes.int32)
         values = array_ops.zeros([0, 7])
         shape = [4, 6, 7]
         self.scatter_nd(indices, values, shape).eval()
Example #47
0
def attention_decoder(decoder_inputs, initial_state, encoder_states, enc_padding_mask, cell,
                      initial_state_attention=False):
    with variable_scope.variable_scope("attention_decoder") as scope:
        # if this line fails, it's because the batch size isn't defined
        #对encoder_state 进行split分成对应每个batch_size 下的每一行的
        batch_size = encoder_states.get_shape()[0].value

        # if this line fails, it's because the attention length isn't defined
        
        attn_size = encoder_states.get_shape()[2].value
        print(attn_size)
        #这里不清数为什么做expend_dim
        # shape (batch_size, attn_len, 1, attn_size)
        encoder_states = tf.expand_dims(encoder_states, axis=2)
        attention_vec_size = attn_size
        #此处是需要测试出来W_h的具体形状和数值
        #W_h(1,1,400,400)
        W_h = variable_scope.get_variable("W_h", [1, 1, attn_size, attention_vec_size])

        # shape (batch_size,attn_length,1,attention_vec_size)
        #此处自己做了实验百分之百不是仅仅的做维度变化,做了数值变化
        #代码见:
        #不明白为什么要做卷积变换
        #此处对encoder_state进行变形,此处变形用的是卷积,用encoder_state对一个W_h [shape,shape],做卷积变形。此处数值应该出现变化,而不是单单的矩阵变形。
        encoder_features = nn_ops.conv2d(encoder_states, W_h, [1, 1, 1, 1], "SAME")
        
        #此处初始化赋值的疑惑:tf.layers.conv2d中的权重初始化参数默认为None,但是就算不给参数也可以正常训练,
        #经过查看源码发现tf.layers.conv2d是从tf.keras.layer.Conv继承来的,在父类中对初始化进行了定义,
        #kernel_initializer='glorot_uniform'对卷积核参数进行均匀初始化

        #f(x)*g(t-x)在此处做卷积变换,疑惑是声明变量的数值是多少
        # Get the weight vectors v and w_c (w_c is for coverage)
        #此处的V是直接定义的一个矩阵。但是没有进行权重初始化
        #标准的写法不是这种,是利用生成一个神经网络层进行初始化,但是此处应该雇佣纠结初始化数值问题,用的nn_ops库中的矩阵在initializer的时候能够自动初始化。相当于同tf例子中的作用一样,不一样的函数表达式
        v = variable_scope.get_variable("v", [attention_vec_size])
        #此处是使用一个BahdanauAttention算法来分配注意力
        def attention(decoder_state):
            with variable_scope.variable_scope("attention"):

                # Pass the decoder state through a linear layer,
                # shape (batch_size, attention_vec_size), e.g. [32, 200] for a
                # batch of 32. The helper's signature is
                # linear(args, output_size, bias, bias_start=0.0, scope=None);
                # apart from a list/tuple check it passes a well-formed
                # decoder_state straight into W * decoder_state + bias.
                decoder_features = linear(decoder_state, attention_vec_size, True)

                # Reshape to (batch_size, 1, 1, attention_vec_size) so it can be
                # broadcast against encoder_features below.
                decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_dist = nn_ops.softmax(e)  # take softmax. shape (batch_size, attn_length)
                    
                    attn_dist *= enc_padding_mask  # apply mask
                    masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
                    return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize


                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) to get the scores.
                # score = FC(tanh(FC(EO) + FC(H)))
                # FC = fully connected (dense) layer
                # EO = encoder output
                # H  = hidden state
                # X  = input to the decoder
                # e  = score; v has length attention_vec_size (e.g. 400)
                e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features), [2, 3])
                # Calculate attention distribution
                attn_dist = masked_attention(e)

                # Calculate the context vector from attn_dist and encoder_states:
                # reshape attn_dist to (batch_size, attn_len, 1, 1), multiply with
                # encoder_states, and sum over the attention length.
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states,
                    [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector, [-1, attn_size])

            return context_vector, attn_dist

        outputs = []
        attn_dists = []
        state = initial_state
        context_vector = array_ops.zeros([batch_size, attn_size])

        # Ensure the second shape of attention vectors is set.
        context_vector.set_shape([None, attn_size])

        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step 
            # so that we can pass it through a linear layer with 
            # this step's input to get a modified version of the input
            context_vector, _ = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Merge input and previous attentions into one vector x of the same size as inp
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:  # always true in decode mode
                with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                                   reuse=True):
                    context_vector, attn_dist = attention(state)
            else:
                context_vector, attn_dist = attention(state)
            attn_dists.append(attn_dist)

            # Concatenate the cell_output (= decoder state) and the context vector, 
            # and pass them through a linear layer
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector], cell.output_size, True)
            outputs.append(output)

        return outputs, state, attn_dists
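The masked_attention step above (softmax, mask, re-normalize) can be traced with plain NumPy. A small numeric sketch with illustrative values:

import numpy as np

e = np.array([[2.0, 1.0, 0.5, 0.1]])                     # raw scores, shape (batch, attn_len)
mask = np.array([[1.0, 1.0, 0.0, 0.0]])                  # enc_padding_mask: last two are padding
attn = np.exp(e) / np.exp(e).sum(axis=1, keepdims=True)  # softmax over attn_len
attn *= mask                                             # zero out padded positions
attn /= attn.sum(axis=1, keepdims=True)                  # re-normalize so each row sums to 1
print(attn.round(3))                                     # [[0.731 0.269 0.    0.   ]]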
Example #48
0
def attention_single_output_decoder(initial_state,
                                    attention_states,
                                    output_size=None,
                                    num_heads=1,
                                    dtype=dtypes.float32,
                                    scope=None,
                                    sequence_length=array_ops.ones([16]),
                                    initial_state_attention=True,
                                    use_attention=False):

  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())

  with variable_scope.variable_scope(scope or "decoder_single_output"):
#    print (initial_state.eval().shape)
    batch_size = array_ops.shape(initial_state)[0]  # Needed for reshaping.
#    print (attention_states.get_shape())
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

#     state = initial_state

    def attention(query, use_attention=False):
      """Put attention masks on hidden using hidden_features and query."""
      attn_weights = []
      ds = []  # Results of attention reads will be stored here.
      for i in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % i):
          y = rnn_cell._linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
          if use_attention is False: # apply mean pooling
              weights = tf.tile(sequence_length, tf.stack([attn_length]))
              weights = array_ops.reshape(weights, tf.shape(s))
              a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
              # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
          else:
            a = nn_ops.softmax(s)
          attn_weights.append(a)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return attn_weights, ds

    batch_attn_size = array_ops.stack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attn_weights, attns = attention(initial_state, use_attention=use_attention)

    #with variable_scope.variable_scope(scope or "Linear"):
    matrix = variable_scope.get_variable("Out_Matrix", [attn_size, output_size])
    res = math_ops.matmul(attns[0], matrix) # NOTE: here we temporarily assume num_head = 1
    bias_start = 0.0
    bias_term = variable_scope.get_variable("Out_Bias", [output_size],
                                              initializer=init_ops.constant_initializer(bias_start))
    output = res + bias_term
  return attention_states, attn_weights[0], attns[0], [output] # NOTE: here we temporarily assume num_head = 1
Example #49
0
 def _build():
     return (array_ops.ones([2, 2], dtype=dtype),
             array_ops.zeros([3, 3], dtype=dtype))
Example #50
0
def conjugate_gradient(operator,
                       rhs,
                       preconditioner=None,
                       x=None,
                       tol=1e-4,
                       max_iter=20,
                       name="conjugate_gradient"):
    r"""Conjugate gradient solver.

  Solves a linear system of equations `A*x = rhs` for self-adjoint, positive
  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
  matrix-free algorithm where the action of the matrix A is represented by
  `operator`. The iteration terminates when either the number of iterations
  exceeds `max_iter` or when the residual norm has been reduced to `tol`
  times its initial value, i.e. \\(||rhs - A x_k|| <= tol ||rhs||\\).

  Args:
    operator: An object representing a linear operator with attributes:
      - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of
        length 2. `shape[0]` is the dimension of the domain of the operator,
        `shape[1]` is the dimension of the co-domain of the operator. In other
        words, if operator represents an N x N matrix A, `shape` must contain
        `[N, N]`.
      - dtype: The datatype of input to and output from `apply`.
      - apply: Callable object taking a vector `x` as input and returning a
        vector with the result of applying the operator to `x`, i.e. if
       `operator` represents matrix `A`, `apply` should return `A * x`.
    rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side vector.
    preconditioner: An object representing a linear operator, see `operator`
      for detail. The preconditioner should approximate the inverse of `A`.
      An efficient preconditioner could dramatically improve the rate of
      convergence. If `preconditioner` represents matrix `M` (`M` approximates
      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
      much lower than computing `A^{-1}` directly.
    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
      solution.
    tol: A float scalar convergence tolerance.
    max_iter: An integer giving the maximum number of iterations.
    name: A name scope for the operation.

  Returns:
    output: A namedtuple representing the final state with fields:
      - i: A scalar `int32` `Tensor`. Number of iterations executed.
      - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
      - r: A rank-1 `Tensor` of shape `[N]` containing the residual vector.
      - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
      - gamma: \\(r \cdot M \cdot r\\), equivalent to \\(||r||_2^2\\) when
        `preconditioner=None`.
  """
    # ephemeral class holding CG state.
    cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])

    def stopping_criterion(i, state):
        return math_ops.logical_and(i < max_iter,
                                    linalg_ops.norm(state.r) > tol)

    def cg_step(i, state):  # pylint: disable=missing-docstring
        z = operator.apply(state.p)
        alpha = state.gamma / util.dot(state.p, z)
        x = state.x + alpha * state.p
        r = state.r - alpha * z
        if preconditioner is None:
            gamma = util.dot(r, r)
            beta = gamma / state.gamma
            p = r + beta * state.p
        else:
            q = preconditioner.apply(r)
            gamma = util.dot(r, q)
            beta = gamma / state.gamma
            p = q + beta * state.p
        return i + 1, cg_state(i + 1, x, r, p, gamma)

    with ops.name_scope(name):
        n = operator.shape[1:]
        rhs = array_ops.expand_dims(rhs, -1)
        if x is None:
            x = array_ops.expand_dims(
                array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1)
            r0 = rhs
        else:
            x = array_ops.expand_dims(x, -1)
            r0 = rhs - operator.apply(x)
        if preconditioner is None:
            p0 = r0
        else:
            p0 = preconditioner.apply(r0)
        gamma0 = util.dot(r0, p0)
        tol *= linalg_ops.norm(r0)
        i = constant_op.constant(0, dtype=dtypes.int32)
        state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
        _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                               [i, state])
        return cg_state(state.i,
                        x=array_ops.squeeze(state.x),
                        r=array_ops.squeeze(state.r),
                        p=array_ops.squeeze(state.p),
                        gamma=state.gamma)
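A hedged usage sketch: the operator argument only needs shape, dtype, and apply attributes, so a tiny SPD system can be wrapped with a namedtuple. The wrapper and values below are illustrative, and the actual call is left commented because conjugate_gradient depends on the surrounding contrib imports:

import collections
import numpy as np
import tensorflow as tf

Operator = collections.namedtuple("Operator", ["shape", "dtype", "apply"])

a = tf.constant([[4.0, 1.0], [1.0, 3.0]])   # symmetric positive definite
rhs = tf.constant([1.0, 2.0])
operator = Operator(shape=[2, 2], dtype=a.dtype, apply=lambda x: tf.matmul(a, x))

# result = conjugate_gradient(operator, rhs, max_iter=10)
# result.x should be close to np.linalg.solve([[4., 1.], [1., 3.]], [1., 2.]),
# which is approximately [0.0909, 0.6364].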
Example #51
0
def raw_rnn(cell, loop_fn, parallel_iterations=None, swap_memory=False, scope=None):
    """
    raw_rnn adapted from the original tensorflow implementation
    (https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/rnn.py)
    to emit arbitrarily nested states for each time step (concatenated along the time axis)
    in addition to the outputs at each timestep and the final state

    returns (
        states for all timesteps,
        outputs for all timesteps,
        final cell state,
    )
    """
    # if not _like_rnncell(cell):
    #if not assert_like_rnncell("error", cell):
    #    raise TypeError(f"cell must be an instance of RNNCell {type(cell)}")
    #if not callable(loop_fn):
    #    raise TypeError("loop_fn must be a callable")

    parallel_iterations = parallel_iterations or 32

    # Create a new scope in which the caching device is either
    # determined by the parent scope, or is set to place the cached
    # Variable using the same placement as for the rest of the RNN.
    with vs.variable_scope(scope or "rnn") as varscope:
        #if context.in_graph_mode():
        #    if varscope.caching_device is None:
        #        varscope.set_caching_device(lambda op: op.device)

        time = constant_op.constant(0, dtype=dtypes.int32)
        (elements_finished, next_input, initial_state, emit_structure,
         init_loop_state) = loop_fn(time, None, None, None)
        flat_input = nest.flatten(next_input)

        # Need a surrogate loop state for the while_loop if none is available.
        loop_state = (init_loop_state if init_loop_state is not None
                      else constant_op.constant(0, dtype=dtypes.int32))

        input_shape = [input_.get_shape() for input_ in flat_input]
        static_batch_size = input_shape[0][0]

        for input_shape_i in input_shape:
            # Static verification that batch sizes all match
            if static_batch_size:
                static_batch_size.merge_with(input_shape_i[0])
            else:
                static_batch_size = input_shape_i[0]

        batch_size = static_batch_size.value if static_batch_size else None
        const_batch_size = batch_size
        if batch_size is None:
            batch_size = array_ops.shape(flat_input[0])[0]

        nest.assert_same_structure(initial_state, cell.state_size)
        state = initial_state
        flat_state = nest.flatten(state)
        flat_state = [ops.convert_to_tensor(s) for s in flat_state]
        state = nest.pack_sequence_as(structure=state,
                                      flat_sequence=flat_state)

        if emit_structure is not None:
            flat_emit_structure = nest.flatten(emit_structure)
            flat_emit_size = [emit.shape if emit.shape.is_fully_defined() else
                              array_ops.shape(emit) for emit in flat_emit_structure]
            flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure]
        else:
            emit_structure = cell.output_size
            flat_emit_size = nest.flatten(emit_structure)
            flat_emit_dtypes = [flat_state[0].dtype] * len(flat_emit_size)

        flat_state_size = [s.shape if s.shape.is_fully_defined() else
                           array_ops.shape(s) for s in flat_state]
        flat_state_dtypes = [s.dtype for s in flat_state]

        flat_emit_ta = [
            tensor_array_ops.TensorArray(
                dtype=dtype_i,
                dynamic_size=True,
                element_shape=(tensor_shape.TensorShape([const_batch_size])
                               .concatenate(_maybe_tensor_shape_from_tensor(size_i))),
                size=0,
                name="rnn_output_%d" % i
            )
            for i, (dtype_i, size_i) in enumerate(zip(flat_emit_dtypes, flat_emit_size))
        ]
        emit_ta = nest.pack_sequence_as(structure=emit_structure, flat_sequence=flat_emit_ta)
        flat_zero_emit = [
            array_ops.zeros(_concat(batch_size, size_i), dtype_i)
            for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)]

        zero_emit = nest.pack_sequence_as(structure=emit_structure, flat_sequence=flat_zero_emit)

        flat_state_ta = [
            tensor_array_ops.TensorArray(
                dtype=dtype_i,
                dynamic_size=True,
                element_shape=(tensor_shape.TensorShape([const_batch_size])
                               .concatenate(_maybe_tensor_shape_from_tensor(size_i))),
                size=0,
                name="rnn_state_%d" % i
            )
            for i, (dtype_i, size_i) in enumerate(zip(flat_state_dtypes, flat_state_size))
        ]
        state_ta = nest.pack_sequence_as(structure=state, flat_sequence=flat_state_ta)

        def condition(unused_time, elements_finished, *_):
            return math_ops.logical_not(math_ops.reduce_all(elements_finished))

        def body(time, elements_finished, current_input, state_ta, emit_ta, state, loop_state):
            (next_output, cell_state) = cell(current_input, state)

            nest.assert_same_structure(state, cell_state)
            nest.assert_same_structure(cell.output_size, next_output)

            next_time = time + 1
            (next_finished, next_input, next_state, emit_output,
             next_loop_state) = loop_fn(next_time, next_output, cell_state, loop_state)

            nest.assert_same_structure(state, next_state)
            nest.assert_same_structure(current_input, next_input)
            nest.assert_same_structure(emit_ta, emit_output)

            # If loop_fn returns None for next_loop_state, just reuse the previous one.
            loop_state = loop_state if next_loop_state is None else next_loop_state

            def _copy_some_through(current, candidate):
                """Copy some tensors through via array_ops.where."""
                def copy_fn(cur_i, cand_i):
                    # TensorArray and scalar get passed through.
                    if isinstance(cur_i, tensor_array_ops.TensorArray):
                        return cand_i
                    if cur_i.shape.ndims == 0:
                        return cand_i
                    # Otherwise propagate the old or the new value.
                    with ops.colocate_with(cand_i):
                        return array_ops.where(elements_finished, cur_i, cand_i)
                return nest.map_structure(copy_fn, current, candidate)

            emit_output = _copy_some_through(zero_emit, emit_output)
            next_state = _copy_some_through(state, next_state)

            emit_ta = nest.map_structure(lambda ta, emit: ta.write(time, emit), emit_ta, emit_output)
            state_ta = nest.map_structure(lambda ta, state: ta.write(time, state), state_ta, next_state)

            elements_finished = math_ops.logical_or(elements_finished, next_finished)

            return (next_time, elements_finished, next_input, state_ta,
                    emit_ta, next_state, loop_state)

        returned = control_flow_ops.while_loop(
            condition, body, loop_vars=[
                time, elements_finished, next_input, state_ta,
                emit_ta, state, loop_state],
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory
        )

        (state_ta, emit_ta, final_state, final_loop_state) = returned[-4:]

        flat_states = nest.flatten(state_ta)
        flat_states = [array_ops.transpose(ta.stack(), (1, 0, 2)) for ta in flat_states]
        states = nest.pack_sequence_as(structure=state_ta, flat_sequence=flat_states)

        flat_outputs = nest.flatten(emit_ta)
        flat_outputs = [array_ops.transpose(ta.stack(), (1, 0, 2)) for ta in flat_outputs]
        outputs = nest.pack_sequence_as(structure=emit_ta, flat_sequence=flat_outputs)

        return (states, outputs, final_state)
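A hedged sketch of the loop_fn contract this raw_rnn expects, modeled on the standard tf.nn.raw_rnn example. Here cell, inputs_ta, sequence_length, batch_size and input_depth are assumed to be defined by the caller, and the internal modules (array_ops, math_ops, control_flow_ops, dtypes) are the same ones used elsewhere in these examples:

def loop_fn(time, cell_output, cell_state, loop_state):
    if cell_output is None:                       # time == 0: provide the initial state
        next_cell_state = cell.zero_state(batch_size, dtypes.float32)
        emit_output = None                        # fall back to cell.output_size zeros
    else:
        next_cell_state = cell_state
        emit_output = cell_output
    elements_finished = time >= sequence_length   # bool vector, shape [batch_size]
    finished = math_ops.reduce_all(elements_finished)
    next_input = control_flow_ops.cond(
        finished,
        lambda: array_ops.zeros([batch_size, input_depth], dtype=dtypes.float32),
        lambda: inputs_ta.read(time))
    return elements_finished, next_input, next_cell_state, emit_output, loop_state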
Example #52
0
 def true_fn():
     return [
         constant_op.constant(1),
         TestTuple(constant_op.constant(2), [3, 4]),
         array_ops.zeros([5, 5]), 6
     ]
Example #53
0
 def __init__(self):
   super(SimpleModelWithOneVariable, self).__init__()
   self.var = variables.Variable(array_ops.zeros((1, 10), name='var'))
Example #54
0
  def __call__(self, inputs, state, scope=None):
    # global h,e_ti,z_i,alpha_ti
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(1, 2, state)
      ## seperate inputs into word imbedding and image subfeatures

      shape = inputs.get_shape().as_list()
      # print("shape ")
      # print(shape)
      batch_size = shape[0]
      print("inputs.get_shape[0]")
      print(batch_size)
      hsize=h.get_shape()
      print("hidden state length")
      print(hsize[1].value)
      #padded_length = shape[1].value
      single_input_length = shape[1]
      # print("single_input_length ")
      # print(single_input_length)
      word_imbedding_length = 512
      #subfeature_length = 192#(single_input_length-word_imbedding_length)/subfeature_num;
      subfeature_length = 768
      # subfeature_num = int((single_input_length-word_imbedding_length)/subfeature_length)
      # subfeature_num = 35*35
      subfeature_num = 17*17
      #batch_size_ops
      # batch_size=32
      tensorShape=tf.shape(inputs)
      #z_i = array_ops.zeros([batch_size,subfeature_length])

      z_i=array_ops.zeros(tf.pack([tensorShape[0],subfeature_length]))
      print('inputs:')
      print(inputs)
      # print("Initial z_i:")
      # print(z_i)
      state_length = self._num_units
      # with vs.variable_scope(scope or type(self).__name__,initializer=self._initializer):
      #f_att_matrix = vs.get_variable(name="f_att_matrix",shape = (subfeature_length,state_length), initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32)
      mid_layer_size = 300
      W1 = vs.get_variable(name="w1",shape=(hsize[1].value+subfeature_length,mid_layer_size),initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32)
      W2 = vs.get_variable(name="w2",shape=(mid_layer_size,1),initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32)
      b1 = vs.get_variable(name="b1",shape=(1,mid_layer_size),initializer=tf.zeros_initializer,dtype=tf.float32)
      b2 = vs.get_variable(name="b2",shape=(1,1),initializer=tf.zeros_initializer,dtype=tf.float32)  
      word_imbeddings=inputs[:,0:word_imbedding_length]
      alpha_ti = []
      if single_input_length != word_imbedding_length:
        image_subfeatures=inputs[:,word_imbedding_length:single_input_length]
        #tf.summary.histogram("tensors/" + "subfeatures", image_subfeatures)
        #net2 = tf.reshape(net2, [shape2[0].value, -1, shape2[3].value])

        #image_subfeatures=array_ops.reshape(image_subfeatures,[batch_size,subfeature_num,subfeature_length])
        image_subfeatures=array_ops.reshape(image_subfeatures,tf.pack([tensorShape[0],subfeature_num,subfeature_length]))

        # f_att_matrix_exp=tf.expand_dims(f_att_matrix,0)
        # f_att_matrix_tile=tf.tile(f_att_matrix_exp,tf.pack([batch_size,1,1]))
        # print("fatt,fatt_exp,fatt_tile")
        # print(f_att_matrix)
        # print(f_att_matrix_exp)
        # print(f_att_matrix_tile)
        # tf.Print(f_att_matrix,[f_att_matrix])
        # h=tf.expand_dims(h,2)
        # e_ti = math_ops.matmul(math_ops.matmul(tf.sigmoid(image_subfeatures),f_att_matrix_tile),h)
        # e_ti =array_ops.zeros([batch_size,subfeature_num])
 
        W1_matrix=tf.expand_dims(W1,0) #[1,state_length+subfeature_length,mid_layer_size]
        W1_matrix=tf.tile(W1_matrix,tf.pack([tensorShape[0],1,1])) #[batchsize,state_length+subfeature_length,mid_layer_size]
        W2_matrix=tf.expand_dims(W2,0)
        W2_matrix=tf.tile(W2_matrix,tf.pack([tensorShape[0],1,1]))
        b1_matrix=tf.expand_dims(b1,0) #[1,1,mid_layer_size]     
        b1_matrix=tf.tile(b1_matrix,tf.pack([tensorShape[0],1,1]))
        b2_matrix=tf.expand_dims(b2,0) #[1,1,mid_layer_size]     
        b2_matrix=tf.tile(b2_matrix,tf.pack([tensorShape[0],1,1]))

        h_matrix=tf.expand_dims(h,1) # [batchsize,1,state_length]
        h_matrix=tf.tile(h_matrix,[1,subfeature_num,1]) #[batchsize,subfeature_num,state_length]
        x1 = tf.concat(2,[h_matrix,image_subfeatures]) #[batchsize,subfeature_num,state_length+subfeature_length]
        #x2 = tf.tanh(math_ops.matmul(x1,W1_matrix)+b1_matrix) #[batchsize,subfeature_num,mid_layer_size]
        x2 = tf.nn.relu(math_ops.matmul(x1,W1_matrix)+b1_matrix) #[batchsize,subfeature_num,mid_layer_size]
        #e_ti = tf.tanh(math_ops.matmul(x2,W2_matrix)+b2_matrix) #[batchsize,subfeature_num,1]
        e_ti = tf.nn.relu(math_ops.matmul(x2,W2_matrix)+b2_matrix) #[batchsize,subfeature_num,1]
        #e_ti = tf.squeeze(e_ti,[2]) #[batchsize,subfeature_num]
        alpha_ti = nn_ops.softmax(e_ti,dim=1) #[batchsize,subfeature_num,1]

        # e_ti=[]
        # for i in range(subfeature_num):
        #   x1 = tf.concat(1,[h,image_subfeatures[:,i,:]])
        #   x2 = tf.tanh(math_ops.matmul(x1,W1)+b1)
        #   x3 = tf.tanh(math_ops.matmul(x2,W2)+b2)
        #   e_ti.append(x3)
        # # e_ti = self.f_att(image_subfeatures,subfeature_length,h,scope)
        # print("x1")
        # print(x1)
        # print("x2")
        # print(x2)
        # print("x3")
        # print(x3)
        # e_ti=tf.transpose(tf.pack(e_ti),[1,0,2])
        # print("e_ti")
        # print(e_ti)
        # alpha_ti = nn_ops.softmax(e_ti)


        #tf.summary.histogram("tensors/" + "alpha_ti", alpha_ti)
        # z_i = math_ops.reduce_sum(math_ops.matmul(tf.transpose(image_subfeatures,[0,2,1]),alpha_ti),axis=1)
        z_i = math_ops.matmul(tf.transpose(image_subfeatures,[0,2,1]),alpha_ti)
        # h=tf.squeeze(h,[2])
        z_i=tf.squeeze(z_i,squeeze_dims=[2])
        print("squeezed z_i")
        print(z_i)
        #tf.summary.histogram("tensors/" + "z_i", z_i)
      #tf.summary.histogram("tensors/" + "h", h)
      concat = _linear([word_imbeddings, h, z_i], 4 * self._num_units, True) ###
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i, j, f, o = array_ops.split(1, 4, concat)

      new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
               self._activation(j))
      new_h = self._activation(new_c) * sigmoid(o)

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat(1, [new_c, new_h])
      return new_h, new_state, alpha_ti, z_i, word_imbeddings
Example #55
0
def _rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None,
         scope=None):
    """ Creates a recurrent neural network specified by RNNCell "cell".

    The simplest form of RNN network generated is:
      state = cell.zero_state(...)
      outputs = []
      states = []
      for input_ in inputs:
        output, state = cell(input_, state)
        outputs.append(output)
        states.append(state)
      return (outputs, states)

    However, a few other options are available:

    An initial state can be provided.
    If sequence_length is provided, dynamic calculation is performed.

    Dynamic calculation returns, at time t:
      (t >= max(sequence_length)
          ? (zeros(output_shape), zeros(state_shape))
          : cell(input, state))

    Thus saving computational time when unrolling past the max sequence length.

    Arguments:
      cell: An instance of RNNCell.
      inputs: A length T list of inputs, each a tensor of shape
        [batch_size, cell.input_size].
      initial_state: (optional) An initial state for the RNN.  This must be
        a tensor of appropriate type and shape [batch_size x cell.state_size].
      dtype: (optional) The data type for the initial state.  Required if
        initial_state is not provided.
      sequence_length: An int64 vector (tensor) size [batch_size].
      scope: VariableScope for the created subgraph; defaults to "RNN".

    Returns:
      A pair (outputs, states) where:
        outputs is a length T list of outputs (one for each input)
        states is a length T list of states (one state following each input)

    Raises:
      TypeError: If "cell" is not an instance of RNNCell.
      ValueError: If inputs is None or an empty list.
    """

    if not isinstance(cell, RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    outputs = []
    states = []
    batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
        state = initial_state
    else:
        if not dtype:
            raise ValueError("If no initial_state is provided, dtype must be.")
        state = cell.zero_state(batch_size, dtype)

    if sequence_length:  # Prepare variables
        zero_output_state = (
            array_ops.zeros(array_ops.pack([batch_size, cell.output_size]),
                            inputs[0].dtype),
            array_ops.zeros(array_ops.pack([batch_size, cell.state_size]),
                            state.dtype))
        max_sequence_length = tf.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
        def output_state():
            return cell(input_, state, scope)

        if sequence_length:
            (output, state) = control_flow_ops.cond(
                time >= max_sequence_length,
                lambda: zero_output_state, output_state)
        else:
            (output, state) = output_state()

        outputs.append(output)
        states.append(state)

    return (outputs, states)
Example #56
0
def _zeros_like(op_output):
    """Like array_ops.zeros_like() but also accepts resource var handles."""
    if op_output.dtype == dtypes.resource:
        return array_ops.zeros(
            gen_resource_variable_ops.variable_shape(op_output))
    return array_ops.zeros_like(op_output)
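A quick eager check of the resource-handle branch using the public API (TF 2.x assumed; the snippet is illustrative):

import tensorflow as tf

v = tf.Variable([[1.0, 2.0], [3.0, 4.0]])
shape = tf.raw_ops.VariableShape(resource=v.handle)  # shape read off the resource handle
z = tf.zeros(shape)
print(z.shape)  # (2, 2)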
Example #57
0
    def _solve(self, rhs, adjoint=False, adjoint_arg=False):
        # Here we follow the same use of Roth's column lemma as in `matmul`, with
        # the key difference that we replace all `matmul` instances with `solve`.
        # This follows from the property that inv(A x B) = inv(A) x inv(B).

        # Below we document the shape manipulation for adjoint=False,
        # adjoint_arg=False, but the general case of different adjoints is still
        # handled.

        if adjoint_arg:
            rhs = linalg.adjoint(rhs)

        # Always add a batch dimension to enable broadcasting to work.
        batch_shape = array_ops.concat(
            [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
        rhs += array_ops.zeros(batch_shape, dtype=rhs.dtype.base_dtype)

        # rhs has shape [B, R, C], where B represent some number of batch
        # dimensions,
        # R represents the number of rows, and C represents the number of columns.
        # In order to apply Roth's column lemma, we need to operate on a batch of
        # column vectors, so we reshape into a batch of column vectors. We put it
        # at the front to ensure that broadcasting between operators to the batch
        # dimensions B still works.
        output = _rotate_last_dim(rhs, rotate_right=True)

        # Also expand the shape to be [A, C, B, R]. The first dimension will be
        # used to accumulate dimensions from each operator matmul.
        output = output[array_ops.newaxis, ...]

        # In this loop, A is going to refer to the value of the accumulated
        # dimension. A = 1 at the start, and will end up being self.range_dimension.
        # V will refer to the last dimension. V = R at the start, and will end up
        # being 1 in the end.
        for operator in self.operators[:-1]:
            # Reshape output from [A, C, B, V] to be
            # [A, C, B, V / op.domain_dimension, op.domain_dimension]
            if adjoint:
                operator_dimension = operator.range_dimension_tensor()
            else:
                operator_dimension = operator.domain_dimension_tensor()

            output = _unvec_by(output, operator_dimension)

            # We are computing (XA^-1^T) = (A^-1 X^T)^T.
            # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
            # which is being converted to:
            # [A, C, B, V / op.domain_dimension, op.range_dimension]
            output = array_ops.matrix_transpose(output)
            output = operator.solve(output, adjoint=adjoint, adjoint_arg=False)
            output = array_ops.matrix_transpose(output)
            # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
            output = _rotate_last_dim(output, rotate_right=False)
            output = _vec(output)
            output = _rotate_last_dim(output, rotate_right=True)

        # After the loop, we will have
        # A = self.range_dimension / op[-1].range_dimension
        # V = op[-1].domain_dimension

        # We convert that using matvec to get:
        # [A, C, B, op[-1].range_dimension]
        output = self.operators[-1].solvevec(output, adjoint=adjoint)
        # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
        output = _rotate_last_dim(output, rotate_right=False)
        output = _vec(output)
        output = _rotate_last_dim(output, rotate_right=False)

        if rhs.shape.is_fully_defined():
            column_dim = rhs.shape[-1]
            broadcast_batch_shape = common_shapes.broadcast_shape(
                rhs.shape[:-2], self.batch_shape)
            if adjoint:
                matrix_dimensions = [self.domain_dimension, column_dim]
            else:
                matrix_dimensions = [self.range_dimension, column_dim]

            output.set_shape(
                broadcast_batch_shape.concatenate(matrix_dimensions))

        return output
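The property cited at the top of `_solve`, inv(A x B) = inv(A) x inv(B), is what allows the solve to be pushed down onto the individual Kronecker factors. A standalone NumPy check of that property and of the resulting factored solve (independent of the operator class above; the shapes are arbitrary small examples):

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((2, 2)) + 2 * np.eye(2)   # keep the factors well conditioned
B = rng.standard_normal((3, 3)) + 3 * np.eye(3)
rhs = rng.standard_normal((6, 4))                 # [N, C] with N = 2 * 3

# The inverse of a Kronecker product factors into the Kronecker product of inverses.
assert np.allclose(np.linalg.inv(np.kron(A, B)),
                   np.kron(np.linalg.inv(A), np.linalg.inv(B)))

# Factored solve via Roth's column lemma applied to inverses:
# (A x B)^-1 vec(X) = vec(B^-1 X A^-T), with vec stacking columns.
dense = np.linalg.solve(np.kron(A, B), rhs)
vec = lambda M: M.T.reshape(-1)
unvec = lambda v: v.reshape(2, 3).T               # back to the 3 x 2 matrix X
factored = np.stack(
    [vec(np.linalg.solve(B, unvec(rhs[:, i])) @ np.linalg.inv(A).T)
     for i in range(rhs.shape[1])], axis=1)
assert np.allclose(dense, factored)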
Example #58
0
def _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations,
                      swap_memory, sequence_length=None):
    """Internal implementation of Dynamic RNN.

    Args:
      cell: An instance of RNNCell.
      inputs: A `Tensor` of shape [time, batch_size, depth].
      initial_state: A `Tensor` of shape [batch_size, depth].
      parallel_iterations: Positive Python int.
      swap_memory: A Python boolean.
      sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].

    Returns:
      Tuple (final_outputs, final_state).
      final_outputs:
        A `Tensor` of shape [time, batch_size, depth].
      final_state:
        A `Tensor` of shape [batch_size, depth].

    Raises:
      ValueError: If the input depth cannot be inferred via shape inference
        from the inputs.
    """
    state = initial_state
    assert isinstance(parallel_iterations,
                      int), "parallel_iterations must be int"

    # Construct an initial output
    input_shape = array_ops.shape(inputs)
    (time_steps, batch_size, _) = array_ops.unpack(input_shape, 3)

    inputs_got_shape = inputs.get_shape().with_rank(3)
    (const_time_steps, const_batch_size,
     const_depth) = inputs_got_shape.as_list()

    if const_depth is None:
        raise ValueError(
            "Input size (depth of inputs) must be accessible via shape inference, "
            "but saw value None.")

    # Prepare dynamic conditional copying of state & output
    zero_output = array_ops.zeros(
        array_ops.pack([batch_size, cell.output_size]), inputs.dtype)
    if sequence_length is not None:
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)

    time = array_ops.constant(0, dtype=dtypes.int32, name="time")

    with ops.op_scope([], "dynamic_rnn") as scope:
        base_name = scope

    output_ta = tensor_array_ops.TensorArray(
        dtype=inputs.dtype, size=time_steps,
        tensor_array_name=base_name + "output")

    input_ta = tensor_array_ops.TensorArray(
        dtype=inputs.dtype, size=time_steps,
        tensor_array_name=base_name + "input")

    input_ta = input_ta.unpack(inputs)

    def _time_step(time, state, output_ta_t):
        """Take a time step of the dynamic RNN.
        Args:
          time: int32 scalar Tensor.
          state: A `Tensor` of shape [batch_size, depth], the previous state.
          output_ta_t: `TensorArray`, the output with existing flow.
        Returns:
          The tuple (time + 1, new_state, output_ta_t with updated flow).
        """

        input_t = input_ta.read(time)
        # Restore some shape information
        input_t.set_shape([const_batch_size, const_depth])

        call_cell = lambda: cell(input_t, state)

        if sequence_length is not None:
            (output, new_state) = _rnn_step(
                time=time,
                sequence_length=sequence_length,
                min_sequence_length=min_sequence_length,
                max_sequence_length=max_sequence_length,
                zero_output=zero_output,
                state=state,
                call_cell=call_cell,
                skip_conditionals=True)
        else:
            (output, new_state) = call_cell()

        output_ta_t = output_ta_t.write(time, output)

        return (time + 1, new_state, output_ta_t)

    (_, final_state, output_final_ta) = control_flow_ops.while_loop(
        cond=lambda time, _1, _2: time < time_steps,
        body=_time_step,
        loop_vars=(time, state, output_ta),
        parallel_iterations=parallel_iterations,
        swap_memory=swap_memory)

    final_outputs = output_final_ta.pack()
    # Restore some shape information
    final_outputs.set_shape([
        const_time_steps, const_batch_size, cell.output_size])

    return final_outputs, final_state
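The heart of `_dynamic_rnn_loop` is a `while_loop` whose body reads one time step from an input `TensorArray`, applies the cell, and writes the result into an output `TensorArray` that is finally stacked back into a [time, batch, depth] tensor. A minimal sketch of the same pattern against the public TF 2.x API (`simple_cell` and `dynamic_loop` are made-up names for illustration, not part of TensorFlow):

import tensorflow as tf

def simple_cell(x, state):
    # Toy "cell": elementwise update, just enough to drive the loop.
    new_state = tf.tanh(x + state)
    return new_state, new_state              # (output, new_state)

def dynamic_loop(inputs):                    # inputs: [time, batch, depth]
    time_steps = tf.shape(inputs)[0]
    input_ta = tf.TensorArray(inputs.dtype, size=time_steps).unstack(inputs)
    output_ta = tf.TensorArray(inputs.dtype, size=time_steps)
    state = tf.zeros_like(inputs[0])         # [batch, depth] initial state

    def body(time, state, output_ta):
        output, new_state = simple_cell(input_ta.read(time), state)
        return time + 1, new_state, output_ta.write(time, output)

    _, final_state, output_ta = tf.while_loop(
        cond=lambda time, *_: time < time_steps,
        body=body,
        loop_vars=(tf.constant(0), state, output_ta))
    return output_ta.stack(), final_state    # [time, batch, depth], [batch, depth]

outputs, final_state = dynamic_loop(tf.ones([5, 2, 3]))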
Example #59
0
    def _matmul(self, x, adjoint=False, adjoint_arg=False):
        # Here we heavily rely on Roth's column Lemma [1]:
        # (A x B) * vec X = vec BXA^T,
        # where vec stacks all the columns of the matrix under each other. In our
        # case, x represents a batch of vec X (i.e. we think of x as a batch of
        # column vectors, rather than a matrix). Each member of the batch can be
        # reshaped to a matrix (hence we get a batch of matrices).
        # We can iteratively apply this lemma by noting that if B is a Kronecker
        # product, then we can apply the lemma again.

        # [1] W. E. Roth, "On direct product matrices,"
        # Bulletin of the American Mathematical Society, vol. 40, pp. 461-468,
        # 1934

        # Efficiency

        # Naively forming the Kronecker product, i.e. calculating the dense matrix
        # and applying it, can take cubic time in the size of domain_dimension
        # (assuming a square matrix). The other issue is that materializing the
        # dense matrix can be prohibitively expensive in terms of memory.
        #
        # This implementation avoids this memory blow up by only computing matmuls
        # with the factors. In this way, we don't have to realize the dense matrix.
        # In terms of complexity, if we have Kronecker Factors of size:
        # (n1, n1), (n2, n2), (n3, n3), ... (nJ, nJ), with N = \prod n_i, and we
        # have as input a [N, M] matrix, the naive approach would take O(N^2 M).
        # With this approach (ignoring reshapes and transposes for now), the time
        # complexity is O(M * (\sum n_i) * N). There is also the benefit of batched
        # multiplication (in this example, the batch size is roughly M * N), so this
        # can be much faster. Not factored in, however, are the costs of the several
        # tensor transposes, which can affect cache behavior.

        # Below we document the shape manipulation for adjoint=False,
        # adjoint_arg=False, but the general case of different adjoints is still
        # handled.

        if adjoint_arg:
            x = linalg.adjoint(x)

        # Always add a batch dimension to enable broadcasting to work.
        batch_shape = array_ops.concat(
            [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
        x += array_ops.zeros(batch_shape, dtype=x.dtype.base_dtype)

        # x has shape [B, R, C], where B represent some number of batch dimensions,
        # R represents the number of rows, and C represents the number of columns.
        # In order to apply Roth's column lemma, we need to operate on a batch of
        # column vectors, so we reshape into a batch of column vectors. We put it
        # at the front to ensure that broadcasting between operators to the batch
        # dimensions B still works.
        output = _rotate_last_dim(x, rotate_right=True)

        # Also expand the shape to be [A, C, B, R]. The first dimension will be
        # used to accumulate dimensions from each operator matmul.
        output = output[array_ops.newaxis, ...]

        # In this loop, A is going to refer to the value of the accumulated
        # dimension. A = 1 at the start, and will end up being self.range_dimension.
        # V will refer to the last dimension. V = R at the start, and will end up
        # being 1 in the end.
        for operator in self.operators[:-1]:
            # Reshape output from [A, C, B, V] to be
            # [A, C, B, V / op.domain_dimension, op.domain_dimension]
            if adjoint:
                operator_dimension = operator.range_dimension_tensor()
            else:
                operator_dimension = operator.domain_dimension_tensor()

            output = _unvec_by(output, operator_dimension)

            # We are computing (XA^T) = (AX^T)^T.
            # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
            # which is being converted to:
            # [A, C, B, V / op.domain_dimension, op.range_dimension]
            output = array_ops.matrix_transpose(output)
            output = operator.matmul(output,
                                     adjoint=adjoint,
                                     adjoint_arg=False)
            output = array_ops.matrix_transpose(output)
            # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
            output = _rotate_last_dim(output, rotate_right=False)
            output = _vec(output)
            output = _rotate_last_dim(output, rotate_right=True)

        # After the loop, we will have
        # A = self.range_dimension / op[-1].range_dimension
        # V = op[-1].domain_dimension

        # We convert that using matvec to get:
        # [A, C, B, op[-1].range_dimension]
        output = self.operators[-1].matvec(output, adjoint=adjoint)
        # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
        output = _rotate_last_dim(output, rotate_right=False)
        output = _vec(output)
        output = _rotate_last_dim(output, rotate_right=False)

        if x.shape.is_fully_defined():
            column_dim = x.shape[-1]
            broadcast_batch_shape = common_shapes.broadcast_shape(
                x.shape[:-2], self.batch_shape)
            if adjoint:
                matrix_dimensions = [self.domain_dimension, column_dim]
            else:
                matrix_dimensions = [self.range_dimension, column_dim]

            output.set_shape(
                broadcast_batch_shape.concatenate(matrix_dimensions))

        return output
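Roth's column lemma, (A x B) vec(X) = vec(B X A^T), is the identity the loop above applies one factor at a time. The comment's cost estimate is also easy to make concrete: for three 10 x 10 factors (N = 1000, M = 1), the dense matvec costs on the order of N^2 = 10^6 multiplications, while the factored form costs roughly (sum n_i) * N = 3 * 10^4. A standalone NumPy check of the lemma itself (not the operator code above):

import numpy as np

rng = np.random.default_rng(1)
A, B = rng.standard_normal((2, 2)), rng.standard_normal((3, 3))
X = rng.standard_normal((3, 2))            # so that B X A^T is 3 x 2

vec = lambda M: M.T.reshape(-1)            # stack the columns under each other
lhs = np.kron(A, B) @ vec(X)               # dense Kronecker matvec
rhs = vec(B @ X @ A.T)                     # factored form given by the lemma
assert np.allclose(lhs, rhs)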
Example #60
0
    def _testScopedExport(self, test_dir, exported_filenames):
        graph = ops.Graph()
        with graph.as_default():
            # Creates an inference graph.
            # Hidden 1
            colocate_constraint = constant_op.constant(1.2, name="constraint")
            images = constant_op.constant(1.2,
                                          dtypes.float32,
                                          shape=[100, 28],
                                          name="images")
            with ops.name_scope("hidden1"):
                with graph.colocate_with(colocate_constraint.op):
                    weights1 = variables.Variable(random_ops.truncated_normal(
                        [28, 128], stddev=1.0 / math.sqrt(float(28))),
                                                  name="weights")
                # The use of control_flow_ops.cond here is purely to add test
                # coverage for the save and restore of a control flow context (which
                # doesn't make sense here from a machine learning perspective).
                # Typically, biases would be a plain Variable without the condition.
                biases1 = variables.Variable(control_flow_ops.cond(
                    math_ops.less(random.random(),
                                  0.5), lambda: array_ops.ones([128]),
                    lambda: array_ops.zeros([128])),
                                             name="biases")
                hidden1 = nn_ops.relu(
                    math_ops.matmul(images, weights1) + biases1)

            # Hidden 2
            with ops.name_scope("hidden2"):
                weights2 = variables.Variable(random_ops.truncated_normal(
                    [128, 32], stddev=1.0 / math.sqrt(float(128))),
                                              name="weights")

                # The use of control_flow_ops.while_loop here is purely to add test
                # coverage for the save and restore of a control flow context (which
                # doesn't make sense here from a machine learning perspective).
                # Typically, biases would be a plain Variable without the loop.
                def loop_cond(it, _):
                    return it < 2

                def loop_body(it, biases2):
                    biases2 += constant_op.constant(0.1, shape=[32])
                    return it + 1, biases2

                _, biases2 = control_flow_ops.while_loop(
                    loop_cond, loop_body, [
                        constant_op.constant(0),
                        variables.Variable(array_ops.zeros([32]),
                                           name="biases")
                    ])
                hidden2 = nn_ops.relu(
                    math_ops.matmul(hidden1, weights2) + biases2)
            # Linear
            with ops.name_scope("softmax_linear"):
                weights3 = variables.Variable(random_ops.truncated_normal(
                    [32, 10], stddev=1.0 / math.sqrt(float(32))),
                                              name="weights")
                biases3 = variables.Variable(array_ops.zeros([10]),
                                             name="biases")
                logits = math_ops.matmul(hidden2, weights3) + biases3
                ops.add_to_collection("logits", logits)

            # Exports each sub-graph.
            # Exports the first one with unbound_inputs_col_name set to default.
            orig_meta_graph1, var_list = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[0]),
                graph=ops.get_default_graph(),
                export_scope="hidden1")
            self.assertEqual(["biases:0", "weights:0"],
                             sorted(var_list.keys()))
            var_names = [v.name for _, v in var_list.items()]
            self.assertEqual(["hidden1/biases:0", "hidden1/weights:0"],
                             sorted(var_names))

            # Exports the rest with no unbound_inputs_col_name.
            orig_meta_graph2, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[1]),
                graph=ops.get_default_graph(),
                export_scope="hidden2",
                unbound_inputs_col_name=None)
            orig_meta_graph3, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[2]),
                graph=ops.get_default_graph(),
                export_scope="softmax_linear",
                unbound_inputs_col_name=None)

        return [orig_meta_graph1, orig_meta_graph2, orig_meta_graph3]