def _LossFunc():
  first_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
      array_ops.zeros([28, 100, 28]))
  second_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
      array_ops.zeros([28, 100, 100]))
  return (math_ops.reduce_sum(first_output) +
          math_ops.reduce_sum(second_output))
def testMultiRNNCellWithStateTuple(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m_bad = array_ops.zeros([1, 4]) m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])) # Test incorrectness of state with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"): core_rnn_cell_impl.MultiRNNCell( [core_rnn_cell_impl.GRUCell(2) for _ in range(2)], state_is_tuple=True)(x, m_bad) _, ml = core_rnn_cell_impl.MultiRNNCell( [core_rnn_cell_impl.GRUCell(2) for _ in range(2)], state_is_tuple=True)(x, m_good) sess.run([variables.global_variables_initializer()]) res = sess.run(ml, { x.name: np.array([[1., 1.]]), m_good[0].name: np.array([[0.1, 0.1]]), m_good[1].name: np.array([[0.1, 0.1]]) }) # The numbers in results were not calculated, this is just a # smoke test. However, these numbers should match those of # the test testMultiRNNCell. self.assertAllClose(res[0], [[0.175991, 0.175991]]) self.assertAllClose(res[1], [[0.13248, 0.13248]])
def testCoupledInputForgetGateLSTMCell(self): with self.test_session() as sess: num_units = 2 state_size = num_units * 2 batch_size = 3 input_size = 4 expected_output = np.array( [[0.121753, 0.121753], [0.103349, 0.103349], [0.100178, 0.100178]], dtype=np.float32) expected_state = np.array( [[0.137523, 0.137523, 0.121753, 0.121753], [0.105450, 0.105450, 0.103349, 0.103349], [0.100742, 0.100742, 0.100178, 0.100178]], dtype=np.float32) with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([batch_size, input_size]) m = array_ops.zeros([batch_size, state_size]) output, state = rnn_cell.CoupledInputForgetGateLSTMCell( num_units=num_units, forget_bias=1.0)(x, m) sess.run([variables.global_variables_initializer()]) res = sess.run([output, state], { x.name: np.array([[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]]), m.name: 0.1 * np.ones((batch_size, state_size)) }) # This is a smoke test: Only making sure expected values didn't change. self.assertEqual(len(res), 2) self.assertAllClose(res[0], expected_output) self.assertAllClose(res[1], expected_state)
def testDifferentShapesGraph(self): # Tests that a single kernel instance presented with multiple input shapes # does not crash with graph execution. with ops.device("gpu:0"): layer = cudnn_rnn.CudnnGRU(1, 100) layer(array_ops.zeros([28, 100, 100])) def _Cond(index, accumulation): del accumulation # unused return math_ops.less(index, 4) def _Body(index, accumulation): layer_input = accumulation[:, :, 10 * (1 + index % 2):] output, _ = layer(layer_input) return index + 1, accumulation + output original_input = array_ops.zeros([28, 100, 100]) _, accumulation = control_flow_ops.while_loop(_Cond, _Body, [0, original_input]) grad, = gradients.gradients( math_ops.reduce_sum(accumulation), (original_input,)) init_op = variables.global_variables_initializer() with self.test_session() as sess: sess.run(init_op) accumulation_eval, grad_eval = sess.run((accumulation, grad)) self.assertAllEqual([28, 100, 100], accumulation_eval.shape) self.assertAllEqual([28, 100, 100], grad_eval.shape)
def testBasicLSTMCellWithDropout(self): def _is_close(x, y, digits=4): delta = x - y return delta < 10**(-digits) def _is_close_in(x, items, digits=4): for i in items: if _is_close(x, i, digits): return True return False keep_prob = 0.5 c_high = 2.9998924946 c_low = 0.999983298578 h_low = 0.761552567265 h_high = 0.995008519604 num_units = 5 allowed_low = [2, 3] with self.test_session() as sess: with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(1)): x = array_ops.zeros([1, 5]) c = array_ops.zeros([1, 5]) h = array_ops.zeros([1, 5]) state = core_rnn_cell_impl.LSTMStateTuple(c, h) cell = rnn_cell.LayerNormBasicLSTMCell( num_units, layer_norm=False, dropout_keep_prob=keep_prob) g, s = cell(x, state) sess.run([variables.global_variables_initializer()]) res = sess.run([g, s], { x.name: np.ones([1, 5]), c.name: np.ones([1, 5]), h.name: np.ones([1, 5]), }) # Since the returned tensors are of size [1,n] # get the first component right now. actual_h = res[0][0] actual_state_c = res[1].c[0] actual_state_h = res[1].h[0] # For each item in `c` (the cell inner state) check that # it is equal to one of the allowed values `c_high` (not # dropped out) or `c_low` (dropped out) and verify that the # corresponding item in `h` (the cell activation) is coherent. # Count the dropped activations and check that their number is # coherent with the dropout probability. dropped_count = 0 self.assertTrue((actual_h == actual_state_h).all()) for citem, hitem in zip(actual_state_c, actual_state_h): self.assertTrue(_is_close_in(citem, [c_low, c_high])) if _is_close(citem, c_low): self.assertTrue(_is_close(hitem, h_low)) dropped_count += 1 elif _is_close(citem, c_high): self.assertTrue(_is_close(hitem, h_high)) self.assertIn(dropped_count, allowed_low)
def zero_state(self, batch_size, dtype): """Return zero-filled state tensor(s). Args: batch_size: int, float, or unit Tensor representing the batch size. dtype: the data type to use for the state. Returns: If `state_size` is an int, then the return value is a `2-D` tensor of shape `[batch_size x state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size x s]` for each s in `state_size`. """ state_size = self.state_size if _is_sequence(state_size): state_size_flat = _unpacked_state(state_size) zeros_flat = [ array_ops.zeros(array_ops.pack([batch_size, s]), dtype=dtype) for s in state_size_flat] for s, z in zip(state_size_flat, zeros_flat): z.set_shape([None, s]) zeros = _packed_state(structure=state_size, state=zeros_flat) else: zeros = array_ops.zeros( array_ops.pack([batch_size, state_size]), dtype=dtype) zeros.set_shape([None, state_size]) return zeros
def testDtype(self): with self.test_session(): d = array_ops.fill([2, 3], 12., name="fill") self.assertEqual(d.get_shape(), [2, 3]) # Test default type for both constant size and dynamic size z = array_ops.zeros([2, 3]) self.assertEqual(z.dtype, dtypes_lib.float32) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.zeros([2, 3])) z = array_ops.zeros(array_ops.shape(d)) self.assertEqual(z.dtype, dtypes_lib.float32) self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z.eval(), np.zeros([2, 3])) # Test explicit type control for dtype in [ dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8, dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.string ]: z = array_ops.zeros([2, 3], dtype=dtype) self.assertEqual(z.dtype, dtype) self.assertEqual([2, 3], z.get_shape()) z_value = z.eval() self.assertFalse(np.any(z_value)) self.assertEqual((2, 3), z_value.shape) z = array_ops.zeros(array_ops.shape(d), dtype=dtype) self.assertEqual(z.dtype, dtype) self.assertEqual([2, 3], z.get_shape()) z_value = z.eval() self.assertFalse(np.any(z_value)) self.assertEqual((2, 3), z_value.shape)
def _TestPostActivationBypassQuantized(self, is_training):
  graph = ops.Graph()
  with graph.as_default():
    batch_size, height, width, depth = 5, 128, 128, 3
    input1 = array_ops.zeros((batch_size, height, width, depth))
    # Use integer division so the shape stays integral under Python 3.
    input2 = array_ops.zeros((batch_size, height // 2, width // 2, 32))
    conv = conv2d(
        input1,
        32, [5, 5],
        stride=2,
        padding='SAME',
        weights_initializer=self._WeightInit(0.09),
        activation_fn=array_ops.identity,
        scope='test/test')
    bypass_tensor = math_ops.add(conv, input2, name='test/add')
    _ = array_ops.identity(bypass_tensor, name='test/output')

    quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)

    # Ensure that the bypass node is preceded and followed by
    # FakeQuantWithMinMaxVars operations.
    self.assertTrue('FakeQuantWithMinMaxVars' in
                    [c.type for c in bypass_tensor.consumers()])
    self.assertTrue('FakeQuantWithMinMaxVars' in
                    [i.op.type for i in bypass_tensor.op.inputs])
def testRank3InvalidShape2(self):
  indices = array_ops.zeros([2, 2, 1], dtypes.int32)
  updates = array_ops.zeros([2, 2], dtypes.int32)
  shape = np.array([2, 2, 2])
  with self.assertRaisesWithPredicateMatch(
      ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
    self.scatter_nd(indices, updates, shape)
def testBasicLSTMCellWithStateTuple(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m0 = array_ops.zeros([1, 4]) m1 = array_ops.zeros([1, 4]) cell = rnn_cell_impl.MultiRNNCell( [ rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=True) g, (out_m0, out_m1) = cell(x, (m0, m1)) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([g, out_m0, out_m1], { x.name: np.array([[1., 1.]]), m0.name: 0.1 * np.ones([1, 4]), m1.name: 0.1 * np.ones([1, 4]) }) self.assertEqual(len(res), 3) # The numbers in results were not calculated, this is just a smoke test. # Note, however, these values should match the original # version having state_is_tuple=False. self.assertAllClose(res[0], [[0.24024698, 0.24024698]]) expected_mem0 = np.array( [[0.68967271, 0.68967271, 0.44848421, 0.44848421]]) expected_mem1 = np.array( [[0.39897051, 0.39897051, 0.24024698, 0.24024698]]) self.assertAllClose(res[1], expected_mem0) self.assertAllClose(res[2], expected_mem1)
def testLSTMCell(self): with self.test_session() as sess: num_units = 8 num_proj = 6 state_size = num_units + num_proj batch_size = 3 input_size = 2 with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([batch_size, input_size]) m = array_ops.zeros([batch_size, state_size]) cell = rnn_cell_impl.LSTMCell( num_units=num_units, num_proj=num_proj, forget_bias=1.0, state_is_tuple=False) output, state = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([output, state], { x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), m.name: 0.1 * np.ones((batch_size, state_size)) }) self.assertEqual(len(res), 2) # The numbers in results were not calculated, this is mostly just a # smoke test. self.assertEqual(res[0].shape, (batch_size, num_proj)) self.assertEqual(res[1].shape, (batch_size, state_size)) # Different inputs so different outputs and states for i in range(1, batch_size): self.assertTrue( float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6) self.assertTrue( float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
def testGRUCell(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g], {x.name: np.array([[1., 1.]]), m.name: np.array([[0.1, 0.1]])}) # Smoke test self.assertAllClose(res[0], [[0.175991, 0.175991]]) with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros( [1, 3]) # Test GRUCell with input_size != num_units. m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g], {x.name: np.array([[1., 1., 1.]]), m.name: np.array([[0.1, 0.1]])}) # Smoke test self.assertAllClose(res[0], [[0.156736, 0.156736]])
def testBasicLSTMCellStateTupleType(self): with self.test_session(): with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m0 = (array_ops.zeros([1, 2]),) * 2 m1 = (array_ops.zeros([1, 2]),) * 2 cell = rnn_cell_impl.MultiRNNCell( [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)], state_is_tuple=True) self.assertTrue(isinstance(cell.state_size, tuple)) self.assertTrue( isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple)) self.assertTrue( isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple)) # Pass in regular tuples _, (out_m0, out_m1) = cell(x, (m0, m1)) self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple)) self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple)) # Pass in LSTMStateTuples variable_scope.get_variable_scope().reuse_variables() zero_state = cell.zero_state(1, dtypes.float32) self.assertTrue(isinstance(zero_state, tuple)) self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple)) self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple)) _, (out_m0, out_m1) = cell(x, zero_state) self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple)) self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
def testGridRNNEdgeCasesNoOutput(self): with self.test_session() as sess: with variable_scope.variable_scope( 'root', initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),) # This cell produces no output cell = grid_rnn_cell.GridRNNCell( num_units=2, num_dims=2, input_dims=0, output_dims=None, non_recurrent_dims=0, non_recurrent_fn=nn_ops.relu) g, s = cell(x, m) self.assertEqual(g, ()) self.assertEqual(s[0].c.get_shape(), (1, 2)) self.assertEqual(s[0].h.get_shape(), (1, 2)) sess.run([variables.global_variables_initializer()]) res_g, res_s = sess.run([g, s], { x: np.array([[1., 1.]]), m: ((np.array([[0.1, 0.1]]), np.array([[0.1, 0.1]])),) }) self.assertEqual(res_g, ()) self.assertEqual(res_s[0].c.shape, (1, 2)) self.assertEqual(res_s[0].h.shape, (1, 2))
def testGrid2LSTMCellLegacy(self): """Test for legacy case (when state_is_tuple=False).""" with self.test_session() as sess: with variable_scope.variable_scope( 'root', initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 3]) m = array_ops.zeros([1, 8]) cell = grid_rnn_cell.Grid2LSTMCell( 2, use_peepholes=True, state_is_tuple=False, output_is_tuple=False) self.assertEqual(cell.state_size, 8) g, s = cell(x, m) self.assertEqual(g.get_shape(), (1, 2)) self.assertEqual(s.get_shape(), (1, 8)) sess.run([variables.global_variables_initializer()]) res = sess.run([g, s], { x: np.array([[1., 1., 1.]]), m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]]) }) self.assertEqual(res[0].shape, (1, 2)) self.assertEqual(res[1].shape, (1, 8)) self.assertAllClose(res[0], [[0.95686918, 0.95686918]]) self.assertAllClose(res[1], [[ 2.41515064, 2.41515064, 0.95686918, 0.95686918, 1.38917875, 1.49043763, 0.83884692, 0.86036491 ]])
def testGrid2LSTMCellTied(self): with self.test_session(use_gpu=False) as sess: with variable_scope.variable_scope( 'root', initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 3]) m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])), (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))) cell = grid_rnn_cell.Grid2LSTMCell(2, tied=True, use_peepholes=True) self.assertEqual(cell.state_size, ((2, 2), (2, 2))) g, s = cell(x, m) self.assertEqual(g[0].get_shape(), (1, 2)) self.assertEqual(s[0].c.get_shape(), (1, 2)) self.assertEqual(s[0].h.get_shape(), (1, 2)) self.assertEqual(s[1].c.get_shape(), (1, 2)) self.assertEqual(s[1].h.get_shape(), (1, 2)) sess.run([variables.global_variables_initializer()]) res_g, res_s = sess.run([g, s], { x: np.array([[1., 1., 1.]]), m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])), (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]]))) }) self.assertEqual(res_g[0].shape, (1, 2)) self.assertEqual(res_s[0].c.shape, (1, 2)) self.assertEqual(res_s[0].h.shape, (1, 2)) self.assertEqual(res_s[1].c.shape, (1, 2)) self.assertEqual(res_s[1].h.shape, (1, 2)) self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]]) self.assertAllClose( res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]), ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
def testGrid2BasicRNNCellTied(self): with self.test_session() as sess: with variable_scope.variable_scope( 'root', initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([2, 2]) m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2])) cell = grid_rnn_cell.Grid2BasicRNNCell(2, tied=True) self.assertEqual(cell.state_size, (2, 2)) g, s = cell(x, m) self.assertEqual(g[0].get_shape(), (2, 2)) self.assertEqual(s[0].get_shape(), (2, 2)) self.assertEqual(s[1].get_shape(), (2, 2)) sess.run([variables.global_variables_initializer()]) res_g, res_s = sess.run([g, s], { x: np.array([[1., 1.], [2., 2.]]), m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1], [0.2, 0.2]])) }) self.assertEqual(res_g[0].shape, (2, 2)) self.assertEqual(res_s[0].shape, (2, 2)) self.assertEqual(res_s[1].shape, (2, 2)) self.assertAllClose(res_g, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],)) self.assertAllClose( res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]], [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
def _possibly_broadcast_batch_shape(self, x):
  """Return 'x', possibly after broadcasting the leading dimensions."""
  # If we have no batch shape, our batch shape broadcasts with everything!
  if self._batch_shape_arg is None:
    return x

  # Static attempt:
  #   If we determine that no broadcast is necessary, pass x through.
  #   If we need a broadcast, add to an array of zeros.
  #
  # special_shape is the shape that, when broadcast with x's shape, will give
  # the correct broadcast_shape. Note that we have already verified the second
  # to last dimension of self.shape matches x's shape in
  # assert_compatible_matrix_dimensions. Also, the final dimension of 'x' can
  # have any shape. Therefore, the final two dimensions of special_shape are 1's.
  special_shape = self.batch_shape.concatenate([1, 1])
  bshape = array_ops.broadcast_static_shape(x.get_shape(), special_shape)
  if special_shape.is_fully_defined():
    # bshape.is_fully_defined iff special_shape.is_fully_defined.
    if bshape == x.get_shape():
      return x
    # Use the built in broadcasting of addition.
    zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
    return x + zeros

  # Dynamic broadcast:
  #   Always add to an array of zeros, rather than using a "cond", since a
  #   cond would require copying data from GPU --> CPU.
  special_shape = array_ops.concat((self.batch_shape_dynamic(), [1, 1]), 0)
  zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
  return x + zeros
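# A minimal NumPy sketch (an illustration, not the operator class above) of the
# broadcast-by-adding-zeros trick used in _possibly_broadcast_batch_shape:
# adding a zeros array of shape batch_shape + [1, 1] broadcasts x up to the
# full batch shape without any conditional copy. Shapes here are assumptions.
import numpy as np

batch_shape = (4, 3)           # hypothetical operator batch shape
x = np.random.rand(3, 2, 5)    # x's own leading dims broadcast against it
special_shape = batch_shape + (1, 1)
broadcast_x = x + np.zeros(special_shape)   # plain broadcasting does the work
print(broadcast_x.shape)       # (4, 3, 2, 5)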
def testLSTMBlockCell(self): with self.test_session(use_gpu=True, graph=ops.Graph()) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m0 = array_ops.zeros([1, 2]) m1 = array_ops.zeros([1, 2]) m2 = array_ops.zeros([1, 2]) m3 = array_ops.zeros([1, 2]) g, ((out_m0, out_m1), (out_m2, out_m3)) = rnn_cell.MultiRNNCell( [lstm_ops.LSTMBlockCell(2) for _ in range(2)], state_is_tuple=True)(x, ((m0, m1), (m2, m3))) sess.run([variables.global_variables_initializer()]) res = sess.run([g, out_m0, out_m1, out_m2, out_m3], { x.name: np.array([[1., 1.]]), m0.name: 0.1 * np.ones([1, 2]), m1.name: 0.1 * np.ones([1, 2]), m2.name: 0.1 * np.ones([1, 2]), m3.name: 0.1 * np.ones([1, 2]) }) self.assertEqual(len(res), 5) self.assertAllClose(res[0], [[0.24024698, 0.24024698]]) # These numbers are from testBasicLSTMCell and only test c/h. self.assertAllClose(res[1], [[0.68967271, 0.68967271]]) self.assertAllClose(res[2], [[0.44848421, 0.44848421]]) self.assertAllClose(res[3], [[0.39897051, 0.39897051]]) self.assertAllClose(res[4], [[0.24024698, 0.24024698]])
def zero_state(self, batch_size, dtype):
    """Return zero-filled state tensor(s).

    Args:
      batch_size: int, float, or unit Tensor representing the batch size.
      dtype: the data type to use for the state.

    Returns:
      If `state_size` is an int or TensorShape, then the return value is a
      `N-D` tensor of shape `[batch_size x state_size]` filled with zeros.

      If `state_size` is a nested list or tuple, then the return value is
      a nested list or tuple (of the same structure) of `2-D` tensors with
      the shapes `[batch_size x s]` for each s in `state_size`.
    """
    state_size = self.state_size
    if nest.is_sequence(state_size):
      state_size_flat = nest.flatten(state_size)
      zeros_flat = [
          array_ops.zeros(
              array_ops.pack(_state_size_with_prefix(s, prefix=[batch_size])),
              dtype=dtype)
          for s in state_size_flat]
      for s, z in zip(state_size_flat, zeros_flat):
        z.set_shape(_state_size_with_prefix(s, prefix=[None]))
      zeros = nest.pack_sequence_as(structure=state_size,
                                    flat_sequence=zeros_flat)
    else:
      zeros_size = _state_size_with_prefix(state_size, prefix=[batch_size])
      zeros = array_ops.zeros(array_ops.pack(zeros_size), dtype=dtype)
      zeros.set_shape(_state_size_with_prefix(state_size, prefix=[None]))

    return zeros
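# A minimal sketch of what zero_state produces, assuming a nested state_size
# such as ((2, 2), (2, 2)); the real method returns tensors, this NumPy
# stand-in only mirrors the flatten-and-repack pattern above.
import numpy as np

def zero_state_like(state_size, batch_size):
  # Rebuild the same nesting, with a [batch_size, s] zeros array per leaf.
  if isinstance(state_size, (tuple, list)):
    return type(state_size)(zero_state_like(s, batch_size) for s in state_size)
  return np.zeros((batch_size, state_size), dtype=np.float32)

state = zero_state_like(((2, 2), (2, 2)), batch_size=3)
print(state[0][0].shape)  # (3, 2)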
def testMultiplyInverseAgainstExplicit(self): with ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) params = array_ops.zeros((2, 2, 2, 2)) inputs = array_ops.zeros((2, 2, 2, 2)) outputs = array_ops.zeros((2, 2, 2, 2)) block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1), 'SAME') block.register_additional_minibatch(inputs, outputs) grads = outputs**2 damping = 0. # This test is only valid without damping. block.instantiate_factors(([grads],), damping) sess.run(state_ops.assign(block._input_factor._cov, _make_psd(8))) sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2))) sess.run(block._input_factor.make_inverse_update_ops()) sess.run(block._output_factor.make_inverse_update_ops()) v_flat = np.arange(16, dtype=np.float32) vector = utils.column_to_tensors(params, array_ops.constant(v_flat)) output = block.multiply_inverse(vector) output_flat = sess.run(utils.tensors_to_column(output)).ravel() full = sess.run(block.full_fisher_block()) explicit = np.dot(np.linalg.inv(full + damping * np.eye(16)), v_flat) self.assertAllClose(output_flat, explicit)
def testBasicLSTMCellWithStateTuple(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) c0 = array_ops.zeros([1, 2]) h0 = array_ops.zeros([1, 2]) state0 = core_rnn_cell_impl.LSTMStateTuple(c0, h0) c1 = array_ops.zeros([1, 2]) h1 = array_ops.zeros([1, 2]) state1 = core_rnn_cell_impl.LSTMStateTuple(c1, h1) cell = core_rnn_cell_impl.MultiRNNCell( [rnn_cell.LayerNormBasicLSTMCell(2) for _ in range(2)]) h, (s0, s1) = cell(x, (state0, state1)) sess.run([variables.global_variables_initializer()]) res = sess.run([h, s0, s1], { x.name: np.array([[1., 1.]]), c0.name: 0.1 * np.asarray([[0, 1]]), h0.name: 0.1 * np.asarray([[2, 3]]), c1.name: 0.1 * np.asarray([[4, 5]]), h1.name: 0.1 * np.asarray([[6, 7]]), }) expected_h = np.array([[-0.38079708, 0.38079708]]) expected_h0 = np.array([[-0.38079708, 0.38079708]]) expected_c0 = np.array([[-1.0, 1.0]]) expected_h1 = np.array([[-0.38079708, 0.38079708]]) expected_c1 = np.array([[-1.0, 1.0]]) self.assertEqual(len(res), 3) self.assertAllClose(res[0], expected_h, 1e-5) self.assertAllClose(res[1].c, expected_c0, 1e-5) self.assertAllClose(res[1].h, expected_h0, 1e-5) self.assertAllClose(res[2].c, expected_c1, 1e-5) self.assertAllClose(res[2].h, expected_h1, 1e-5)
def testBasicRNNCellNotTrainable(self): with self.test_session() as sess: def not_trainable_getter(getter, *args, **kwargs): kwargs["trainable"] = False return getter(*args, **kwargs) with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5), custom_getter=not_trainable_getter): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 2]) cell = rnn_cell_impl.BasicRNNCell(2) g, _ = cell(x, m) self.assertFalse(cell.trainable_variables) self.assertEqual([ "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ], [v.name for v in cell.non_trainable_variables]) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([g], { x.name: np.array([[1., 1.]]), m.name: np.array([[0.1, 0.1]]) }) self.assertEqual(res[0].shape, (1, 2))
def testBasicLSTMCell(self): for dtype in [dtypes.float16, dtypes.float32]: np_dtype = dtype.as_numpy_dtype with self.test_session(graph=ops.Graph()) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2], dtype=dtype) m = array_ops.zeros([1, 8], dtype=dtype) cell = rnn_cell_impl.MultiRNNCell( [ rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=False) self.assertEqual(cell.dtype, None) g, out_m = cell(x, m) # Layer infers the input type. self.assertEqual(cell.dtype, dtype.name) expected_variable_names = [ "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ] self.assertEqual(expected_variable_names, [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([g, out_m], { x.name: np.array([[1., 1.]]), m.name: 0.1 * np.ones([1, 8]) }) self.assertEqual(len(res), 2) variables = variables_lib.global_variables() self.assertEqual(expected_variable_names, [v.name for v in variables]) # The numbers in results were not calculated, this is just a # smoke test. self.assertAllClose(res[0], np.array( [[0.240, 0.240]], dtype=np_dtype), 1e-2) expected_mem = np.array( [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]], dtype=np_dtype) self.assertAllClose(res[1], expected_mem, 1e-2) with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(0.5)): # Test BasicLSTMCell with input_size != num_units. x = array_ops.zeros([1, 3], dtype=dtype) m = array_ops.zeros([1, 4], dtype=dtype) g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g, out_m], { x.name: np.array([[1., 1., 1.]], dtype=np_dtype), m.name: 0.1 * np.ones([1, 4], dtype=np_dtype) }) self.assertEqual(len(res), 2)
def testResidualWrapperWithSlice(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 5]) m = array_ops.zeros([1, 3]) base_cell = rnn_cell_impl.GRUCell(3) g, m_new = base_cell(x, m) variable_scope.get_variable_scope().reuse_variables() def residual_with_slice_fn(inp, out): inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3]) return inp_sliced + out g_res, m_new_res = rnn_cell_impl.ResidualWrapper( base_cell, residual_with_slice_fn)(x, m) sess.run([variables_lib.global_variables_initializer()]) res_g, res_g_res, res_m_new, res_m_new_res = sess.run( [g, g_res, m_new, m_new_res], { x: np.array([[1., 1., 1., 1., 1.]]), m: np.array([[0.1, 0.1, 0.1]]) }) # Residual connections self.assertAllClose(res_g_res, res_g + [1., 1., 1.]) # States are left untouched self.assertAllClose(res_m_new, res_m_new_res)
def testClusterSpecPropagationThreeServers2Graphs(self): """Boots 3 servers, creates 2 sessions, ensures appropriate operations. We create 2 clusterspecs: 1. server2 as the master, server1 as a worker 2. server2 as the master, server3 as a worker We ensure that variables on the workers are independent. """ server1 = server_lib.Server.create_local_server() server2 = server_lib.Server.create_local_server() server3 = server_lib.Server.create_local_server() cluster_def1 = cluster_pb2.ClusterDef() job1 = cluster_def1.job.add() job1.name = 'worker1' job1.tasks[0] = server2.target[len('grpc://'):] job1.tasks[1] = server1.target[len('grpc://'):] cluster_def2 = cluster_pb2.ClusterDef() job2 = cluster_def2.job.add() job2.name = 'worker2' job2.tasks[0] = server2.target[len('grpc://'):] job2.tasks[1] = server3.target[len('grpc://'):] config1 = config_pb2.ConfigProto(cluster_def=cluster_def1) config2 = config_pb2.ConfigProto(cluster_def=cluster_def2) with ops.Graph().as_default() as g1: with ops.device('/job:worker1/task:1'): var1 = variables.Variable(array_ops.zeros([2]), name='var1') update_op1 = state_ops.assign_add( var1, array_ops.ones([2]), name='var1_assign_add') init1 = variables.global_variables_initializer() with ops.Graph().as_default() as g2: with ops.device('/job:worker2/task:1'): var2 = variables.Variable(array_ops.zeros([2]), name='var2') update_op2 = state_ops.assign_add( var2, array_ops.ones([2]), name='var2_assign_add') init2 = variables.global_variables_initializer() sess1 = session.Session(server2.target, graph=g1, config=config1) sess2 = session.Session(server2.target, graph=g2, config=config2) init1.run(session=sess1) init2.run(session=sess2) expected_zeros = np.zeros([2]) expected_ones = np.ones([2]) self.assertAllEqual(expected_zeros, sess1.run(var1)) self.assertAllEqual(expected_zeros, sess2.run(var2)) self.assertAllEqual(expected_ones, sess1.run(update_op1)) self.assertAllEqual(expected_ones, sess1.run(var1)) self.assertAllEqual(expected_zeros, sess2.run(var2)) self.assertAllEqual(expected_ones, sess2.run(update_op2)) self.assertAllEqual(expected_ones + expected_ones, sess1.run(update_op1)) self.assertAllEqual(expected_ones, sess2.run(var2)) self.assertAllEqual(expected_ones + expected_ones, sess1.run(var1))
def _test_logits_helper(self, mode): """Tests that the expected logits are passed to mock head.""" with ops.Graph().as_default(): training_util.get_or_create_global_step() generator_inputs = {'x': array_ops.zeros([5, 4])} real_data = (None if mode == model_fn_lib.ModeKeys.PREDICT else array_ops.zeros([5, 4])) generator_scope_name = 'generator' head = mock_head(self, expected_generator_inputs=generator_inputs, expected_real_data=real_data, generator_scope_name=generator_scope_name) estimator_spec = estimator._gan_model_fn( features=generator_inputs, labels=real_data, mode=mode, generator_fn=generator_fn, discriminator_fn=discriminator_fn, generator_scope_name=generator_scope_name, head=head) with monitored_session.MonitoredTrainingSession( checkpoint_dir=self._model_dir) as sess: if mode == model_fn_lib.ModeKeys.TRAIN: sess.run(estimator_spec.train_op) elif mode == model_fn_lib.ModeKeys.EVAL: sess.run(estimator_spec.loss) elif mode == model_fn_lib.ModeKeys.PREDICT: sess.run(estimator_spec.predictions) else: self.fail('Invalid mode: {}'.format(mode))
def get_start_state(self): # State which matches the format we'll return later. Typically this will not # be used by the model directly, but the shapes and dtypes should match so # that the serving input_receiver_fn gets placeholder shapes correct. return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64), array_ops.zeros( [self.input_window_size, self.num_features], dtype=self.dtype))
def _matmul(self, x, adjoint=False, adjoint_arg=False): if self._assert_proper_shapes: x = linalg.adjoint(x) if adjoint_arg else x aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x) x = control_flow_ops.with_dependencies([aps], x) if self.is_square: # Note that adjoint has no effect since this matrix is self-adjoint. if adjoint_arg: output_shape = array_ops.concat([ array_ops.shape(x)[:-2], [array_ops.shape(x)[-1], array_ops.shape(x)[-2]]], axis=0) else: output_shape = array_ops.shape(x) return self._possibly_broadcast_batch_shape( array_ops.zeros(shape=output_shape, dtype=x.dtype)) x_shape = array_ops.shape(x) n = self._num_columns if adjoint else self._num_rows m = x_shape[-2] if adjoint_arg else x_shape[-1] output_shape = array_ops.concat([x_shape[:-2], [n, m]], axis=0) zeros = array_ops.zeros(shape=output_shape, dtype=x.dtype) return self._possibly_broadcast_batch_shape(zeros)
def testBlockGRUToGRUCellSingleStep(self): with self.session(use_gpu=True, graph=ops.Graph()) as sess: batch_size = 4 cell_size = 5 input_size = 6 seed = 1994 initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed) # Inputs x = array_ops.zeros([batch_size, input_size]) h = array_ops.zeros([batch_size, cell_size]) # Values for the inputs. x_value = np.random.rand(batch_size, input_size) h_value = np.random.rand(batch_size, cell_size) # Output from the basic GRU cell implementation. with vs.variable_scope("basic", initializer=initializer): output = rnn_cell.GRUCell(cell_size)(x, h) sess.run([variables.global_variables_initializer()]) basic_res = sess.run([output], {x: x_value, h: h_value}) # Output from the block GRU cell implementation. with vs.variable_scope("block", initializer=initializer): output = gru_ops.GRUBlockCell(cell_size)(x, h) sess.run([variables.global_variables_initializer()]) block_res = sess.run([output], {x: x_value, h: h_value}) self.assertEqual(len(block_res), len(basic_res)) for block, basic in zip(block_res, basic_res): self.assertAllClose(block, basic)
def attention_decoder(decoder_inputs, initial_state, encoder_states, rel_scores, cell, initial_state_attention=False, pointer_gen=True, use_coverage=False, prev_coverage=None): """ Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. encoder_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. initial_state_attention: Note that this attention decoder passes each decoder input through a linear layer with the previous step's context vector to get a modified version of the input. If initial_state_attention is False, on the first decoder step the "previous context vector" is just a zero vector. If initial_state_attention is True, we use initial_state to (re)calculate the previous step's context vector. We set this to False for train/eval mode (because we call attention_decoder once for all decoder steps) and True for decode mode (because we call attention_decoder once for each decoder step). pointer_gen: boolean. If True, calculate the generation probability p_gen for each decoder step. use_coverage: boolean. If True, use coverage mechanism. prev_coverage: If not None, a tensor with shape (batch_size, attn_length). The previous step's coverage vector. This is only not None in decode mode when using coverage. Returns: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x cell.output_size]. The output vectors. state: The final state of the decoder. A tensor shape [batch_size x cell.state_size]. attn_dists: A list containing tensors of shape (batch_size,attn_length). The attention distributions for each decoder step. p_gens: List of scalars. The values of p_gen for each decoder step. Empty list if pointer_gen=False. coverage: Coverage vector on the last step computed. None if use_coverage=False. """ with variable_scope.variable_scope("attention_decoder") as scope: batch_size = encoder_states.get_shape( )[0].value # if this line fails, it's because the batch size isn't defined attn_size = encoder_states.get_shape( )[2].value # if this line fails, it's because the attention length isn't defined # Reshape encoder_states (need to insert a dim) encoder_states = tf.expand_dims( encoder_states, axis=2) # now is shape (batch_size, attn_len, 1, attn_size) # To calculate attention, we calculate # v^T tanh(W_h h_i + W_s s_t + b_attn) # where h_i is an encoder state, and s_t a decoder state. # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t). # We set it to be equal to the size of the encoder states. 
attention_vec_size = attn_size # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features W_h = variable_scope.get_variable( "W_h", [1, 1, attn_size, attention_vec_size]) encoder_features = nn_ops.conv2d( encoder_states, W_h, [1, 1, 1, 1], "SAME") # shape (batch_size,attn_length,1,attention_vec_size) # Get the weight vectors v and w_c (w_c is for coverage) v = variable_scope.get_variable("v", [attention_vec_size]) if use_coverage: with variable_scope.variable_scope("coverage"): w_c = variable_scope.get_variable( "w_c", [1, 1, 1, attention_vec_size]) if prev_coverage is not None: # for beam search mode with coverage # reshape from (batch_size, attn_length) to (batch_size, attn_len, 1, 1) prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3) def attention(decoder_state, coverage=None): """Calculate the context vector and attention distribution from the decoder state. Args: decoder_state: state of the decoder coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1). Returns: context_vector: weighted sum of encoder_states attn_dist: attention distribution coverage: new coverage vector. shape (batch_size, attn_len, 1, 1) """ with variable_scope.variable_scope("Attention"): # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper) decoder_features = linear( decoder_state, attention_vec_size, True) # shape (batch_size, attention_vec_size) decoder_features = tf.expand_dims( tf.expand_dims(decoder_features, 1), 1) # reshape to (batch_size, 1, 1, attention_vec_size) if use_coverage and coverage is not None: # non-first step of coverage # Multiply coverage vector by w_c to get coverage_features. coverage_features = nn_ops.conv2d( coverage, w_c, [1, 1, 1, 1], "SAME" ) # c has shape (batch_size, attn_length, 1, attention_vec_size) # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn) e = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3]) # shape (batch_size,attn_length) #e = tf.multiply(e, rel_scores) # Take softmax of e to get the attention distribution attn_dist = nn_ops.softmax( e) # shape (batch_size, attn_length) # Update coverage vector coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) else: # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) e = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # calculate e #e = tf.multiply(e, rel_scores) # Take softmax of e to get the attention distribution attn_dist = nn_ops.softmax( e) # shape (batch_size, attn_length) if use_coverage: # first step of training coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2) # initialize coverage # Calculate the context vector from attn_dist and encoder_states context_vector = math_ops.reduce_sum( array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states, [1, 2]) # shape (batch_size, attn_size). context_vector = array_ops.reshape(context_vector, [-1, attn_size]) return context_vector, attn_dist, coverage outputs = [] attn_dists = [] p_gens = [] state = initial_state coverage = prev_coverage # initialize coverage to None or whatever was passed in context_vector = array_ops.zeros([batch_size, attn_size]) context_vector.set_shape([ None, attn_size ]) # Ensure the second shape of attention vectors is set. 
if initial_state_attention: # true in decode mode # Re-calculate the context vector from the previous step so that we can pass it through a linear layer with this step's input to get a modified version of the input context_vector, _, coverage = attention( initial_state, coverage ) # in decode mode, this is what updates the coverage vector for i, inp in enumerate(decoder_inputs): tf.logging.info("Adding attention_decoder timestep %i of %i", i, len(decoder_inputs)) if i > 0: variable_scope.get_variable_scope().reuse_variables() # Merge input and previous attentions into one vector x of the same size as inp input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + [context_vector], input_size, True) # Run the decoder RNN cell. cell_output = decoder state cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: # always true in decode mode with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=True ): # you need this because you've already run the initial attention(...) call context_vector, attn_dist, _ = attention( state, coverage) # don't allow coverage to update else: context_vector, attn_dist, coverage = attention( state, coverage) attn_dists.append(attn_dist) # Calculate p_gen if pointer_gen: with tf.variable_scope('calculate_pgen'): p_gen = linear([context_vector, state.c, state.h, x], 1, True) # a scalar p_gen = tf.sigmoid(p_gen) p_gens.append(p_gen) # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer # This is V[s_t, h*_t] + b in the paper with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + [context_vector], cell.output_size, True) outputs.append(output) # If using coverage, reshape it if coverage is not None: coverage = array_ops.reshape(coverage, [batch_size, -1]) return outputs, state, attn_dists, p_gens, coverage
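# A minimal NumPy sketch of the additive-attention score computed above
# (e_i = v^T tanh(W_h h_i + W_s s_t), attn = softmax(e),
# context = sum_i attn_i * h_i); the weights and shapes are illustrative
# assumptions, not the trained variables of the decoder.
import numpy as np

def additive_attention(encoder_states, decoder_state, W_h, W_s, v):
  # encoder_states: (attn_len, attn_size); decoder_state: (state_size,)
  encoder_features = encoder_states @ W_h                # (attn_len, vec_size)
  decoder_features = decoder_state @ W_s                 # (vec_size,)
  e = np.tanh(encoder_features + decoder_features) @ v   # (attn_len,)
  attn_dist = np.exp(e - e.max())
  attn_dist /= attn_dist.sum()                           # softmax over positions
  context_vector = attn_dist @ encoder_states            # (attn_size,)
  return context_vector, attn_dist

h = np.random.randn(6, 8)   # 6 encoder states of size 8
s = np.random.randn(4)      # decoder state of size 4
ctx, attn = additive_attention(h, s, np.random.randn(8, 8),
                               np.random.randn(4, 8), np.random.randn(8))
print(ctx.shape, round(attn.sum(), 6))   # (8,) 1.0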
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1): """Fit loop for training with TPU tf.distribute.Strategy. Arguments: model: Keras Model instance. dataset: Dataset that returns inputs and targets epochs: Number of times to iterate over the data verbose: Integer, Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. val_dataset: Dataset for validation data. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.abc.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. Returns: Returns `None`. Raises: ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN current_strategy = model._distribution_strategy iteration_value = min(steps_per_epoch, current_strategy.extended.steps_per_run) steps_per_run = K.variable( value=iteration_value, dtype='int32', name='steps_per_run') # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. iterator = dist_utils.get_iterator(dataset, current_strategy) scope = dist_utils.distributed_scope( strategy=current_strategy, learning_phase=1) scope.__enter__() out_labels = model.metrics_names or [] step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels) # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = constant_op.constant(1e7) for m in model._get_training_eval_metrics(): tensor = m.result() initial_loop_values[m.name] = array_ops.zeros(tensor.shape, tensor.dtype) ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) train_op = ctx.run_op output_tensors = ctx.last_step_outputs do_validation = bool(validation_steps) if model._compile_distribution: dist_utils._copy_weights_to_distributed_model(model, mode) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=verbose, count_mode='steps', mode=mode) # Calculate the steps each time on the device. 
  steps_to_run = ([current_strategy.extended.steps_per_run] *
                  (steps_per_epoch //
                   current_strategy.extended.steps_per_run))
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)
  target_steps = len(steps_to_run)

  callbacks._call_begin_hook(mode)

  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)
  for epoch in range(initial_epoch, epochs):
    dist_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step]
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        K.get_session().run(steps_per_run.assign(step_count))
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break

  model._successful_loop_finish = True
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)

  scope.__exit__(None, None, None)
  return model.history
def _prepare_local(self, var_device, var_dtype, apply_state):
  super(Adagrad, self)._prepare_local(var_device, var_dtype, apply_state)
  apply_state[(var_device, var_dtype)].update(
      dict(
          epsilon=ops.convert_to_tensor_v2(self.epsilon, var_dtype),
          neg_lr_t=-apply_state[(var_device, var_dtype)]['lr_t'],
          zero=array_ops.zeros((), dtype=dtypes.int64)))
def __call__(self, x):
  current_sum = array_ops.zeros([], dtype=dtypes.int64)
  for element in self.dataset:
    current_sum += x * element
  return current_sum
def func(x, dtype=None):
  if dtype:
    return array_ops.zeros(shape=x.shape, dtype=dtype)
  else:
    return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] inputs = [] var_list = [] for x in grads_and_vars: inputs.extend(list(x)) with ops.device(global_step.device): self._local_steps = variables.Variable(array_ops.zeros( [self._total_num_replicas], dtype=global_step.dtype), trainable=False, name="local_steps") # Check staleness. Note that this has to be ref(), otherwise identity will # be accessed and it will be old values. local_step = array_ops.slice(self._local_steps.ref(), array_ops.reshape(self._replica_id, (1, )), [1], name="get_local_step") local_step = array_ops.reshape(local_step, ()) is_stale = math_ops.less(local_step, global_step) with ops.op_scope(inputs, None, self._name): for grad, var in grads_and_vars: var_list.append(var) with ops.device(var.device): if isinstance(grad, ops.Tensor): gradient_queue = (data_flow_ops.FIFOQueue( self._tokens_per_step * 2, grad.dtype, shapes=var.get_shape(), shared_name=var.name)) self._one_element_queue_list.append( (gradient_queue, var.device)) train_ops.append(gradient_queue.enqueue([grad])) # Aggregate all gradients gradients = gradient_queue.dequeue_many( self._replicas_to_aggregate) aggregated_grad.append( math_ops.reduce_sum(gradients, [0])) elif grad is None: aggregated_grad.append(None) # pass-through. else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") aggregated_grad.append( self._aggregate_sparse_grad(grad, var, train_ops)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): update_op = self._opt.apply_gradients( aggregated_grads_and_vars, global_step) # Create token queue. with ops.device(global_step.device), ops.name_scope(""): sync_token_queue = (data_flow_ops.FIFOQueue( -1, global_step.dtype.base_dtype, shapes=(), shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")) # Clear all the gradients queues in case there are stale gradients. 
clear_queue_ops = [] with ops.control_dependencies([update_op]): for queue, dev in self._one_element_queue_list: with ops.device(dev): stale_grads = queue.dequeue_many(queue.size()) clear_queue_ops.append(stale_grads) for queue, dev in self._sparse_grad_queues_and_devs: with ops.device(dev): _, stale_indices = queue.dequeue_many(queue.size()) clear_queue_ops.append(stale_indices) with ops.device(global_step.device): self._clean_up_op = control_flow_ops.abort( error_msg="From sync_replicas") # According to the staleness, select between the enqueue op (real_grad) # or no-op (no_op_grad). Effectively dropping all the stale gradients. no_op_grad = lambda: [ control_flow_ops.no_op(name="no_grad_enqueue") ] real_grad = lambda: [control_flow_ops.group(*train_ops)] final_train_ops = control_flow_ops.cond(is_stale, no_op_grad, real_grad) with ops.device(global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. with ops.control_dependencies([final_train_ops]): token = sync_token_queue.dequeue() train_op = state_ops.scatter_update( self._local_steps, self._replica_id, token) with ops.control_dependencies(clear_queue_ops): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. # Note that ref() is used to avoid reading from the identity with old # the step. tokens = array_ops.fill([self._tokens_per_step], global_step.ref()) sync_op = sync_token_queue.enqueue_many((tokens, )) if self._variable_averages is not None: with ops.control_dependencies([sync_op ]), ops.name_scope(""): sync_op = self._variable_averages.apply( self._variables_to_average) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) self._gradients_applied = True return train_op
def testClipByNormGradientZeros(self):
  with self.session(use_gpu=True):
    x = array_ops.zeros([3])
    b = clip_ops.clip_by_norm(x, 1.)
    grad, = gradients_impl.gradients(b, x)
    self.assertAllEqual(grad.eval(), [1., 1., 1.])
def _SignGrad(op, _):
  """Returns 0."""
  x = op.inputs[0]
  return array_ops.zeros(array_ops.shape(x), dtype=x.dtype)
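# A minimal NumPy sketch of the same gradient rule: sign(x) has zero derivative
# almost everywhere, so the registered gradient is simply zeros shaped like x.
import numpy as np

def sign_grad(x, unused_upstream_grad):
  return np.zeros_like(x)

print(sign_grad(np.array([-2.0, 0.0, 3.0]), None))  # [0. 0. 0.]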
def __init__(self, loc, scale, skewness=None, tailweight=None, distribution=None, validate_args=False, allow_nan_stats=True, name="SinhArcsinh"): """Construct SinhArcsinh distribution on `(-inf, inf)`. Arguments `(loc, scale, skewness, tailweight)` must have broadcastable shape (indexing batch dimensions). They must all have the same `dtype`. Args: loc: Floating-point `Tensor`. scale: `Tensor` of same `dtype` as `loc`. skewness: Skewness parameter. Default is `0.0` (no skew). tailweight: Tailweight parameter. Default is `1.0` (unchanged tailweight) distribution: `tf.Distribution`-like instance. Distribution that is transformed to produce this distribution. Default is `tfp.distributions.Normal(0., 1.)`. Must be a scalar-batch, scalar-event distribution. Typically `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is a function of non-trainable parameters. WARNING: If you backprop through a `SinhArcsinh` sample and `distribution` is not `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then the gradient will be incorrect! validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale, skewness, tailweight]) as name: loc = ops.convert_to_tensor(loc, name="loc") dtype = loc.dtype scale = ops.convert_to_tensor(scale, name="scale", dtype=dtype) tailweight = 1. if tailweight is None else tailweight has_default_skewness = skewness is None skewness = 0. if skewness is None else skewness tailweight = ops.convert_to_tensor(tailweight, name="tailweight", dtype=dtype) skewness = ops.convert_to_tensor(skewness, name="skewness", dtype=dtype) batch_shape = distribution_util.get_broadcast_shape( loc, scale, tailweight, skewness) # Recall, with Z a random variable, # Y := loc + C * F(Z), # F(Z) := Sinh( (Arcsinh(Z) + skewness) * tailweight ) # F_0(Z) := Sinh( Arcsinh(Z) * tailweight ) # C := 2 * scale / F_0(2) if distribution is None: distribution = normal.Normal(loc=array_ops.zeros([], dtype=dtype), scale=array_ops.ones([], dtype=dtype), allow_nan_stats=allow_nan_stats) else: asserts = distribution_util.maybe_check_scalar_distribution( distribution, dtype, validate_args) if asserts: loc = control_flow_ops.with_dependencies(asserts, loc) # Make the SAS bijector, 'F'. f = bijectors.SinhArcsinh(skewness=skewness, tailweight=tailweight) if has_default_skewness: f_noskew = f else: f_noskew = bijectors.SinhArcsinh( skewness=skewness.dtype.as_numpy_dtype(0.), tailweight=tailweight) # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2)) c = 2 * scale / f_noskew.forward( ops.convert_to_tensor(2, dtype=dtype)) affine = bijectors.AffineScalar(shift=loc, scale=c, validate_args=validate_args) bijector = bijectors.Chain([affine, f]) super(SinhArcsinh, self).__init__(distribution=distribution, bijector=bijector, batch_shape=batch_shape, validate_args=validate_args, name=name) self._parameters = parameters self._loc = loc self._scale = scale self._tailweight = tailweight self._skewness = skewness
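# A minimal NumPy sketch of the transformation documented above, assuming the
# same parameterization:
#   Y = loc + C * F(Z),  F(Z) = sinh((arcsinh(Z) + skewness) * tailweight),
#   C = 2 * scale / F_0(2),  F_0(Z) = sinh(arcsinh(Z) * tailweight).
import numpy as np

def sinh_arcsinh_transform(z, loc, scale, skewness=0.0, tailweight=1.0):
  f = np.sinh((np.arcsinh(z) + skewness) * tailweight)
  f0_of_2 = np.sinh(np.arcsinh(2.0) * tailweight)
  c = 2.0 * scale / f0_of_2
  return loc + c * f

z = np.random.randn(5)  # base samples, e.g. from a scalar-batch Normal(0, 1)
# With default skewness/tailweight this reduces to loc + scale * z.
print(sinh_arcsinh_transform(z, loc=1.0, scale=0.5))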
def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm): """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*. Args: relu: Callable that returns an Operation, a factory method for the Relu*. relu_op_name: String, name of the Relu* operation. with_bypass: Bool, when true there is an extra connection added from inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. """ g = ops.Graph() with g.as_default(): batch_size, height, width = 5, 128, 128 inputs = array_ops.zeros((batch_size, height, width, 3)) stride = 1 if with_bypass else 2 activation_fn = None if with_bypass else relu scope = 'test/test2' if with_bypass else 'test' node = separable_conv2d( inputs, None, [5, 5], stride=stride, depth_multiplier=1.0, padding='SAME', weights_initializer=self._WeightInit(0.09), activation_fn=activation_fn, normalizer_fn=batch_norm, normalizer_params=self._BatchNormParams( scale=has_scaling, fused=fused_batch_norm), scope=scope) if with_bypass: node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) fold_batch_norms.FoldBatchNorms(g) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') if fused_batch_norm: scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape' else: scale_reshape_op_name = scope + '/scale_reshape' self._AssertInputOpsAre(folded_mul, [scope + '/depthwise_weights/read', scale_reshape_op_name]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold']) scale_reshape = g.get_operation_by_name(scale_reshape_op_name) self.assertEqual(scale_reshape.type, 'Reshape') self._AssertInputOpsAre(scale_reshape, [ self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm), scale_reshape_op_name + '/shape' ]) self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold']) folded_conv = g.get_operation_by_name(scope + '/depthwise_Fold') self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ scope + '/depthwise_Fold', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names)
def testSmokeScatterNdBatch2DSliceDim3ShapeRank7(self): with self.test_session(): indices = array_ops.zeros([1, 2, 3], dtype=dtypes.int32) values = array_ops.zeros([1, 2, 6, 7, 8, 9]) shape = [3, 4, 5, 6, 7, 8, 9] self.scatter_nd(indices, values, shape).eval()
def _ResizeBilinearGrad(op, grads): return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype), tf.raw_ops.ResizeBilinearGrad(grads=grads, original_image=op.inputs[1]))
def testLSTMBasicToBlockCellPeeping(self): with self.test_session(use_gpu=True) as sess: x = array_ops.zeros([1, 2]) x_values = np.random.randn(1, 2) m0_val = 0.1 * np.ones([1, 2]) m1_val = -0.1 * np.ones([1, 2]) m2_val = -0.2 * np.ones([1, 2]) m3_val = 0.2 * np.ones([1, 2]) initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212) with variable_scope.variable_scope("basic", initializer=initializer): m0 = array_ops.zeros([1, 2]) m1 = array_ops.zeros([1, 2]) m2 = array_ops.zeros([1, 2]) m3 = array_ops.zeros([1, 2]) g, ((out_m0, out_m1), (out_m2, out_m3)) = rnn_cell.MultiRNNCell( [ rnn_cell.LSTMCell( 2, use_peepholes=True, state_is_tuple=True) for _ in range(2) ], state_is_tuple=True)(x, ((m0, m1), (m2, m3))) sess.run([variables.global_variables_initializer()]) basic_res = sess.run( [g, out_m0, out_m1, out_m2, out_m3], { x.name: x_values, m0.name: m0_val, m1.name: m1_val, m2.name: m2_val, m3.name: m3_val }) with variable_scope.variable_scope("block", initializer=initializer): m0 = array_ops.zeros([1, 2]) m1 = array_ops.zeros([1, 2]) m2 = array_ops.zeros([1, 2]) m3 = array_ops.zeros([1, 2]) g, ((out_m0, out_m1), (out_m2, out_m3)) = rnn_cell.MultiRNNCell( [ lstm_ops.LSTMBlockCell(2, use_peephole=True) for _ in range(2) ], state_is_tuple=True)(x, ((m0, m1), (m2, m3))) sess.run([variables.global_variables_initializer()]) block_res = sess.run( [g, out_m0, out_m1, out_m2, out_m3], { x.name: x_values, m0.name: m0_val, m1.name: m1_val, m2.name: m2_val, m3.name: m3_val }) self.assertEqual(len(basic_res), len(block_res)) for basic, block in zip(basic_res, block_res): self.assertAllClose(basic, block)
def scatter_nd(self, indices, updates, shape, input_=None): input_ = (input_ if input_ is not None else array_ops.zeros( shape, dtype=updates.dtype)) return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
def testZeroLengthDim(self): x = array_ops.zeros(shape=(0, 1, 2)) y = self.evaluate(array_ops.unstack(x, axis=1)[0]) self.assertEqual(y.shape, (0, 2))
def testSmokeScatterNdBatch1DSliceDim2(self): with self.test_session(): indices = array_ops.zeros([0, 2], dtype=dtypes.int32) values = array_ops.zeros([0, 7]) shape = [4, 6, 7] self.scatter_nd(indices, values, shape).eval()
def attention_decoder(decoder_inputs, initial_state, encoder_states, enc_padding_mask, cell, initial_state_attention=False): with variable_scope.variable_scope("attention_decoder") as scope: # if this line fails, it's because the batch size isn't defined # split encoder_states into the per-example rows of each batch batch_size = encoder_states.get_shape()[0].value # if this line fails, it's because the attention length isn't defined attn_size = encoder_states.get_shape()[2].value print(attn_size) # not clear why expand_dims is needed here # shape (batch_size, attn_len, 1, attn_size) encoder_states = tf.expand_dims(encoder_states, axis=2) attention_vec_size = attn_size # need to check the concrete shape and values of W_h #W_h(1,1,400,400) W_h = variable_scope.get_variable("W_h", [1, 1, attn_size, attention_vec_size]) # shape (batch_size,attn_length,1,attention_vec_size) # verified by experiment that this is definitely not just a shape change; the values change as well #see code: # not clear why a convolution transform is used here # encoder_states is transformed here by convolving it with W_h [shape,shape]; the values change rather than it being a pure reshape. encoder_features = nn_ops.conv2d(encoder_states, W_h, [1, 1, 1, 1], "SAME") # question about initialization: the weight initializer of tf.layers.conv2d defaults to None, yet training works even without passing one, # reading the source shows tf.layers.conv2d inherits from tf.keras.layer.Conv, where the parent class defines the initialization, # kernel_initializer='glorot_uniform' initializes the convolution kernel uniformly # f(x)*g(t-x): a convolution transform is applied here; the open question is what values the declared variable takes # Get the weight vectors v and w_c (w_c is for coverage) # v is declared here directly as a tensor, without an explicit weight initializer # the standard way would be to build a network layer that handles initialization, but there is no need to worry about the initial values here: the variable is initialized automatically by the default initializer at initialization time. It serves the same purpose as in the tf examples, just written with a different function v = variable_scope.get_variable("v", [attention_vec_size]) # Bahdanau attention is used here to compute the attention distribution def attention(decoder_state): with variable_scope.variable_scope("attention"): # Pass the decoder state through a linear layer # shape (batch_size, attention_vec_size) # linear computes a*decoder_state+bias (this understanding may be wrong?) #decoder_features=[32,200] #decoder_state=decoder_state #linear(args, output_size, bias, bias_start=0.0, scope=None): # the mapping between these arguments is not fully clear. # decoder_state attention_vec_size print(decoder_state) # linear does not really change anything here beyond list/tuple checks; if decoder_state has the right form it passes through unchanged decoder_features = linear(decoder_state, attention_vec_size, True) # reshape to (batch_size, 1, 1, attention_vec_size) # still not clear why this reshape is needed; it takes a lot of time to run - what is it for? decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1) def masked_attention(e): """Take softmax of e then apply enc_padding_mask and re-normalize""" attn_dist = nn_ops.softmax(e) # take softmax. shape (batch_size, attn_length) attn_dist *= enc_padding_mask # apply mask masked_sums = tf.reduce_sum(attn_dist, axis=1) # shape (batch_size) return attn_dist / tf.reshape(masked_sums, [-1, 1]) # re-normalize # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) to obtain the score values #score = FC(tanh(FC(EO) + FC(H))) #FC = Fully connected (dense) layer #EO = Encoder output #H = hidden state #X = input to the decoder #e=score #V=400 e = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # Calculate attention distribution attn_dist = masked_attention(e) # Calculate the context vector from attn_dist and encoder_states # reshape attn_dist and multiply it with the input encoder_states to get the attention-weighted context # the matrix shapes here are confusing; the returned attn_dist is probably unused context_vector = math_ops.reduce_sum( array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states, [1, 2]) # shape (batch_size, attn_size). context_vector = array_ops.reshape(context_vector, [-1, attn_size]) return context_vector, attn_dist outputs = [] attn_dists = [] state = initial_state context_vector = array_ops.zeros([batch_size, attn_size]) # Ensure the second shape of attention vectors is set.
context_vector.set_shape([None, attn_size]) if initial_state_attention: # true in decode mode # Re-calculate the context vector from the previous step # so that we can pass it through a linear layer with # this step's input to get a modified version of the input context_vector, _ = attention(initial_state) for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # Merge input and previous attentions into one vector x of the same size as inp input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + [context_vector], input_size, True) cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: # always true in decode mode with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): context_vector, attn_dist = attention(state) else: context_vector, attn_dist = attention(state) attn_dists.append(attn_dist) # Concatenate the cell_output (= decoder state) and the context vector, # and pass them through a linear layer with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + [context_vector], cell.output_size, True) outputs.append(output) return outputs, state, attn_dists
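As a companion to attention_decoder above, the following NumPy sketch illustrates the Bahdanau-style scoring and masked re-normalization performed inside attention(); the helper name bahdanau_scores and the shapes are assumptions for illustration, not part of the original code.

import numpy as np

def bahdanau_scores(encoder_features, decoder_features, v, padding_mask):
  # encoder_features: [batch, attn_len, attn_vec]; decoder_features: [batch, attn_vec]
  # e_i = v . tanh(W_h h_i + W_s s_t), computed per encoder position.
  e = np.sum(v * np.tanh(encoder_features + decoder_features[:, None, :]), axis=-1)
  attn = np.exp(e - e.max(axis=1, keepdims=True))
  attn /= attn.sum(axis=1, keepdims=True)     # softmax over attn_len
  attn *= padding_mask                        # zero out padded positions
  attn /= attn.sum(axis=1, keepdims=True)     # re-normalize, as in masked_attention
  return attn

batch, attn_len, attn_vec = 2, 4, 3
rng = np.random.RandomState(0)
attn = bahdanau_scores(rng.randn(batch, attn_len, attn_vec),
                       rng.randn(batch, attn_vec),
                       rng.randn(attn_vec),
                       np.array([[1., 1., 1., 0.], [1., 1., 0., 0.]]))
print(attn.sum(axis=1))  # each row sums to 1 after masking and re-normalizing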
def attention_single_output_decoder(initial_state, attention_states, output_size=None, num_heads=1, dtype=dtypes.float32, scope=None, sequence_length=array_ops.ones([16]), initial_state_attention=True, use_attention=False): if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) with variable_scope.variable_scope(scope or "decoder_single_output"): # print (initial_state.eval().shape) batch_size = array_ops.shape(initial_state)[0] # Needed for reshaping. # print (attention_states.get_shape()) attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) # state = initial_state def attention(query, use_attention=False): """Put attention masks on hidden using hidden_features and query.""" attn_weights = [] ds = [] # Results of attention reads will be stored here. for i in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % i): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3]) if use_attention is False: # apply mean pooling weights = tf.tile(sequence_length, tf.stack([attn_length])) weights = array_ops.reshape(weights, tf.shape(s)) a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights) # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1]) else: a = nn_ops.softmax(s) attn_weights.append(a) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return attn_weights, ds batch_attn_size = array_ops.stack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attn_weights, attns = attention(initial_state, use_attention=use_attention) #with variable_scope.variable_scope(scope or "Linear"): matrix = variable_scope.get_variable("Out_Matrix", [attn_size, output_size]) res = math_ops.matmul(attns[0], matrix) # NOTE: here we temporarily assume num_head = 1 bias_start = 0.0 bias_term = variable_scope.get_variable("Out_Bias", [output_size], initializer=init_ops.constant_initializer(bias_start)) output = res + bias_term return attention_states, attn_weights[0], attns[0], [output] # NOTE: here we temporarily assume num_head = 1
def _build(): return (array_ops.ones([2, 2], dtype=dtype), array_ops.zeros([3, 3], dtype=dtype))
def conjugate_gradient(operator, rhs, preconditioner=None, x=None, tol=1e-4, max_iter=20, name="conjugate_gradient"): r"""Conjugate gradient solver. Solves a linear system of equations `A*x = rhs` for a self-adjoint, positive definite matrix `A` and right-hand side vector `rhs`, using an iterative, matrix-free algorithm where the action of the matrix A is represented by `operator`. The iteration terminates when either the number of iterations exceeds `max_iter` or when the residual norm has been reduced to `tol` times its initial value, i.e. \\(||rhs - A x_k|| <= tol ||rhs||\\). Args: operator: An object representing a linear operator with attributes: - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of length 2. `shape[0]` is the dimension on the domain of the operator, `shape[1]` is the dimension of the co-domain of the operator. In other words, if operator represents an N x N matrix A, `shape` must contain `[N, N]`. - dtype: The datatype of input to and output from `apply`. - apply: Callable object taking a vector `x` as input and returning a vector with the result of applying the operator to `x`, i.e. if `operator` represents matrix `A`, `apply` should return `A * x`. rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side vector. preconditioner: An object representing a linear operator, see `operator` for detail. The preconditioner should approximate the inverse of `A`. An efficient preconditioner could dramatically improve the rate of convergence. If `preconditioner` represents matrix `M` (`M` approximates `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate `A^{-1}x`. For this to be useful, the cost of applying `M` should be much lower than computing `A^{-1}` directly. x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the solution. tol: A float scalar convergence tolerance. max_iter: An integer giving the maximum number of iterations. name: A name scope for the operation. Returns: output: A namedtuple representing the final state with fields: - i: A scalar `int32` `Tensor`. Number of iterations executed. - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution. - r: A rank-1 `Tensor` of shape `[N]` containing the residual vector. - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector. - gamma: \\(r \cdot M \cdot r\\), equivalent to \\(||r||_2^2\\) when `preconditioner=None`. """ # ephemeral class holding CG state.
cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"]) def stopping_criterion(i, state): return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol) def cg_step(i, state): # pylint: disable=missing-docstring z = operator.apply(state.p) alpha = state.gamma / util.dot(state.p, z) x = state.x + alpha * state.p r = state.r - alpha * z if preconditioner is None: gamma = util.dot(r, r) beta = gamma / state.gamma p = r + beta * state.p else: q = preconditioner.apply(r) gamma = util.dot(r, q) beta = gamma / state.gamma p = q + beta * state.p return i + 1, cg_state(i + 1, x, r, p, gamma) with ops.name_scope(name): n = operator.shape[1:] rhs = array_ops.expand_dims(rhs, -1) if x is None: x = array_ops.expand_dims( array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1) r0 = rhs else: x = array_ops.expand_dims(x, -1) r0 = rhs - operator.apply(x) if preconditioner is None: p0 = r0 else: p0 = preconditioner.apply(r0) gamma0 = util.dot(r0, p0) tol *= linalg_ops.norm(r0) i = constant_op.constant(0, dtype=dtypes.int32) state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0) _, state = control_flow_ops.while_loop(stopping_criterion, cg_step, [i, state]) return cg_state(state.i, x=array_ops.squeeze(state.x), r=array_ops.squeeze(state.r), p=array_ops.squeeze(state.p), gamma=state.gamma)
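A hypothetical usage sketch of conjugate_gradient as defined above: the operator only needs shape, dtype and apply attributes, here supplied by an ad-hoc namedtuple wrapping a small symmetric positive definite matrix. The name LinearOperator and the TF 1.x session usage are assumptions for illustration, not part of the module.

import collections
import numpy as np
import tensorflow as tf

LinearOperator = collections.namedtuple("LinearOperator", ["shape", "dtype", "apply"])

a_np = np.array([[4., 1.], [1., 3.]], dtype=np.float32)  # symmetric positive definite
a = tf.constant(a_np)
operator = LinearOperator(shape=[2, 2], dtype=tf.float32,
                          apply=lambda v: tf.matmul(a, v))
rhs = tf.constant([1., 2.], dtype=tf.float32)

result = conjugate_gradient(operator, rhs, tol=1e-6, max_iter=50)
with tf.Session() as sess:
  # result.x should approach np.linalg.solve(a_np, [1., 2.])
  print(sess.run(result.x))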
def raw_rnn(cell, loop_fn, parallel_iterations=None, swap_memory=False, scope=None): """ raw_rnn adapted from the original tensorflow implementation (https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/rnn.py) to emit arbitrarily nested states for each time step (concatenated along the time axis) in addition to the outputs at each timestep and the final state returns ( states for all timesteps, outputs for all timesteps, final cell state, ) """ # if not _like_rnncell(cell): #if not assert_like_rnncell("error", cell): # raise TypeError(f"cell must be an instance of RNNCell {type(cell)}") #if not callable(loop_fn): # raise TypeError("loop_fn must be a callable") parallel_iterations = parallel_iterations or 32 # Create a new scope in which the caching device is either # determined by the parent scope, or is set to place the cached # Variable using the same placement as for the rest of the RNN. with vs.variable_scope(scope or "rnn") as varscope: #if context.in_graph_mode(): # if varscope.caching_device is None: # varscope.set_caching_device(lambda op: op.device) time = constant_op.constant(0, dtype=dtypes.int32) (elements_finished, next_input, initial_state, emit_structure, init_loop_state) = loop_fn(time, None, None, None) flat_input = nest.flatten(next_input) # Need a surrogate loop state for the while_loop if none is available. loop_state = (init_loop_state if init_loop_state is not None else constant_op.constant(0, dtype=dtypes.int32)) input_shape = [input_.get_shape() for input_ in flat_input] static_batch_size = input_shape[0][0] for input_shape_i in input_shape: # Static verification that batch sizes all match if static_batch_size: static_batch_size.merge_with(input_shape_i[0]) else: static_batch_size = input_shape_i[0] batch_size = static_batch_size.value if static_batch_size else None const_batch_size = batch_size if batch_size is None: batch_size = array_ops.shape(flat_input[0])[0] nest.assert_same_structure(initial_state, cell.state_size) state = initial_state flat_state = nest.flatten(state) flat_state = [ops.convert_to_tensor(s) for s in flat_state] state = nest.pack_sequence_as(structure=state, flat_sequence=flat_state) if emit_structure is not None: flat_emit_structure = nest.flatten(emit_structure) flat_emit_size = [emit.shape if emit.shape.is_fully_defined() else array_ops.shape(emit) for emit in flat_emit_structure] flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure] else: emit_structure = cell.output_size flat_emit_size = nest.flatten(emit_structure) flat_emit_dtypes = [flat_state[0].dtype] * len(flat_emit_size) flat_state_size = [s.shape if s.shape.is_fully_defined() else array_ops.shape(s) for s in flat_state] flat_state_dtypes = [s.dtype for s in flat_state] flat_emit_ta = [ tensor_array_ops.TensorArray( dtype=dtype_i, dynamic_size=True, element_shape=(tensor_shape.TensorShape([const_batch_size]) .concatenate(_maybe_tensor_shape_from_tensor(size_i))), size=0, name="rnn_output_%d" % i ) for i, (dtype_i, size_i) in enumerate(zip(flat_emit_dtypes, flat_emit_size)) ] emit_ta = nest.pack_sequence_as(structure=emit_structure, flat_sequence=flat_emit_ta) flat_zero_emit = [ array_ops.zeros(_concat(batch_size, size_i), dtype_i) for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)] zero_emit = nest.pack_sequence_as(structure=emit_structure, flat_sequence=flat_zero_emit) flat_state_ta = [ tensor_array_ops.TensorArray( dtype=dtype_i, dynamic_size=True, element_shape=(tensor_shape.TensorShape([const_batch_size]) 
.concatenate(_maybe_tensor_shape_from_tensor(size_i))), size=0, name="rnn_state_%d" % i ) for i, (dtype_i, size_i) in enumerate(zip(flat_state_dtypes, flat_state_size)) ] state_ta = nest.pack_sequence_as(structure=state, flat_sequence=flat_state_ta) def condition(unused_time, elements_finished, *_): return math_ops.logical_not(math_ops.reduce_all(elements_finished)) def body(time, elements_finished, current_input, state_ta, emit_ta, state, loop_state): (next_output, cell_state) = cell(current_input, state) nest.assert_same_structure(state, cell_state) nest.assert_same_structure(cell.output_size, next_output) next_time = time + 1 (next_finished, next_input, next_state, emit_output, next_loop_state) = loop_fn(next_time, next_output, cell_state, loop_state) nest.assert_same_structure(state, next_state) nest.assert_same_structure(current_input, next_input) nest.assert_same_structure(emit_ta, emit_output) # If loop_fn returns None for next_loop_state, just reuse the previous one. loop_state = loop_state if next_loop_state is None else next_loop_state def _copy_some_through(current, candidate): """Copy some tensors through via array_ops.where.""" def copy_fn(cur_i, cand_i): # TensorArray and scalar get passed through. if isinstance(cur_i, tensor_array_ops.TensorArray): return cand_i if cur_i.shape.ndims == 0: return cand_i # Otherwise propagate the old or the new value. with ops.colocate_with(cand_i): return array_ops.where(elements_finished, cur_i, cand_i) return nest.map_structure(copy_fn, current, candidate) emit_output = _copy_some_through(zero_emit, emit_output) next_state = _copy_some_through(state, next_state) emit_ta = nest.map_structure(lambda ta, emit: ta.write(time, emit), emit_ta, emit_output) state_ta = nest.map_structure(lambda ta, state: ta.write(time, state), state_ta, next_state) elements_finished = math_ops.logical_or(elements_finished, next_finished) return (next_time, elements_finished, next_input, state_ta, emit_ta, next_state, loop_state) returned = control_flow_ops.while_loop( condition, body, loop_vars=[ time, elements_finished, next_input, state_ta, emit_ta, state, loop_state], parallel_iterations=parallel_iterations, swap_memory=swap_memory ) (state_ta, emit_ta, final_state, final_loop_state) = returned[-4:] flat_states = nest.flatten(state_ta) flat_states = [array_ops.transpose(ta.stack(), (1, 0, 2)) for ta in flat_states] states = nest.pack_sequence_as(structure=state_ta, flat_sequence=flat_states) flat_outputs = nest.flatten(emit_ta) flat_outputs = [array_ops.transpose(ta.stack(), (1, 0, 2)) for ta in flat_outputs] outputs = nest.pack_sequence_as(structure=emit_ta, flat_sequence=flat_outputs) return (states, outputs, final_state)
def true_fn(): return [ constant_op.constant(1), TestTuple(constant_op.constant(2), [3, 4]), array_ops.zeros([5, 5]), 6 ]
def __init__(self): super(SimpleModelWithOneVariable, self).__init__() self.var = variables.Variable(array_ops.zeros((1, 10), name='var'))
def __call__(self, inputs, state, scope=None): # global h,e_ti,z_i,alpha_ti """Long short-term memory cell (LSTM).""" with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. if self._state_is_tuple: c, h = state else: c, h = array_ops.split(1, 2, state) ## seperate inputs into word imbedding and image subfeatures shape = inputs.get_shape().as_list() # print("shape ") # print(shape) batch_size = shape[0] print("inputs.get_shape[0]") print(batch_size) hsize=h.get_shape() print("hidden state length") print(hsize[1].value) #padded_length = shape[1].value single_input_length = shape[1] # print("single_input_length ") # print(single_input_length) word_imbedding_length = 512 #subfeature_length = 192#(single_input_length-word_imbedding_length)/subfeature_num; subfeature_length = 768 # subfeature_num = int((single_input_length-word_imbedding_length)/subfeature_length) # subfeature_num = 35*35 subfeature_num = 17*17 #batch_size_ops # batch_size=32 tensorShape=tf.shape(inputs) #z_i = array_ops.zeros([batch_size,subfeature_length]) z_i=array_ops.zeros(tf.pack([tensorShape[0],subfeature_length])) print('inputs:') print(inputs) # print("Initial z_i:") # print(z_i) state_length = self._num_units # with vs.variable_scope(scope or type(self).__name__,initializer=self._initializer): #f_att_matrix = vs.get_variable(name="f_att_matrix",shape = (subfeature_length,state_length), initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32) mid_layer_size = 300 W1 = vs.get_variable(name="w1",shape=(hsize[1].value+subfeature_length,mid_layer_size),initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32) W2 = vs.get_variable(name="w2",shape=(mid_layer_size,1),initializer=tf.contrib.layers.xavier_initializer(),dtype=tf.float32) b1 = vs.get_variable(name="b1",shape=(1,mid_layer_size),initializer=tf.zeros_initializer,dtype=tf.float32) b2 = vs.get_variable(name="b2",shape=(1,1),initializer=tf.zeros_initializer,dtype=tf.float32) word_imbeddings=inputs[:,0:word_imbedding_length] alpha_ti = [] if single_input_length != word_imbedding_length: image_subfeatures=inputs[:,word_imbedding_length:single_input_length] #tf.summary.histogram("tensors/" + "subfeatures", image_subfeatures) #net2 = tf.reshape(net2, [shape2[0].value, -1, shape2[3].value]) #image_subfeatures=array_ops.reshape(image_subfeatures,[batch_size,subfeature_num,subfeature_length]) image_subfeatures=array_ops.reshape(image_subfeatures,tf.pack([tensorShape[0],subfeature_num,subfeature_length])) # f_att_matrix_exp=tf.expand_dims(f_att_matrix,0) # f_att_matrix_tile=tf.tile(f_att_matrix_exp,tf.pack([batch_size,1,1])) # print("fatt,fatt_exp,fatt_tile") # print(f_att_matrix) # print(f_att_matrix_exp) # print(f_att_matrix_tile) # tf.Print(f_att_matrix,[f_att_matrix]) # h=tf.expand_dims(h,2) # e_ti = math_ops.matmul(math_ops.matmul(tf.sigmoid(image_subfeatures),f_att_matrix_tile),h) # e_ti =array_ops.zeros([batch_size,subfeature_num]) W1_matrix=tf.expand_dims(W1,0) #[1,state_length+subfeature_length,mid_layer_size] W1_matrix=tf.tile(W1_matrix,tf.pack([tensorShape[0],1,1])) #[batchsize,state_length+subfeature_length,mid_layer_size] W2_matrix=tf.expand_dims(W2,0) W2_matrix=tf.tile(W2_matrix,tf.pack([tensorShape[0],1,1])) b1_matrix=tf.expand_dims(b1,0) #[1,1,mid_layer_size] b1_matrix=tf.tile(b1_matrix,tf.pack([tensorShape[0],1,1])) b2_matrix=tf.expand_dims(b2,0) #[1,1,mid_layer_size] b2_matrix=tf.tile(b2_matrix,tf.pack([tensorShape[0],1,1])) 
h_matrix=tf.expand_dims(h,1) # [batchsize,1,state_length] h_matrix=tf.tile(h_matrix,[1,subfeature_num,1]) #[batchsize,subfeature_num,state_length] x1 = tf.concat(2,[h_matrix,image_subfeatures]) #[batchsize,subfeature_num,state_length+subfeature_length] #x2 = tf.tanh(math_ops.matmul(x1,W1_matrix)+b1_matrix) #[batchsize,subfeature_num,mid_layer_size] x2 = tf.nn.relu(math_ops.matmul(x1,W1_matrix)+b1_matrix) #[batchsize,subfeature_num,mid_layer_size] #e_ti = tf.tanh(math_ops.matmul(x2,W2_matrix)+b2_matrix) #[batchsize,subfeature_num,1] e_ti = tf.nn.relu(math_ops.matmul(x2,W2_matrix)+b2_matrix) #[batchsize,subfeature_num,1] #e_ti = tf.squeeze(e_ti,[2]) #[batchsize,subfeature_num] alpha_ti = nn_ops.softmax(e_ti,dim=1) #[batchsize,subfeature_num,1] # e_ti=[] # for i in range(subfeature_num): # x1 = tf.concat(1,[h,image_subfeatures[:,i,:]]) # x2 = tf.tanh(math_ops.matmul(x1,W1)+b1) # x3 = tf.tanh(math_ops.matmul(x2,W2)+b2) # e_ti.append(x3) # # e_ti = self.f_att(image_subfeatures,subfeature_length,h,scope) # print("x1") # print(x1) # print("x2") # print(x2) # print("x3") # print(x3) # e_ti=tf.transpose(tf.pack(e_ti),[1,0,2]) # print("e_ti") # print(e_ti) # alpha_ti = nn_ops.softmax(e_ti) #tf.summary.histogram("tensors/" + "alpha_ti", alpha_ti) # z_i = math_ops.reduce_sum(math_ops.matmul(tf.transpose(image_subfeatures,[0,2,1]),alpha_ti),axis=1) z_i = math_ops.matmul(tf.transpose(image_subfeatures,[0,2,1]),alpha_ti) # h=tf.squeeze(h,[2]) z_i=tf.squeeze(z_i,squeeze_dims=[2]) print("squeezed z_i") print(z_i) #tf.summary.histogram("tensors/" + "z_i", z_i) #tf.summary.histogram("tensors/" + "h", h) concat = _linear([word_imbeddings, h, z_i], 4 * self._num_units, True) ### # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(1, 4, concat) new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * self._activation(j)) new_h = self._activation(new_c) * sigmoid(o) if self._state_is_tuple: new_state = LSTMStateTuple(new_c, new_h) else: new_state = array_ops.concat(1, [new_c, new_h]) return new_h, new_state, alpha_ti, z_i, word_imbeddings
def _rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None): """ Creates a recurrent neural network specified by RNNCell "cell". The simplest form of RNN network generated is: state = cell.zero_state(...) outputs = [] states = [] for input_ in inputs: output, state = cell(input_, state) outputs.append(output) states.append(state) return (outputs, states) However, a few other options are available: An initial state can be provided. If sequence_length is provided, dynamic calculation is performed. Dynamic calculation returns, at time t: (t >= max(sequence_length) ? (zeros(output_shape), zeros(state_shape)) : cell(input, state)) Thus saving computational time when unrolling past the max sequence length. Arguments: cell: An instance of RNNCell. inputs: A length T list of inputs, each a tensor of shape [batch_size, cell.input_size]. initial_state: (optional) An initial state for the RNN. This must be a tensor of appropriate type and shape [batch_size x cell.state_size]. dtype: (optional) The data type for the initial state. Required if initial_state is not provided. sequence_length: An int64 vector (tensor) of size [batch_size]. scope: VariableScope for the created subgraph; defaults to "RNN". Returns: A pair (outputs, states) where: outputs is a length T list of outputs (one for each input) states is a length T list of states (one state following each input) Raises: TypeError: If "cell" is not an instance of RNNCell. ValueError: If inputs is None or an empty list. """ if not isinstance(cell, RNNCell): raise TypeError("cell must be an instance of RNNCell") if not isinstance(inputs, list): raise TypeError("inputs must be a list") if not inputs: raise ValueError("inputs must not be empty") outputs = [] states = [] batch_size = array_ops.shape(inputs[0])[0] if initial_state is not None: state = initial_state else: if not dtype: raise ValueError("If no initial_state is provided, dtype must be.") state = cell.zero_state(batch_size, dtype) if sequence_length is not None: # compare with None since sequence_length is a Tensor # Prepare variables zero_output_state = ( array_ops.zeros(array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype), array_ops.zeros(array_ops.pack([batch_size, cell.state_size]), state.dtype)) max_sequence_length = tf.reduce_max(sequence_length) for time, input_ in enumerate(inputs): def output_state(): return cell(input_, state, scope) if sequence_length is not None: (output, state) = control_flow_ops.cond( time >= max_sequence_length, lambda: zero_output_state, output_state) else: (output, state) = output_state() outputs.append(output) states.append(state) return (outputs, states)
def _zeros_like(op_output): """Like array_ops.zeros_like() but also accepts resource var handles.""" if op_output.dtype == dtypes.resource: return array_ops.zeros( gen_resource_variable_ops.variable_shape(op_output)) return array_ops.zeros_like(op_output)
def _solve(self, rhs, adjoint=False, adjoint_arg=False): # Here we follow the same use of Roth's column lemma as in `matmul`, with # the key difference that we replace all `matmul` instances with `solve`. # This follows from the property that inv(A x B) = inv(A) x inv(B). # Below we document the shape manipulation for adjoint=False, # adjoint_arg=False, but the general case of different adjoints is still # handled. if adjoint_arg: rhs = linalg.adjoint(rhs) # Always add a batch dimension to enable broadcasting to work. batch_shape = array_ops.concat( [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0) rhs += array_ops.zeros(batch_shape, dtype=rhs.dtype.base_dtype) # rhs has shape [B, R, C], where B represent some number of batch # dimensions, # R represents the number of rows, and C represents the number of columns. # In order to apply Roth's column lemma, we need to operate on a batch of # column vectors, so we reshape into a batch of column vectors. We put it # at the front to ensure that broadcasting between operators to the batch # dimensions B still works. output = _rotate_last_dim(rhs, rotate_right=True) # Also expand the shape to be [A, C, B, R]. The first dimension will be # used to accumulate dimensions from each operator matmul. output = output[array_ops.newaxis, ...] # In this loop, A is going to refer to the value of the accumulated # dimension. A = 1 at the start, and will end up being self.range_dimension. # V will refer to the last dimension. V = R at the start, and will end up # being 1 in the end. for operator in self.operators[:-1]: # Reshape output from [A, C, B, V] to be # [A, C, B, V / op.domain_dimension, op.domain_dimension] if adjoint: operator_dimension = operator.range_dimension_tensor() else: operator_dimension = operator.domain_dimension_tensor() output = _unvec_by(output, operator_dimension) # We are computing (XA^-1^T) = (A^-1 X^T)^T. # output has [A, C, B, V / op.domain_dimension, op.domain_dimension], # which is being converted to: # [A, C, B, V / op.domain_dimension, op.range_dimension] output = array_ops.matrix_transpose(output) output = operator.solve(output, adjoint=adjoint, adjoint_arg=False) output = array_ops.matrix_transpose(output) # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=True) # After the loop, we will have # A = self.range_dimension / op[-1].range_dimension # V = op[-1].domain_dimension # We convert that using matvec to get: # [A, C, B, op[-1].range_dimension] output = self.operators[-1].solvevec(output, adjoint=adjoint) # Rearrange shape to be [B1, ... Bn, self.range_dimension, C] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=False) if rhs.shape.is_fully_defined(): column_dim = rhs.shape[-1] broadcast_batch_shape = common_shapes.broadcast_shape( rhs.shape[:-2], self.batch_shape) if adjoint: matrix_dimensions = [self.domain_dimension, column_dim] else: matrix_dimensions = [self.range_dimension, column_dim] output.set_shape( broadcast_batch_shape.concatenate(matrix_dimensions)) return output
def _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations, swap_memory, sequence_length=None): """Internal implementation of Dynamic RNN. Arguments: cell: An instance of RNNCell. inputs: A `Tensor` of shape [time, batch_size, depth]. initial_state: A `Tensor` of shape [batch_size, depth]. parallel_iterations: Positive Python int. swap_memory: A Python boolean sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. Returns: Tuple (final_outputs, final_state). final_outputs: A `Tensor` of shape [time, batch_size, depth]`. final_state: A `Tensor` of shape [batch_size, depth]. Raises: ValueError: If the input depth cannot be inferred via shape inference from the inputs. """ state = initial_state assert isinstance(parallel_iterations, int), "parallel_iterations must be int" # Construct an initial output input_shape = array_ops.shape(inputs) (time_steps, batch_size, _) = array_ops.unpack(input_shape, 3) inputs_got_shape = inputs.get_shape().with_rank(3) (const_time_steps, const_batch_size, const_depth) = inputs_got_shape.as_list() if const_depth is None: raise ValueError( "Input size (depth of inputs) must be accessible via shape inference, " "but saw value None.") # Prepare dynamic conditional copying of state & output zero_output = array_ops.zeros( array_ops.pack([batch_size, cell.output_size]), inputs.dtype) if sequence_length is not None: min_sequence_length = math_ops.reduce_min(sequence_length) max_sequence_length = math_ops.reduce_max(sequence_length) time = array_ops.constant(0, dtype=tf.int32, name="time") with ops.op_scope([], "dynamic_rnn") as scope: base_name = scope output_ta = tensor_array_ops.TensorArray( dtype=inputs.dtype, size=time_steps, tensor_array_name=base_name + "output") input_ta = tensor_array_ops.TensorArray( dtype=inputs.dtype, size=time_steps, tensor_array_name=base_name + "input") input_ta = input_ta.unpack(inputs) def _time_step(time, state, output_ta_t): """Take a time step of the dynamic RNN. Args: time: int32 scalar Tensor. state: Vector. output_ta_t: `TensorArray`, the output with existing flow. Returns: The tuple (time + 1, new_state, output_ta_t with updated flow). """ input_t = input_ta.read(time) # Restore some shape information input_t.set_shape([const_batch_size, const_depth]) call_cell = lambda: cell(input_t, state) if sequence_length is not None: (output, new_state) = _rnn_step( time=time, sequence_length=sequence_length, min_sequence_length=min_sequence_length, max_sequence_length=max_sequence_length, zero_output=zero_output, state=state, call_cell=call_cell, skip_conditionals=True) else: (output, new_state) = call_cell() output_ta_t = output_ta_t.write(time, output) return (time + 1, new_state, output_ta_t) (_, final_state, output_final_ta) = control_flow_ops.while_loop( cond=lambda time, _1, _2: time < time_steps, body=_time_step, loop_vars=(time, state, output_ta), parallel_iterations=parallel_iterations, swap_memory=swap_memory) final_outputs = output_final_ta.pack() # Restore some shape information final_outputs.set_shape([ const_time_steps, const_batch_size, cell.output_size]) return final_outputs, final_state
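The function above leans on the TensorArray-inside-while_loop pattern. The minimal sketch below illustrates that pattern only; it uses the newer stack/unstack names (the code above uses the older pack/unpack) and a tanh stand-in for the cell, so it is not the implementation itself.

import tensorflow as tf

inputs = tf.random_normal([5, 3, 4])                 # [time, batch, depth]
time_steps = tf.shape(inputs)[0]

input_ta = tf.TensorArray(dtype=inputs.dtype, size=time_steps).unstack(inputs)
output_ta = tf.TensorArray(dtype=inputs.dtype, size=time_steps)

def step(time, out_ta):
  x_t = input_ta.read(time)                          # [batch, depth]
  y_t = tf.tanh(x_t)                                 # stand-in for cell(input_t, state)
  return time + 1, out_ta.write(time, y_t)

_, output_ta = tf.while_loop(lambda t, _: t < time_steps, step,
                             [tf.constant(0), output_ta])
outputs = output_ta.stack()                          # [time, batch, depth]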
def _matmul(self, x, adjoint=False, adjoint_arg=False): # Here we heavily rely on Roth's column Lemma [1]: # (A x B) * vec X = vec BXA^T, # where vec stacks all the columns of the matrix under each other. In our # case, x represents a batch of vec X (i.e. we think of x as a batch of # column vectors, rather than a matrix). Each member of the batch can be # reshaped to a matrix (hence we get a batch of matrices). # We can iteratively apply this lemma by noting that if B is a Kronecker # product, then we can apply the lemma again. # [1] W. E. Roth, "On direct product matrices," # Bulletin of the American Mathematical Society, vol. 40, pp. 461-468, # 1934 # Efficiency # Naively doing the Kronecker product, by calculating the dense matrix and # applying it will can take cubic time in the size of domain_dimension # (assuming a square matrix). The other issue is that calculating the dense # matrix can be prohibitively expensive, in that it can take a large amount # of memory. # # This implementation avoids this memory blow up by only computing matmuls # with the factors. In this way, we don't have to realize the dense matrix. # In terms of complexity, if we have Kronecker Factors of size: # (n1, n1), (n2, n2), (n3, n3), ... (nJ, nJ), with N = \prod n_i, and we # have as input a [N, M] matrix, the naive approach would take O(N^2 M). # With this approach (ignoring reshaping of tensors and transposes for now), # the time complexity can be O(M * (\sum n_i) * N). There is also the # benefit of batched multiplication (In this example, the batch size is # roughly M * N) so this can be much faster. However, not factored in are # the costs of the several transposing of tensors, which can affect cache # behavior. # Below we document the shape manipulation for adjoint=False, # adjoint_arg=False, but the general case of different adjoints is still # handled. if adjoint_arg: x = linalg.adjoint(x) # Always add a batch dimension to enable broadcasting to work. batch_shape = array_ops.concat( [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0) x += array_ops.zeros(batch_shape, dtype=x.dtype.base_dtype) # x has shape [B, R, C], where B represent some number of batch dimensions, # R represents the number of rows, and C represents the number of columns. # In order to apply Roth's column lemma, we need to operate on a batch of # column vectors, so we reshape into a batch of column vectors. We put it # at the front to ensure that broadcasting between operators to the batch # dimensions B still works. output = _rotate_last_dim(x, rotate_right=True) # Also expand the shape to be [A, C, B, R]. The first dimension will be # used to accumulate dimensions from each operator matmul. output = output[array_ops.newaxis, ...] # In this loop, A is going to refer to the value of the accumulated # dimension. A = 1 at the start, and will end up being self.range_dimension. # V will refer to the last dimension. V = R at the start, and will end up # being 1 in the end. for operator in self.operators[:-1]: # Reshape output from [A, C, B, V] to be # [A, C, B, V / op.domain_dimension, op.domain_dimension] if adjoint: operator_dimension = operator.range_dimension_tensor() else: operator_dimension = operator.domain_dimension_tensor() output = _unvec_by(output, operator_dimension) # We are computing (XA^T) = (AX^T)^T. 
# output has [A, C, B, V / op.domain_dimension, op.domain_dimension], # which is being converted to: # [A, C, B, V / op.domain_dimension, op.range_dimension] output = array_ops.matrix_transpose(output) output = operator.matmul(output, adjoint=adjoint, adjoint_arg=False) output = array_ops.matrix_transpose(output) # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=True) # After the loop, we will have # A = self.range_dimension / op[-1].range_dimension # V = op[-1].domain_dimension # We convert that using matvec to get: # [A, C, B, op[-1].range_dimension] output = self.operators[-1].matvec(output, adjoint=adjoint) # Rearrange shape to be [B1, ... Bn, self.range_dimension, C] output = _rotate_last_dim(output, rotate_right=False) output = _vec(output) output = _rotate_last_dim(output, rotate_right=False) if x.shape.is_fully_defined(): column_dim = x.shape[-1] broadcast_batch_shape = common_shapes.broadcast_shape( x.shape[:-2], self.batch_shape) if adjoint: matrix_dimensions = [self.domain_dimension, column_dim] else: matrix_dimensions = [self.range_dimension, column_dim] output.set_shape( broadcast_batch_shape.concatenate(matrix_dimensions)) return output
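Roth's column lemma, which both _matmul and _solve above rely on, can be checked quickly in NumPy. The snippet below only verifies the identity (A kron B) vec(X) = vec(B X A^T) with column-stacking vec; it is an illustration, not part of the operator.

import numpy as np

rng = np.random.RandomState(0)
A, B = rng.randn(3, 3), rng.randn(2, 2)
X = rng.randn(2, 3)                         # shaped so that B X A^T is defined

vec = lambda M: M.reshape(-1, order="F")    # stack the columns under each other
lhs = np.kron(A, B).dot(vec(X))
rhs = vec(B.dot(X).dot(A.T))
print(np.allclose(lhs, rhs))                # True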
def _testScopedExport(self, test_dir, exported_filenames): graph = ops.Graph() with graph.as_default(): # Creates an inference graph. # Hidden 1 colocate_constraint = constant_op.constant(1.2, name="constraint") images = constant_op.constant(1.2, dtypes.float32, shape=[100, 28], name="images") with ops.name_scope("hidden1"): with graph.colocate_with(colocate_constraint.op): weights1 = variables.Variable(random_ops.truncated_normal( [28, 128], stddev=1.0 / math.sqrt(float(28))), name="weights") # The use of control_flow_ops.cond here is purely for adding test # coverage the save and restore of control flow context (which doesn't # make any sense here from a machine learning perspective). The typical # biases is a simple Variable without the conditions. biases1 = variables.Variable(control_flow_ops.cond( math_ops.less(random.random(), 0.5), lambda: array_ops.ones([128]), lambda: array_ops.zeros([128])), name="biases") hidden1 = nn_ops.relu( math_ops.matmul(images, weights1) + biases1) # Hidden 2 with ops.name_scope("hidden2"): weights2 = variables.Variable(random_ops.truncated_normal( [128, 32], stddev=1.0 / math.sqrt(float(128))), name="weights") # The use of control_flow_ops.while_loop here is purely for adding test # coverage the save and restore of control flow context (which doesn't # make any sense here from a machine learning perspective). The typical # biases is a simple Variable without the conditions. def loop_cond(it, _): return it < 2 def loop_body(it, biases2): biases2 += constant_op.constant(0.1, shape=[32]) return it + 1, biases2 _, biases2 = control_flow_ops.while_loop( loop_cond, loop_body, [ constant_op.constant(0), variables.Variable(array_ops.zeros([32]), name="biases") ]) hidden2 = nn_ops.relu( math_ops.matmul(hidden1, weights2) + biases2) # Linear with ops.name_scope("softmax_linear"): weights3 = variables.Variable(random_ops.truncated_normal( [32, 10], stddev=1.0 / math.sqrt(float(32))), name="weights") biases3 = variables.Variable(array_ops.zeros([10]), name="biases") logits = math_ops.matmul(hidden2, weights3) + biases3 ops.add_to_collection("logits", logits) # Exports each sub-graph. # Exports the first one with unbound_inputs_col_name set to default. orig_meta_graph1, var_list = meta_graph.export_scoped_meta_graph( filename=os.path.join(test_dir, exported_filenames[0]), graph=ops.get_default_graph(), export_scope="hidden1") self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys())) var_names = [v.name for _, v in var_list.items()] self.assertEqual(["hidden1/biases:0", "hidden1/weights:0"], sorted(var_names)) # Exports the rest with no unbound_inputs_col_name. orig_meta_graph2, _ = meta_graph.export_scoped_meta_graph( filename=os.path.join(test_dir, exported_filenames[1]), graph=ops.get_default_graph(), export_scope="hidden2", unbound_inputs_col_name=None) orig_meta_graph3, _ = meta_graph.export_scoped_meta_graph( filename=os.path.join(test_dir, exported_filenames[2]), graph=ops.get_default_graph(), export_scope="softmax_linear", unbound_inputs_col_name=None) return [orig_meta_graph1, orig_meta_graph2, orig_meta_graph3]