def testBasicLSTMCell(self): for dtype in [dtypes.float16, dtypes.float32]: np_dtype = dtype.as_numpy_dtype with self.test_session(graph=ops.Graph()) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2], dtype=dtype) m = array_ops.zeros([1, 8], dtype=dtype) cell = rnn_cell_impl.MultiRNNCell( [ rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=False) self.assertEqual(cell.dtype, None) g, out_m = cell(x, m) # Layer infers the input type. self.assertEqual(cell.dtype, dtype.name) expected_variable_names = [ "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ] self.assertEqual(expected_variable_names, [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([g, out_m], { x.name: np.array([[1., 1.]]), m.name: 0.1 * np.ones([1, 8]) }) self.assertEqual(len(res), 2) variables = variables_lib.global_variables() self.assertEqual(expected_variable_names, [v.name for v in variables]) # The numbers in results were not calculated, this is just a # smoke test. self.assertAllClose(res[0], np.array( [[0.240, 0.240]], dtype=np_dtype), 1e-2) expected_mem = np.array( [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]], dtype=np_dtype) self.assertAllClose(res[1], expected_mem, 1e-2) with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(0.5)): # Test BasicLSTMCell with input_size != num_units. x = array_ops.zeros([1, 3], dtype=dtype) m = array_ops.zeros([1, 4], dtype=dtype) g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g, out_m], { x.name: np.array([[1., 1., 1.]], dtype=np_dtype), m.name: 0.1 * np.ones([1, 4], dtype=np_dtype) }) self.assertEqual(len(res), 2)
def __init__(self, max_id, shortlist_size=100, name_prefix=''):
  """Creates a new TopN."""
  self.shortlist_size = shortlist_size
  # id_to_score contains all the scores we are tracking.
  self.id_to_score = variable_scope.get_variable(
      name=name_prefix + 'id_to_score',
      dtype=dtypes.float32,
      shape=[max_id],
      initializer=init_ops.constant_initializer(dtypes.float32.min))
  # sl_ids and sl_scores together satisfy four invariants:
  # 1) If sl_ids[i] != -1, then
  #    id_to_score[sl_ids[i]] = sl_scores[i] >= sl_scores[0]
  # 2) sl_ids[0] is the number of i > 0 for which sl_ids[i] != -1.
  # 3) If id_to_score[i] > sl_scores[0], then
  #    sl_ids[j] = i for some j.
  # 4) If sl_ids[i] == -1, then sl_scores[i] = tf.float32.min.
  self.sl_ids = variable_scope.get_variable(
      name=name_prefix + 'shortlist_ids',
      dtype=dtypes.int64,
      shape=[shortlist_size + 1],
      initializer=init_ops.constant_initializer(-1))
  # Ideally, we would set self.sl_ids[0] = 0 here. But then it is hard
  # to pass that control dependency to the other Ops. Instead, we
  # have insert, remove and get_best all deal with the fact that
  # self.sl_ids[0] == -1 actually means the shortlist size is 0.
  self.sl_scores = variable_scope.get_variable(
      name=name_prefix + 'shortlist_scores',
      dtype=dtypes.float32,
      shape=[shortlist_size + 1],
      initializer=init_ops.constant_initializer(dtypes.float32.min))
  # TopN keeps track of its internal data dependencies, so the user
  # doesn't have to.
  self.last_ops = []
def testTraining(self):
  """Tests a gradient descent step for a simple model."""
  with self.test_session() as session:
    with self.test_scope():
      with variable_scope.variable_scope("ascope", use_resource=True):
        w = variable_scope.get_variable(
            "w",
            shape=[4, 2],
            dtype=dtypes.float32,
            initializer=init_ops.constant_initializer(
                np.array([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=np.float32)))
        b = variable_scope.get_variable(
            "b",
            shape=[2],
            dtype=dtypes.float32,
            initializer=init_ops.constant_initializer(
                np.array([2, 3], dtype=np.float32)))

        x = array_ops.placeholder(dtypes.float32, shape=[1, 4])
        y = math_ops.matmul(x, w) + b
        loss = math_ops.reduce_sum(y)
        optimizer = GradientDescentOptimizer(0.1)
        train = optimizer.minimize(loss)

        session.run(variables.global_variables_initializer())
        session.run(train, {x: np.array([[7, 3, 5, 9]], dtype=np.float32)})
        vw, vb = session.run([w, b])
        self.assertAllClose(
            np.array(
                [[0.3, 1.3], [2.7, 3.7], [4.5, 5.5], [6.1, 7.1]],
                dtype=np.float32),
            vw,
            rtol=1e-4)
        self.assertAllClose(
            np.array([1.9, 2.9], dtype=np.float32), vb, rtol=1e-4)
def _init_input2hidden(ops, rnn_mode, input_mode, W_init, input_dims, hidden_dims):
  # N represents the number of gates.
  if 'rnn' in rnn_mode:
    N = 1
    msg = '(W_hid)'
  elif rnn_mode == 'gru':
    N = 3
    msg = '(W_input_to_updategate, W_input_to_resetgate, W_input_to_hiddenupdate)'
  elif rnn_mode == 'lstm':
    N = 4
    msg = '(W_input_to_inputgate, W_input_to_forgetgate, W_input_to_hidden, W_input_to_outputgate)'
  # ====== check input ====== #
  if input_mode != 'skip':
    ops.get_variable_nnop(initializer=W_init, shape=(input_dims, hidden_dims * N),
                          name='W_in', roles=Weight)
    if input_mode == 'norm':
      ops.get_variable_nnop(initializer=init_ops.constant_initializer(0.),
                            shape=(hidden_dims * N,), name='beta',
                            roles=BatchNormShiftParameter)
      ops.get_variable_nnop(initializer=init_ops.constant_initializer(1.),
                            shape=(hidden_dims * N,), name='gamma',
                            roles=BatchNormScaleParameter)
      ops.get_variable_nnop(initializer=init_ops.constant_initializer(0.),
                            shape=(hidden_dims * N,), name='mean',
                            roles=BatchNormPopulationMean)
      ops.get_variable_nnop(initializer=init_ops.constant_initializer(1.),
                            shape=(hidden_dims * N,), name='inv_std',
                            roles=BatchNormPopulationInvStd)
  # skip input mode
  elif input_dims != hidden_dims and \
      input_dims != hidden_dims * N:  # 3 gates + 1 hid_update
    # Parenthesize the concatenation so the formatting applies to the whole
    # message rather than to `msg` alone (which has no format specifiers).
    raise Exception(('Skip input mode, input trailing_dimension=%d '
                     '(the final dim) must equal the number of hidden '
                     'units (tied input connection), or %d times the number '
                     'of hidden units = %d, which include: ' + msg)
                    % (input_dims, N, hidden_dims * N))
def test_works_correctly_side_vars(self):
  with self.test_session() as sess:
    x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
    y_ = np.float32(3.1)
    x = variable_scope.get_variable(
        name="x", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(x_))
    y = variable_scope.get_variable(
        name="y", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(y_))
    sess.run([variables.global_variables_initializer()])

    f = lambda x: x * y
    g = lambda z: math_ops.square(x) * y
    fx = cg.custom_gradient(f(x), g(x), x)
    gx = gradients_impl.gradients(fx, variables.trainable_variables())
    [x_, fx_, gx_] = sess.run([x, fx, gx[0]])
    gy_ = gx[1]

    self.assertEqual(x_ * y_, fx_)
    self.assertEqual(np.square(x_) * y_, gx_)
    self.assertEqual(None, gy_)
def _TestOptimizerSupportHelper(self, opt):
  num_layers = 4
  num_units = 2
  batch_size = 8
  direction = CUDNN_RNN_UNIDIRECTION
  dir_count = 1

  with ops.Graph().as_default() as g:
    kernel_initializer = init_ops.constant_initializer(0.)
    bias_initializer = init_ops.constant_initializer(0.)
    inputs = random_ops.random_uniform(
        [num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32)

    lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
                               direction=direction,
                               kernel_initializer=kernel_initializer,
                               bias_initializer=bias_initializer,
                               name="awesome_lstm")
    outputs, _ = lstm(inputs)
    loss = math_ops.reduce_sum(outputs)
    optimizer = self._GetOptimizer(opt)
    train_op = optimizer.minimize(loss)

  with self.test_session(use_gpu=True, graph=g) as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(train_op)
def testGRUCell(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([1, 2])
      m = array_ops.zeros([1, 2])
      g, _ = rnn_cell_impl.GRUCell(2)(x, m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run(
          [g], {x.name: np.array([[1., 1.]]),
                m.name: np.array([[0.1, 0.1]])})
      # Smoke test
      self.assertAllClose(res[0], [[0.175991, 0.175991]])
    with variable_scope.variable_scope(
        "other", initializer=init_ops.constant_initializer(0.5)):
      # Test GRUCell with input_size != num_units.
      x = array_ops.zeros([1, 3])
      m = array_ops.zeros([1, 2])
      g, _ = rnn_cell_impl.GRUCell(2)(x, m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run(
          [g], {x.name: np.array([[1., 1., 1.]]),
                m.name: np.array([[0.1, 0.1]])})
      # Smoke test
      self.assertAllClose(res[0], [[0.156736, 0.156736]])
def testInvalidGlobalStep(self):
  with ops.Graph().as_default() as g, self.test_session(graph=g):
    x = array_ops.placeholder(dtypes.float32, [])
    var = variable_scope.get_variable(
        "test", [], initializer=init_ops.constant_initializer(10))
    loss = math_ops.abs(var * x)
    with self.assertRaises(AttributeError):
      optimizers_lib.optimize_loss(
          loss,
          global_step=constant_op.constant(43, dtype=dtypes.int64),
          learning_rate=0.1,
          optimizer="SGD")
    with self.assertRaises(TypeError):
      optimizers_lib.optimize_loss(
          loss,
          global_step=variable_scope.get_variable(
              "global_step", [],
              trainable=False,
              dtype=dtypes.float64,
              initializer=init_ops.constant_initializer(
                  0.0, dtype=dtypes.float64)),
          learning_rate=0.1,
          optimizer="SGD")
    with self.assertRaises(ValueError):
      optimizers_lib.optimize_loss(
          loss,
          global_step=variable_scope.get_variable(
              "global_step", [1],
              trainable=False,
              dtype=dtypes.int64,
              initializer=init_ops.constant_initializer(
                  [0], dtype=dtypes.int64)),
          learning_rate=0.1,
          optimizer="SGD")
def test_works_correctly_vector_of_vars(self):
  with self.test_session() as sess:
    x = variable_scope.get_variable(
        name="x", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(2))
    y = variable_scope.get_variable(
        name="y", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(3))
    sess.run([variables.global_variables_initializer()])

    f = lambda z: z[0] * z[1]
    g = lambda z: z[0]**2 * z[1]**2 / 2

    z = array_ops.stack([x, y])
    fz = cg.custom_gradient(f(z), g(z), z, axis=0)
    gz = gradients_impl.gradients(fz, variables.trainable_variables())
    [z_, fz_, gx_, gy_] = sess.run([z, fz, gz[0], gz[1]])

    self.assertEqual(f(z_), fz_)
    self.assertEqual(g(z_), gx_)
    self.assertEqual(g(z_), gy_)
def test_works_correctly_fx_gx_manually_stopped(self):
  with self.test_session() as sess:
    x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
    y_ = np.float32(3.1)
    x = variable_scope.get_variable(
        name="x", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(x_))
    y = variable_scope.get_variable(
        name="y", shape=[], dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(y_))
    sess.run([variables.global_variables_initializer()])

    stop = array_ops.stop_gradient  # For readability.

    # Basically we need to stop the `x` portion of `f`. And when we supply the
    # arg to `custom_gradient` we need to stop the complement, i.e., the `y`
    # part.
    f = lambda x: stop(x) * y
    g = lambda x: stop(math_ops.square(x)) * y
    fx = cg.custom_gradient(f(x), g(x), x + stop(y),
                            fx_gx_manually_stopped=True)

    gx = gradients_impl.gradients(fx, variables.trainable_variables())
    [x_, fx_, gx_, gy_] = sess.run([x, fx, gx[0], gx[1]])

    self.assertEqual(x_ * y_, fx_)
    self.assertEqual(np.square(x_) * y_, gx_)
    self.assertEqual(x_, gy_)
def gru(cell_size, sequence_len, xs, name=None, scope=None): r"""gru args: sequence_len: a `tensor` of type `int64`. cell_size: an `int`. xs: a list of at least 1 `tensor` objects of type `float32`. name: a name for the operation (optional). returns: a tuple of `tensor` objects (rs, zs, rhs, gs, hs). rs: a list with the same number of `tensor` objects as `xs` of `tensor` objects of type `float32`. zs: a list with the same number of `tensor` objects as `xs` of `tensor` objects of type `float32`. rhs: a list with the same number of `tensor` objects as `xs` of `tensor` objects of type `float32`. gs: a list with the same number of `tensor` objects as `xs` of `tensor` objects of type `float32`. hs: a list with the same number of `tensor` objects as `xs` of `tensor` objects of type `float32`. """ with vs.variable_scope(scope or "Gru"): input_size = xs[0].get_shape()[1].value wxr = vs.get_variable("wxr", [input_size, cell_size]) whr = vs.get_variable("whr", [cell_size, cell_size]) wxz = vs.get_variable("wxz", [input_size, cell_size]) whz = vs.get_variable("whz", [cell_size, cell_size]) wxh = vs.get_variable("wxh", [input_size, cell_size]) whh = vs.get_variable("whh", [cell_size, cell_size]) br = vs.get_variable("br", [cell_size], initializer=init_ops.constant_initializer(1.0)) bz = vs.get_variable("bz", [cell_size], initializer=init_ops.constant_initializer(1.0)) bh = vs.get_variable("bh", [cell_size], initializer=init_ops.constant_initializer(0.0)) return gen_gru_ops._gru(cell_size=cell_size, sequence_len=sequence_len, wxr=wxr, whr=whr, wxz=wxz, whz=whz, wxh=wxh, whh=whh, br=br, bz=bz, bh=bh, xs=xs, name=name)
def gru_cell(cell_size, sequence_len, h_prev, x, name=None, scope=None, time_idx=None): r"""GRU Cell Args: sequence_len: A `Tensor` of type `int64`. h_prev: A `Tensor` of type `float32`. x: A `Tensor` of type `float32`. cell_size: An `int`. name: A name for the operation (optional). Returns: A tuple of `Tensor` objects (r, z, rh, g, h). r: A `Tensor` of type `float32`. z: A `Tensor` of type `float32`. rh: A `Tensor` of type `float32`. g: A `Tensor` of type `float32`. h: A `Tensor` of type `float32`. """ with vs.variable_scope(scope or "GruCell"): input_size = x.get_shape()[1].value wxr = vs.get_variable("wxr", [input_size, cell_size]) whr = vs.get_variable("whr", [cell_size, cell_size]) wxz = vs.get_variable("wxz", [input_size, cell_size]) whz = vs.get_variable("whz", [cell_size, cell_size]) wxh = vs.get_variable("wxh", [input_size, cell_size]) whh = vs.get_variable("whh", [cell_size, cell_size]) br = vs.get_variable("br", [cell_size], initializer=init_ops.constant_initializer(1.0)) bz = vs.get_variable("bz", [cell_size], initializer=init_ops.constant_initializer(1.0)) bh = vs.get_variable("bh", [cell_size], initializer=init_ops.constant_initializer(0.0)) return gen_gru_ops._gru_cell(cell_size=cell_size, sequence_len=sequence_len, wxr=wxr, whr=whr, wxz=wxz, whz=whz, wxh=wxh, whh=whh, br=br, bz=bz, bh=bh, h_prev=h_prev, x=x, name=name, time_idx=time_idx)
def testIndyGRUCell(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([1, 2])
      m = array_ops.zeros([1, 2])
      g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run([g], {
          x.name: np.array([[1., 1.]]),
          m.name: np.array([[0.1, 0.1]])
      })
      # Smoke test
      self.assertAllClose(res[0], [[0.185265, 0.17704]])
    with variable_scope.variable_scope(
        "other", initializer=init_ops.constant_initializer(0.5)):
      # Test IndyGRUCell with input_size != num_units.
      x = array_ops.zeros([1, 3])
      m = array_ops.zeros([1, 2])
      g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run([g], {
          x.name: np.array([[1., 1., 1.]]),
          m.name: np.array([[0.1, 0.1]])
      })
      # Smoke test
      self.assertAllClose(res[0], [[0.155127, 0.157328]])
def __call__(self, x, h_prev, scope=None): """GRU cell.""" with vs.variable_scope(scope or type(self).__name__): input_size = x.get_shape().with_rank(2)[1] # Check if the input size exist. if input_size is None: raise ValueError("Expecting input_size to be set.") # Check cell_size == state_size from h_prev. cell_size = h_prev.get_shape().with_rank(2)[1] if cell_size != self._cell_size: raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" % (self._cell_size, cell_size)) if cell_size is None: raise ValueError("cell_size from `h_prev` should not be None.") w_ru = vs.get_variable("w_ru", [input_size + self._cell_size, self._cell_size * 2]) b_ru = vs.get_variable( "b_ru", [self._cell_size * 2], initializer=init_ops.constant_initializer(1.0)) w_c = vs.get_variable("w_c", [input_size + self._cell_size, self._cell_size]) b_c = vs.get_variable( "b_c", [self._cell_size], initializer=init_ops.constant_initializer(0.0)) _gru_block_cell = gen_gru_ops.gru_block_cell # pylint: disable=invalid-name _, _, _, new_h = _gru_block_cell( x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c) return new_h, new_h
def testRenorm(self): shape = (4, 3) xt = array_ops.placeholder(dtypes.float32, shape) momentum = 0.99 renorm_momentum = 0.8 rmax = 1.1 rmin = 0.9 dmax = 0.1 gamma = 2. beta = 3. epsilon = 0.001 bn = normalization_layers.BatchNormalization( axis=1, gamma_initializer=init_ops.constant_initializer(gamma), beta_initializer=init_ops.constant_initializer(beta), epsilon=epsilon, momentum=momentum, renorm=True, renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax}, renorm_momentum=renorm_momentum) training = array_ops.placeholder(dtypes.bool) yt = bn.apply(xt, training=training) moving_mean = 0. moving_variance = 1. renorm_mean = renorm_stddev = 0. renorm_weight = 0. with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) for _ in range(5): x = np.random.random(shape) mean = x.mean(0) stddev = np.sqrt(x.var(0) + epsilon) adj_mean = renorm_mean + (1. - renorm_weight) * mean adj_stddev = renorm_stddev + (1. - renorm_weight) * stddev r = (stddev / adj_stddev).clip(rmin, rmax) d = ((mean - adj_mean) / adj_stddev).clip(-dmax, dmax) y_train = ((x - mean) / stddev * r + d) * gamma + beta renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum) renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum) renorm_weight += (1. - renorm_weight) * (1. - renorm_momentum) moving_mean += (renorm_mean / renorm_weight - moving_mean) * (1. - momentum) moving_variance += ((renorm_stddev / renorm_weight) ** 2 - epsilon - moving_variance) * (1. - momentum) y_test = ((x - moving_mean) / (moving_variance + epsilon) ** 0.5 * gamma) + beta yt_val_train, _, _ = sess.run([yt] + bn.updates, feed_dict={xt: x, training: True}) yt_val_test, _, _ = sess.run([yt] + bn.updates, feed_dict={xt: x, training: False}) self.assertAllClose(y_train, yt_val_train, atol=1e-5) self.assertAllClose(y_test, yt_val_test, atol=1e-5)
def testInvalidValueTypeForConstantInitializerCausesTypeError(self):
  c = constant_op.constant([1.0, 2.0, 3.0])
  with self.assertRaisesRegexp(
      TypeError, r"Invalid type for initial value: .*Tensor.*"):
    init_ops.constant_initializer(c, dtype=dtypes.float32)
  v = variables.Variable([3.0, 2.0, 1.0])
  with self.assertRaisesRegexp(
      TypeError, r"Invalid type for initial value: .*Variable.*"):
    init_ops.constant_initializer(v, dtype=dtypes.float32)
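# Hedged illustration of the behavior exercised by the test above, using the
# public `tf.constant_initializer` alias of `init_ops.constant_initializer`
# (TF 1.x graph mode assumed): Python scalars, lists and NumPy arrays are valid
# initial values, while `Tensor` and `Variable` objects raise TypeError.
import numpy as np
import tensorflow as tf

scalar_init = tf.constant_initializer(0.5)
array_init = tf.constant_initializer(np.array([1.0, 2.0, 3.0]))
with tf.variable_scope("constant_init_demo"):
  v = tf.get_variable("v", shape=[3], initializer=array_init)

try:
  tf.constant_initializer(tf.constant([1.0, 2.0, 3.0]))  # invalid: a Tensor
except TypeError as e:
  print("rejected:", e)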
def _norm(self, inp, scope):
  with vs.variable_scope(scope) as scope:
    shape = inp.get_shape()[-1:]
    gamma_init = init_ops.constant_initializer(self._g)
    beta_init = init_ops.constant_initializer(self._b)
    gamma = vs.get_variable("gamma", shape=shape, initializer=gamma_init)  # pylint: disable=unused-variable
    beta = vs.get_variable("beta", shape=shape, initializer=beta_init)  # pylint: disable=unused-variable
    normalized = layers.layer_norm(inp, reuse=True, scope=scope)
    return normalized
def _norm(self, inp, scope):
  shape = inp.get_shape()[-1:]
  gamma_init = init_ops.constant_initializer(self._g)
  beta_init = init_ops.constant_initializer(self._b)
  with vs.variable_scope(scope):
    # Initialize beta and gamma for use by layer_norm.
    vs.get_variable("gamma", shape=shape, initializer=gamma_init)
    vs.get_variable("beta", shape=shape, initializer=beta_init)
  normalized = layers.layer_norm(inp, reuse=True, scope=scope)
  return normalized
def testBasicLSTMCell(self): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 8]) cell = rnn_cell_impl.MultiRNNCell( [ rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=False) g, out_m = cell(x, m) expected_variable_names = [ "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME, "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ] self.assertEqual( expected_variable_names, [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g, out_m], {x.name: np.array([[1., 1.]]), m.name: 0.1 * np.ones([1, 8])}) self.assertEqual(len(res), 2) variables = variables_lib.global_variables() self.assertEqual(expected_variable_names, [v.name for v in variables]) # The numbers in results were not calculated, this is just a smoke test. self.assertAllClose(res[0], [[0.24024698, 0.24024698]]) expected_mem = np.array([[ 0.68967271, 0.68967271, 0.44848421, 0.44848421, 0.39897051, 0.39897051, 0.24024698, 0.24024698 ]]) self.assertAllClose(res[1], expected_mem) with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros( [1, 3]) # Test BasicLSTMCell with input_size != num_units. m = array_ops.zeros([1, 4]) g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( [g, out_m], {x.name: np.array([[1., 1., 1.]]), m.name: 0.1 * np.ones([1, 4])}) self.assertEqual(len(res), 2)
def _setup_model():
  x = array_ops.placeholder(dtypes.float32, [])
  var = variable_scope.get_variable(
      "test", [], initializer=init_ops.constant_initializer(10))
  loss = math_ops.abs(var * x)
  global_step = variable_scope.get_variable(
      "global_step", [],
      trainable=False,
      dtype=dtypes.int64,
      initializer=init_ops.constant_initializer(0, dtype=dtypes.int64))
  return x, var, loss, global_step
def testLayerBasic(self): num_layers = 4 num_units = 2 batch_size = 8 direction = CUDNN_RNN_UNIDIRECTION dir_count = 1 with vs.variable_scope("main"): kernel_initializer = init_ops.constant_initializer(0.) bias_initializer = init_ops.constant_initializer(0.) inputs = random_ops.random_uniform([ num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") # Build the layer outputs1, _ = lstm(inputs) # Reuse the layer outputs2, _ = lstm(inputs) total_sum1 = math_ops.reduce_sum(outputs1) total_sum2 = math_ops.reduce_sum(outputs2) with vs.variable_scope("main", reuse=True): lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") # Reuse the layer outputs3, _ = lstm(inputs) total_sum3 = math_ops.reduce_sum(outputs3) self.assertEqual(1, len(variables.trainable_variables())) self.assertEqual(1, len(ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS))) self.assertEqual("main/awesome_lstm/opaque_kernel", variables.trainable_variables()[0].op.name) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) (total_sum1_v, total_sum2_v, total_sum3_v) = sess.run( [total_sum1, total_sum2, total_sum3]) self.assertEqual(0, total_sum1_v) self.assertEqual(0, total_sum2_v) self.assertEqual(0, total_sum3_v)
def testGhostBN4DimsAxis1(self): shape = [6, 3, 10, 10] num_virtual_batches = 3 beta = 2. gamma = 3. momentum = 0.8 epsilon = 1e-3 moving_means = np.zeros([1, 3, 3, 1, 1], dtype=np.float32) moving_vars = np.ones([1, 3, 3, 1, 1], dtype=np.float32) inp = array_ops.placeholder(dtypes.float32, shape) is_training = array_ops.placeholder(dtypes.bool) bn = normalization_layers.BatchNormalization( axis=1, momentum=momentum, epsilon=epsilon, beta_initializer=init_ops.constant_initializer(beta), gamma_initializer=init_ops.constant_initializer(gamma), num_virtual_batches=num_virtual_batches, fused=False) # NCHW is unsupported by CPU fused batch norm out = bn.apply(inp, training=is_training) ghost_shape = ([shape[0] // num_virtual_batches, num_virtual_batches] + shape[1:]) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) for _ in range(5): x = np.random.random(shape) sub_batched = np.reshape(x, ghost_shape) means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True) variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True) moving_means = moving_means * momentum + means * (1. - momentum) moving_vars = moving_vars * momentum + variances * (1. - momentum) y_train = ((sub_batched - means) / (variances + epsilon) ** 0.5 * gamma) + beta y_test = ((sub_batched - moving_means) / (moving_vars + epsilon) ** 0.5 * gamma) + beta y_train = np.reshape(y_train, shape) y_test = np.reshape(y_test, shape) y_val_train, _, _ = sess.run([out] + bn.updates, feed_dict={inp: x, is_training: True}) y_val_test = sess.run(out, feed_dict={inp: x, is_training: False}) self.assertAllClose(y_train, y_val_train, atol=1e-2) self.assertAllClose(y_test, y_val_test, atol=1e-2)
def batch_norm(x, deterministic, alpha=0.9, shift=True, scope='bn'): with vs.variable_scope(scope): dtype = x.dtype input_shape = x.get_shape().as_list() feat_dim = input_shape[-1] axes = range(len(input_shape)-1) if shift: beta = vs.get_variable( scope+"_beta", shape=[feat_dim], initializer=init_ops.zeros_initializer, dtype=dtype) else: beta = vs.get_variable( scope+"_beta", shape=[feat_dim], initializer=init_ops.zeros_initializer, dtype=dtype, trainable=False) gamma = vs.get_variable( scope+"_gamma", shape=[feat_dim], initializer=init_ops.constant_initializer(0.1), dtype=dtype) mean = vs.get_variable(scope+"_mean", shape=[feat_dim], initializer=init_ops.zeros_initializer, dtype=dtype, trainable=False) var = vs.get_variable(scope+"_var", shape=[feat_dim], initializer=init_ops.ones_initializer, dtype=dtype, trainable=False) counter = vs.get_variable(scope+"_counter", shape=[], initializer=init_ops.constant_initializer(0), dtype=tf.int64, trainable=False) zero_cnt = vs.get_variable(scope+"_zero_cnt", shape=[], initializer=init_ops.constant_initializer(0), dtype=tf.int64, trainable=False) batch_mean, batch_var = moments(x, axes, name=scope+'_moments') mean, var = cond(math_ops.equal(counter, zero_cnt), lambda: (batch_mean, batch_var), lambda: (mean, var)) mean, var, counter = cond(deterministic, lambda: (mean, var, counter), lambda: ((1-alpha) * batch_mean + alpha * mean, (1-alpha) * batch_var + alpha * var, counter + 1)) normed = batch_normalization(x, mean, var, beta, gamma, 1e-8) return normed
def testMultiRNNCellWithStateTuple(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([1, 2])
      m_bad = array_ops.zeros([1, 4])
      m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))

      # Test incorrectness of state
      with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
        core_rnn_cell_impl.MultiRNNCell(
            [core_rnn_cell_impl.GRUCell(2) for _ in range(2)],
            state_is_tuple=True)(x, m_bad)

      _, ml = core_rnn_cell_impl.MultiRNNCell(
          [core_rnn_cell_impl.GRUCell(2) for _ in range(2)],
          state_is_tuple=True)(x, m_good)

      sess.run([variables.global_variables_initializer()])
      res = sess.run(ml, {
          x.name: np.array([[1., 1.]]),
          m_good[0].name: np.array([[0.1, 0.1]]),
          m_good[1].name: np.array([[0.1, 0.1]])
      })

      # The numbers in results were not calculated, this is just a
      # smoke test.  However, these numbers should match those of
      # the test testMultiRNNCell.
      self.assertAllClose(res[0], [[0.175991, 0.175991]])
      self.assertAllClose(res[1], [[0.13248, 0.13248]])
def _testDropoutWrapper(self, batch_size=None, time_steps=None, parallel_iterations=None, **kwargs): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): if batch_size is None and time_steps is None: # 2 time steps, batch size 1, depth 3 batch_size = 1 time_steps = 2 x = constant_op.constant( [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32) m = rnn_cell_impl.LSTMStateTuple( *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32) ] * 2) else: x = constant_op.constant( np.random.randn(time_steps, batch_size, 3).astype(np.float32)) m = rnn_cell_impl.LSTMStateTuple(*[ constant_op.constant( [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32) ] * 2) outputs, final_state = rnn.dynamic_rnn( cell=rnn_cell_impl.DropoutWrapper( rnn_cell_impl.LSTMCell(3), dtype=x.dtype, **kwargs), time_major=True, parallel_iterations=parallel_iterations, inputs=x, initial_state=m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([outputs, final_state]) self.assertEqual(res[0].shape, (time_steps, batch_size, 3)) self.assertEqual(res[1].c.shape, (batch_size, 3)) self.assertEqual(res[1].h.shape, (batch_size, 3)) return res
def testCoupledInputForgetGateLSTMCell(self): with self.test_session() as sess: num_units = 2 state_size = num_units * 2 batch_size = 3 input_size = 4 expected_output = np.array( [[0.121753, 0.121753], [0.103349, 0.103349], [0.100178, 0.100178]], dtype=np.float32) expected_state = np.array( [[0.137523, 0.137523, 0.121753, 0.121753], [0.105450, 0.105450, 0.103349, 0.103349], [0.100742, 0.100742, 0.100178, 0.100178]], dtype=np.float32) with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([batch_size, input_size]) m = array_ops.zeros([batch_size, state_size]) output, state = rnn_cell.CoupledInputForgetGateLSTMCell( num_units=num_units, forget_bias=1.0)(x, m) sess.run([variables.global_variables_initializer()]) res = sess.run([output, state], { x.name: np.array([[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]]), m.name: 0.1 * np.ones((batch_size, state_size)) }) # This is a smoke test: Only making sure expected values didn't change. self.assertEqual(len(res), 2) self.assertAllClose(res[0], expected_output) self.assertAllClose(res[1], expected_state)
def call(self, inputs, state):
  """Gated recurrent unit (GRU) with nunits cells."""
  with vs.variable_scope("gates"):  # Reset gate and update gate.
    # We start with bias of 1.0 to not reset and not update.
    bias_ones = self._bias_initializer
    if self._bias_initializer is None:
      dtype = inputs.dtype
      bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
    # pylint: disable=protected-access
    value = math_ops.sigmoid(
        rnn_cell_impl._linear([inputs, state], 2 * self._num_units, True,
                              bias_ones, self._kernel_initializer))
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    # pylint: enable=protected-access
  with vs.variable_scope("candidate"):
    # pylint: disable=protected-access
    with vs.variable_scope("input_projection"):
      hi = rnn_cell_impl._linear(inputs, self._num_units, True,
                                 self._bias_initializer,
                                 self._kernel_initializer)
    with vs.variable_scope("hidden_projection"):
      hh = r * (rnn_cell_impl._linear(state, self._num_units, True,
                                      self._bias_initializer,
                                      self._kernel_initializer))
    # pylint: enable=protected-access
    c = self._activation(hi + hh)
  new_h = u * state + (1 - u) * c
  return new_h, new_h
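# A reading of the recurrence computed by `call` above (an interpretation of the
# code, not taken from its docstring). With sigma = sigmoid, phi = self._activation,
# [.,.] = concatenation and * = elementwise multiplication:
#
#   r_t, u_t = split(sigma(W_g [x_t, h_{t-1}] + b_g))          # gates; b_g starts at 1
#   c_t      = phi(W_i x_t + b_i + r_t * (W_h h_{t-1} + b_h))  # candidate state
#   h_t      = u_t * h_{t-1} + (1 - u_t) * c_t                 # returned new_h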
def testLSTMCell(self): with self.test_session() as sess: num_units = 8 num_proj = 6 state_size = num_units + num_proj batch_size = 3 input_size = 2 with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([batch_size, input_size]) m = array_ops.zeros([batch_size, state_size]) cell = rnn_cell_impl.LSTMCell( num_units=num_units, num_proj=num_proj, forget_bias=1.0, state_is_tuple=False) output, state = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run([output, state], { x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), m.name: 0.1 * np.ones((batch_size, state_size)) }) self.assertEqual(len(res), 2) # The numbers in results were not calculated, this is mostly just a # smoke test. self.assertEqual(res[0].shape, (batch_size, num_proj)) self.assertEqual(res[1].shape, (batch_size, state_size)) # Different inputs so different outputs and states for i in range(1, batch_size): self.assertTrue( float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6) self.assertTrue( float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self): if not test.is_gpu_available(): # Can't perform this test w/o a GPU return with self.test_session(use_gpu=True) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 1, 3]) cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), "/gpu:0") with ops.device("/cpu:0"): outputs, _ = rnn.dynamic_rnn( cell=cell, inputs=x, dtype=dtypes.float32) run_metadata = config_pb2.RunMetadata() opts = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) sess.run([variables_lib.global_variables_initializer()]) _ = sess.run(outputs, options=opts, run_metadata=run_metadata) step_stats = run_metadata.step_stats ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1 gpu_stats = step_stats.dev_stats[ix].node_stats cpu_stats = step_stats.dev_stats[1 - ix].node_stats self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name]) self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
def testBasicLSTMCellWithDropout(self): def _is_close(x, y, digits=4): delta = x - y return delta < 10**(-digits) def _is_close_in(x, items, digits=4): for i in items: if _is_close(x, i, digits): return True return False keep_prob = 0.5 c_high = 2.9998924946 c_low = 0.999983298578 h_low = 0.761552567265 h_high = 0.995008519604 num_units = 5 allowed_low = [2, 3] with self.test_session() as sess: with variable_scope.variable_scope( "other", initializer=init_ops.constant_initializer(1)): x = array_ops.zeros([1, 5]) c = array_ops.zeros([1, 5]) h = array_ops.zeros([1, 5]) state = core_rnn_cell_impl.LSTMStateTuple(c, h) cell = rnn_cell.LayerNormBasicLSTMCell( num_units, layer_norm=False, dropout_keep_prob=keep_prob) g, s = cell(x, state) sess.run([variables.global_variables_initializer()]) res = sess.run([g, s], { x.name: np.ones([1, 5]), c.name: np.ones([1, 5]), h.name: np.ones([1, 5]), }) # Since the returned tensors are of size [1,n] # get the first component right now. actual_h = res[0][0] actual_state_c = res[1].c[0] actual_state_h = res[1].h[0] # For each item in `c` (the cell inner state) check that # it is equal to one of the allowed values `c_high` (not # dropped out) or `c_low` (dropped out) and verify that the # corresponding item in `h` (the cell activation) is coherent. # Count the dropped activations and check that their number is # coherent with the dropout probability. dropped_count = 0 self.assertTrue((actual_h == actual_state_h).all()) for citem, hitem in zip(actual_state_c, actual_state_h): self.assertTrue(_is_close_in(citem, [c_low, c_high])) if _is_close(citem, c_low): self.assertTrue(_is_close(hitem, h_low)) dropped_count += 1 elif _is_close(citem, c_high): self.assertTrue(_is_close(hitem, h_high)) self.assertIn(dropped_count, allowed_low)
def optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None, colocate_gradients_with_ops=False, increment_global_step=True): """Given loss and parameters for optimizer, returns a training op. Various ways of passing optimizers include: - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - by function taking learning rate `Tensor` as argument and returning an `Optimizer` instance. E.g. `optimize_loss(..., optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. Alternatively, if `learning_rate` is `None`, the function takes no arguments. E.g. `optimize_loss(..., learning_rate=None, optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - by a subclass of `Optimizer` having a single-argument constructor (the argument is the learning rate), such as AdamOptimizer or AdagradOptimizer. E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`. - by an instance of a subclass of `Optimizer`. E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. Args: loss: Scalar `Tensor`. global_step: Scalar int `Tensor`, step counter to update on each step unless `increment_global_step` is `False`. If not supplied, it will be fetched from the default graph (see `tf.train.get_global_step` for details). If it has not been created, no step will be incremented with each weight update. `learning_rate_decay_fn` requires `global_step`. learning_rate: float or `Tensor`, magnitude of update per each training step. Can be `None`. optimizer: string, class or optimizer instance, used as trainer. string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. class should be sub-class of `tf.Optimizer` that implements `compute_gradients` and `apply_gradients` functions. optimizer instance should be instantiation of `tf.Optimizer` sub-class and have `compute_gradients` and `apply_gradients` functions. gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this value. gradient_multipliers: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. clip_gradients: float, callable or `None`. If float, is provided, a global clipping is applied to prevent the norm of the gradient to exceed this value. Alternatively, a callable can be provided e.g.: adaptive_clipping. This callable takes a `list` of `(gradients, variables)` `tuple`s and returns the same thing with the gradients modified. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay functions. For example: `tf.train.exponential_decay`. Ignored if `learning_rate` is not supplied. update_ops: list of update `Operation`s to execute at each step. If `None`, uses elements of UPDATE_OPS collection. The order of execution between `update_ops` and `loss` is non-deterministic. variables: list of variables to optimize or `None` to use all trainable variables. name: The name for this operation is used to scope operations and summaries. summaries: List of internal quantities to visualize on tensorboard. If not set, the loss, the learning rate, and the global norm of the gradients will be reported. The complete list of possible values is in OPTIMIZER_SUMMARIES. 
colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. increment_global_step: Whether to increment `global_step`. If your model calls `optimize_loss` multiple times per training step (e.g. to optimize different parts of the model), use this arg to avoid incrementing `global_step` more times than necessary. Returns: Training op. Raises: ValueError: if: * `loss` is an invalid type or shape. * `global_step` is an invalid type or shape. * `learning_rate` is an invalid type or value. * `optimizer` has the wrong type. * `clip_gradients` is neither float nor callable. * `learning_rate` and `learning_rate_decay_fn` are supplied, but no `global_step` is available. * `gradients` is empty. """ loss = ops.convert_to_tensor(loss) contrib_framework.assert_scalar(loss) if global_step is None: global_step = train.get_global_step() else: train.assert_global_step(global_step) with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): # Update ops take UPDATE_OPS collection if not provided. if update_ops is None: update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) # Make sure update ops are ran before computing loss. if update_ops: loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Learning rate variable, with possible decay. lr = None if learning_rate is not None: if (isinstance(learning_rate, ops.Tensor) and learning_rate.get_shape().ndims == 0): lr = learning_rate elif isinstance(learning_rate, float): if learning_rate < 0.0: raise ValueError("Invalid learning_rate %s.", learning_rate) lr = vs.get_variable( "learning_rate", [], trainable=False, initializer=init_ops.constant_initializer(learning_rate)) else: raise ValueError("Learning rate should be 0d Tensor or float. " "Got %s of type %s" % (str(learning_rate), str(type(learning_rate)))) if summaries is None: summaries = ["loss", "learning_rate", "global_gradient_norm"] else: for summ in summaries: if summ not in OPTIMIZER_SUMMARIES: raise ValueError("Summaries should be one of [%s], you provided %s." % (", ".join(OPTIMIZER_SUMMARIES), summ)) if learning_rate is not None and learning_rate_decay_fn is not None: if global_step is None: raise ValueError("global_step is required for learning_rate_decay_fn.") lr = learning_rate_decay_fn(lr, global_step) if "learning_rate" in summaries: summary.scalar("learning_rate", lr) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): if lr is None: raise ValueError("Learning rate is None, but should be specified if " "optimizer is string (%s)." % optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( "Optimizer name should be one of [%s], you provided %s." % (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif (isinstance(optimizer, type) and issubclass(optimizer, optimizer_.Optimizer)): if lr is None: raise ValueError("Learning rate is None, but should be specified if " "optimizer is class (%s)." % optimizer) opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer elif callable(optimizer): if learning_rate is not None: opt = optimizer(lr) else: opt = optimizer() if not isinstance(opt, optimizer_.Optimizer): raise ValueError("Unrecognized optimizer: function should return " "subclass of Optimizer. Got %s." % str(opt)) else: raise ValueError("Unrecognized optimizer: should be string, " "subclass of Optimizer, instance of " "subclass of Optimizer or function with one argument. " "Got %s." 
% str(optimizer)) # All trainable variables, if specific variables are not specified. if variables is None: variables = vars_.trainable_variables() # Compute gradients. gradients = opt.compute_gradients( loss, variables, colocate_gradients_with_ops=colocate_gradients_with_ops) # Optionally add gradient noise. if gradient_noise_scale is not None: gradients = _add_scaled_noise_to_gradients(gradients, gradient_noise_scale) # Multiply some gradients. if gradient_multipliers is not None: gradients = _multiply_gradients(gradients, gradient_multipliers) if not gradients: raise ValueError( "Empty list of (gradient, var) pairs encountered. This is most " "likely to be caused by an improper value of gradient_multipliers.") if "global_gradient_norm" in summaries or "gradient_norm" in summaries: summary.scalar("global_norm/gradient_norm", clip_ops.global_norm(list(zip(*gradients))[0])) # Optionally clip gradients by global norm. if isinstance(clip_gradients, float): gradients = _clip_gradients_by_norm(gradients, clip_gradients) elif callable(clip_gradients): gradients = clip_gradients(gradients) elif clip_gradients is not None: raise ValueError( "Unknown type %s for clip_gradients" % type(clip_gradients)) # Add scalar summary for loss. if "loss" in summaries: summary.scalar("loss", loss) # Add histograms for variables, gradients and gradient norms. for gradient, variable in gradients: if isinstance(gradient, ops.IndexedSlices): grad_values = gradient.values else: grad_values = gradient if grad_values is not None: var_name = variable.name.replace(":", "_") if "gradients" in summaries: summary.histogram("gradients/%s" % var_name, grad_values) if "gradient_norm" in summaries: summary.scalar("gradient_norm/%s" % var_name, clip_ops.global_norm([grad_values])) if clip_gradients is not None and ("global_gradient_norm" in summaries or "gradient_norm" in summaries): summary.scalar("global_norm/clipped_gradient_norm", clip_ops.global_norm(list(zip(*gradients))[0])) # Create gradient updates. grad_updates = opt.apply_gradients( gradients, global_step=global_step if increment_global_step else None, name="train") # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor
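# Hedged usage sketch for `optimize_loss` above. It mirrors the call pattern of
# the tests earlier in this collection (testInvalidGlobalStep, _setup_model) and
# assumes this is `tf.contrib.layers.optimize_loss` running in TF 1.x graph mode.
import tensorflow as tf

x = tf.placeholder(tf.float32, [])
var = tf.get_variable("w", [], initializer=tf.constant_initializer(10.0))
loss = tf.abs(var * x)
global_step = tf.train.get_or_create_global_step()

# String optimizer name plus a float learning rate; per the docstring, a callable
# such as `lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5)` or an
# Optimizer instance would be accepted as well.
train_op = tf.contrib.layers.optimize_loss(
    loss, global_step=global_step, learning_rate=0.1, optimizer="SGD",
    clip_gradients=1.0, summaries=["loss", "learning_rate"])

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for value in [3.0, -1.0, 2.0]:
    sess.run(train_op, feed_dict={x: value})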
def optimize_loss(losses, global_step, learning_rate, optimizer, num_gpus=1, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None): """Given loss and parameters for optimizer, returns a training op. Args: loss: Tensor, 0 dimensional. global_step: Tensor, step counter for each update. learning_rate: float or Tensor, magnitude of update per each training step. optimizer: string, class or optimizer instance, used as trainer. string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. class should be sub-class of tf.Optimizer that implements `compute_gradients` and `apply_gradients` functions. optimizer instance should be instantion of tf.Optimizer sub-class and have `compute_gradients` and `apply_gradients` functions. gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this value. gradient_multipliers: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. clip_gradients: float or `None`, clips gradients by this value. moving_average_decay: Deprecated. float or None, takes into account previous loss to make learning smoother due to outliers. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay functions. For example: tf.train.exponential_decay. update_ops: list of update `Operation`s to execute at each step. If `None`, uses elements of UPDATE_OPS collection. variables: list of variables to optimize or `None` to use all trainable variables. name: The name for this operation is used to scope operations and summaries. summaries: List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES. Returns: Training op. Raises: ValueError: if optimizer is wrong type. """ with vs.variable_scope(name, "OptimizeLoss", losses + [global_step]): # # Update ops take UPDATE_OPS collection if not provided. # if update_ops is None: # update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) # # Make sure update ops are ran before computing loss. # if update_ops: # #loss = control_flow_ops.with_dependencies(list(update_ops), loss) # raise ValueError('update ops not supported yet for multi gpu') # Learning rate variable, with possible decay. if (isinstance(learning_rate, ops.Tensor) and learning_rate.get_shape().ndims == 0): lr = learning_rate elif isinstance(learning_rate, float): lr = vs.get_variable( "learning_rate", [], trainable=False, initializer=init_ops.constant_initializer(learning_rate)) else: raise ValueError("Learning rate should be 0d Tensor or float. " "Got %s of type %s" % (str(learning_rate), str(type(learning_rate)))) if summaries is None: summaries = ["loss", "learning_rate"] if learning_rate_decay_fn is not None: lr = learning_rate_decay_fn(lr, global_step) if "learning_rate" in summaries: summary.scalar("learning_rate", lr) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( "Optimizer name should be one of [%s], you provided %s." 
% (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif isinstance(optimizer, type) and issubclass( optimizer, optimizer_.Optimizer): opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer else: raise ValueError("Unrecognized optimizer: should be string, " "subclass of Optimizer or instance of " "subclass of Optimizer. Got %s." % str(optimizer)) # Calculate the gradients for each model tower. tower_grads = [] for i in range(num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('tower', i)) as scope: # All trainable variables, if specific variables are not specified. #if variables is None: # variables = vars_.trainable_variables() # Compute gradients. loss = losses[i] #gradients = opt.compute_gradients(loss, variables) gradients = opt.compute_gradients(loss) # Optionally add gradient noise. if gradient_noise_scale is not None: gradients = _add_scaled_noise_to_gradients( gradients, gradient_noise_scale) # Multiply some gradients. if gradient_multipliers is not None: gradients = _multiply_gradients( gradients, gradient_multipliers) # Optionally clip gradients by global norm. if clip_gradients is not None: gradients = _clip_gradients_by_norm( gradients, clip_gradients) tower_grads.append(gradients) # Add scalar summary for loss. if "loss" in summaries: summary.scalar("learning_rate", lr) #@TODO chg now just remove below TODO FIXME add gradient monitor ## Add histograms for variables, gradients and gradient norms. #for gradient, variable in gradients: # if isinstance(gradient, ops.IndexedSlices): # grad_values = gradient.values # else: # grad_values = gradient # if grad_values is not None: # if "gradients" in summaries: # logging_ops.histogram_summary(variable.name + "/gradients", # grad_values) # if "gradient_norm" in summaries: # logging_ops.histogram_summary(variable.name + "/gradient_norm", # clip_ops.global_norm([grad_values])) #if FLAGS.monitor_level > 1 and FLAGS.num_gpus == 0: # melt.monitor_gradients_from_loss(loss) gradients = average_gradients(tower_grads) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") # # Make sure total_loss is valid. # final_loss = array_ops.check_numerics(loss, "Loss is inf or nan") # # Ensure the train_tensor computes grad_updates. # train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss) #return train_tensor return grad_updates
def _call_cell(self, inputs, initial_cell_state, initial_output, dtype, sequence_length): """Run this LSTM on inputs, starting from the given state. Args: inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]` initial_cell_state: initial value for cell state, shape `[batch_size, self._num_units]` initial_output: initial value of cell output, shape `[batch_size, self._num_units]` dtype: The data type for the initial state and expected output. sequence_length: Specifies the length of each sequence in inputs. An `int32` or `int64` vector (tensor) size `[batch_size]`, values in `[0, time_len)` or None. Returns: A pair containing: - Cell state (cs): A `3-D` tensor of shape `[time_len, batch_size, output_size]` - Output (h): A `3-D` tensor of shape `[time_len, batch_size, output_size]` """ inputs_shape = inputs.get_shape().with_rank(3) time_len = inputs_shape[0].value if time_len is None: time_len = array_ops.shape(inputs)[0] input_size = inputs_shape[2].value w = vs.get_variable( "weights", [input_size + self._num_units, self._num_units * 4], dtype=dtype) b = vs.get_variable("biases", [w.get_shape().with_rank(2)[1]], initializer=init_ops.constant_initializer(0.0), dtype=dtype) if self._use_peephole: wci = vs.get_variable("w_i_diag", [self._num_units], dtype=dtype) wco = vs.get_variable("w_o_diag", [self._num_units], dtype=dtype) wcf = vs.get_variable("w_f_diag", [self._num_units], dtype=dtype) else: wci = wco = wcf = array_ops.zeros([self._num_units], dtype=dtype) if sequence_length is None: max_seq_len = time_len else: max_seq_len = math_ops.to_int64( math_ops.reduce_max(sequence_length)) _, cs, _, _, _, _, h = _lstm_ops_so.block_lstm( seq_len_max=max_seq_len, x=inputs, cs_prev=initial_cell_state, h_prev=initial_output, w=w, wci=wci, wco=wco, wcf=wcf, b=b, forget_bias=self._forget_bias, cell_clip=self._cell_clip, use_peephole=self._use_peephole) return cs, h
def __call__(self, input, state, scope, reuse=True): # the scope business gives a namespace to our weight variable matrix names with tf.variable_scope(scope,reuse=reuse): # input has shape [batch_size, input_size] # state has shape [batch_size, state_size] # We divide up a state vector as follows: # # h = (h0,r,w,M) # # where # # - h0 is the controller internal state (size controller_state_size) # - r is the read address weights (size memory_address_size) # - w is the write address weights (size memory_address_size) # - M is the memory state (size memory_address_size*memory_content_size) # # Viewed as a matrix of shape [mas,mcs] the rows of M index memory locations. # # NOTE: these vectors are all batched, so in the following h0 has shape # [batch_size, controller_state_size], for example. css = self._controller_state_size mas = self._memory_address_size mcs = self._memory_content_size powers = self._powers # the powers of the rotation matrix we allow h0, r, w, M = tf.split(state, [css, mas, mas, mas * mcs], 1) # Now generate the s, q, e, a vectors W_s = tf.get_variable("W_s", [css,len(powers)]) B_s = tf.get_variable("B_s", [len(powers)]) s = tf.nn.softmax(tf.matmul(h0,W_s) + B_s) # shape [batch_size,len(powers)] W_q = tf.get_variable("W_q", [css,len(powers)]) B_q = tf.get_variable("B_q", [len(powers)]) q = tf.nn.softmax(tf.matmul(h0,W_q) + B_q) # shape [batch_size,len(powers)] W_e = tf.get_variable("W_e", [css,mcs]) B_e = tf.get_variable("B_e", [mcs]) e = tf.nn.softmax(tf.matmul(h0,W_e) + B_e) # shape [batch_size,mcs] W_a = tf.get_variable("W_a", [css,mcs]) B_a = tf.get_variable("B_a", [mcs]) a = tf.nn.softmax(tf.matmul(h0,W_a) + B_a) # shape [batch_size,mcs] # Add and forget on the memory # TODO: not sure if matrix_diag is slow M = tf.reshape(M, [-1, mas, mcs]) erase_term = tf.matmul( M, tf.matrix_diag(e) ) # shape [batch_size, mas, mcs] add_term = tf.matmul( tf.reshape(w,[-1,mas,1]), tf.reshape(a,[-1,1,mcs]) ) # shape [batch_size, mas, mcs] M_new = M - erase_term + add_term M_new = tf.reshape(M_new, [-1, mas * mcs]) # Do the rotations of the read and write addresses # r has shape [batch_size,mas] Rtensor = rotation_tensor(mas,powers) # yields a tensor of shape [batch_size, mas, mas] # each row of which is \sum_i q_i R^i, and this batch # of matrices is then applied to r to generate r_new # NOTE: These are actually batch matmuls (tf.batch_matmul # went away with v1.0, matmul now does it automatically on the # first index) r_new = tf.matmul( tf.reshape(r, [-1,1,mas]), tf.tensordot( q, Rtensor, [[1], [0]] ) ) w_new = tf.matmul( tf.reshape(w, [-1,1,mas]), tf.tensordot( s, Rtensor, [[1], [0]] ) ) r_new = tf.reshape( r_new, [-1,mas] ) w_new = tf.reshape( w_new, [-1,mas] ) H = tf.get_variable("H", [css,css]) U = tf.get_variable("U", [self._input_size,css]) B = tf.get_variable("B", [css], initializer=init_ops.constant_initializer(0.0)) V = tf.get_variable("V", [mcs,css]) # converts from memory to controller state Mr = tf.matmul( M, tf.reshape(r,[-1,mas,1]), transpose_a=True ) Mr = tf.reshape( Mr, [-1,mcs] ) h0_new = self._activation(tf.matmul(h0, H) + tf.matmul(Mr,V) + tf.matmul(input,U) + B) state_new = tf.concat([h0_new, r_new, w_new, M_new], 1) return h0_new, state_new
def func(): embedding = tf.constant( np.ones([vocab_size, embedding_size], dtype=np.float32)) state_val = np.reshape([ np.ones([num_units], dtype=np.float32) * i for i in range(batch_size) ], [batch_size, num_units]) encoder_state = LSTMStateTuple(state_val, state_val) cell_initializer = init_ops.constant_initializer( np.array( [[ -0.9592235, 0.42451382, 0.7437744, -0.54485345, -0.80763197, 0.19663906, -0.22738314, 0.7762785, 0.7464578, 0.27227187, 0.7661047, 0.3596425, -0.8528242, -0.89316916, -0.48946142, 0.87882376 ], [ 0.86586094, -0.75018406, 0.25992537, -0.69368935, 0.2515502, -0.26379275, 0.8954313, 0.5759742, -0.7753072, -0.4388857, 0.95751476, -0.82085776, -0.9467752, -0.37055635, -0.18570113, -0.86504984 ], [ 0.02305841, 0.3850248, 0.893692, -0.6866486, -0.83703446, -0.9828961, 0.3989377, -0.59993076, 0.5330808, 0.6916566, 0.98468065, -0.6047034, 0.10823512, 0.34599304, -0.7834821, -0.7852347 ], [ 0.81643987, 0.31507468, -0.51369476, -0.12273741, 0.9701307, -0.79669356, -0.34496522, -0.88750815, -0.17995334, 0.34707904, -0.09201193, 0.5363934, -0.87229705, -0.5073328, -0.95894027, 0.5481839 ], [ -0.84093595, -0.2341497, -0.86047816, 0.43370056, -0.39073753, 0.37730122, 0.48026466, 0.3004985, -0.60727096, 0.9043884, -0.37619448, 0.22490788, -0.03739262, 0.61672115, 0.478899, -0.40780973 ], [ 0.31202435, -0.22045255, -0.6087918, 0.95115066, 0.00199413, -0.688287, -0.1103518, 0.4169519, 0.7913246, -0.9844644, -0.6193857, 0.38659644, -0.4726901, -0.44781208, -0.5174744, -0.605911 ], [ 0.66771054, 0.34912825, 0.22297978, -0.4990945, 0.24057317, -0.5540829, 0.92277217, 0.74939895, -0.35278273, -0.21587133, -0.28613377, -0.8794241, -0.40119147, 0.67175174, -0.22741508, 0.37898326 ]], dtype=np.float32)) dense_initializer = init_ops.constant_initializer( np.array([[ 0.56177187, -0.6233454, 0.73997784, 0.35032558, 0.6479795 ], [ 0.6831174, -0.34233975, 0.39330363, 0.45177555, -0.49649096 ], [ -0.98890066, 0.6175642, 0.09800482, -0.6721206, 0.48805737 ], [0.19671416, 0.2623148, 0.742548, 0.13555217, 0.56009054]], dtype=np.float32)) cell = LSTMCell(num_units=num_units, initializer=cell_initializer, state_is_tuple=True) helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( embedding=embedding, start_tokens=tf.tile([go_token], [batch_size]), end_token=end_token) output_layer = tf.layers.Dense( vocab_size, kernel_initializer=dense_initializer) decoder = tf.contrib.seq2seq.BasicDecoder( cell=cell, helper=helper, initial_state=encoder_state, output_layer=output_layer) outputs, state, sequence_lengths = tf.contrib.seq2seq.dynamic_decode( decoder=decoder, maximum_iterations=6) return tf.identity(outputs.rnn_output, name="rnn_output"), \ tf.identity(outputs.sample_id, name="sample_id"), \ tf.identity(state, name="state"), \ tf.identity(sequence_lengths, name="sequence_lengths")
def _conv(args, filter_size, output_channels, bias, inh_mult=1.5, exc_mult=3, bias_start=0.0, activation=None, initializers=None, dtype=tf.float32): """Convolution. Args: args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D, batch x n, Tensors. filter_size: int tuple of filter height and width. output_channels: int, number of convolutional kernels. bias: Whether to use biases in the convolution layer. bias_start: starting value to initialize the bias; 0 by default. Returns: A 3D, 4D, or 5D Tensor with shape [batch ... num_features] Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ def get_initializer(): initializer = tf.glorot_uniform_initializer( seed=None, dtype=tf.float32, ) return initializer # Calculate the total size of arguments on dimension 1. total_arg_size_depth = 0 shapes = [a.get_shape().as_list() for a in args] shape_length = len(shapes[0]) n_args = len(args) input, hidden = args if n_args > 2: raise ValueError("Expected only two " "arguments (input, hidden)") for shape in shapes: if len(shape) != 4: raise ValueError("Expected only 4-D arrays of " "form [n,h,w,c] for performing 2D convolutions") if len(shape) != len(shapes[0]): raise ValueError("Conv Linear expects all args " "to be of same Dimension: %s" % str(shapes)) # import ipdb; ipdb.set_trace() x_arg_depth = shapes[0][-1] h_arg_depth = shapes[1][-1] conv_op = nn_ops.conv2d strides = shape_length * [1] # TODO: Check extent of long-range inhibition f_h, f_w = filter_size f_h_inh, f_w_inh = int(f_h * inh_mult), int(f_w * inh_mult) f_h_exc, f_w_exc = int(f_h * exc_mult), int(f_w * exc_mult) filter_size_inh = [f_h_inh, f_w_inh] filter_size_exc = [f_h_exc, f_w_exc] # Build input and hidden kernels x_kernel = vs.get_variable("input_kernel", filter_size + [x_arg_depth, output_channels * 4], initializer=get_initializer(), dtype=tf.float32) # Build hidden state kernels h_kernel_gates = vs.get_variable("hidden_kernel_g", filter_size + [x_arg_depth, output_channels * 3], initializer=get_initializer(), dtype=tf.float32) # TODO: find optimal l1 strength h_kernel_inh = vs.get_variable( "hidden_kernel_inh", filter_size_inh + [x_arg_depth, output_channels], initializer=get_initializer(), regularizer=tf.contrib.layers.l1_regularizer(1e-2), dtype=tf.float32) h_kernel_exc = vs.get_variable( "hidden_kernel_exc", filter_size_exc + [x_arg_depth, output_channels], initializer=get_initializer(), regularizer=tf.contrib.layers.l1_regularizer(1e-2), dtype=tf.float32) res_x = conv_op(input, x_kernel, strides, padding="SAME") res_h_gates = conv_op(hidden, h_kernel_gates, strides, padding="SAME") res_h_inh = conv_op(hidden, h_kernel_inh, strides, padding="SAME") res_h_exc = conv_op(hidden, h_kernel_exc, strides, padding="SAME") if not bias: return res bias_input = vs.get_variable("biases_input", [output_channels * 4], dtype=tf.float32, initializer=init_ops.constant_initializer( bias_start, dtype=dtype)) bias_hidden_gates = vs.get_variable( "biases_hidden_g", [output_channels * 3], dtype=tf.float32, initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) bias_hidden_exc = vs.get_variable( "biases_hidden_e", [output_channels], dtype=tf.float32, initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) bias_hidden_inh = vs.get_variable( "biases_hidden_i", [output_channels], dtype=tf.float32, initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) res_input = tf.math.add(res_x, bias_input, name='conv_input_gates') res_hidden_gates = tf.math.add(res_h_gates, bias_hidden_gates, 
name='conv_hidden_gates') res_hidden_inh = tf.math.add(res_h_inh, bias_hidden_inh, name='conv_hidden_inh') res_hidden_exc = tf.math.add(res_h_exc, bias_hidden_exc, name='conv_hidden_exc') return (res_input, res_hidden_gates, res_hidden_exc, res_hidden_inh)
def alexnet_v2(inputs, num_classes=1000, is_training=True, dropout_keep_prob=0.5, spatial_squeeze=True, scope='alexnet_v2'): """AlexNet version 2. Described in: http://arxiv.org/pdf/1404.5997v2.pdf Parameters from: github.com/akrizhevsky/cuda-convnet2/blob/master/layers/ layers-imagenet-1gpu.cfg Note: All the fully_connected layers have been transformed to conv2d layers. To use in classification mode, resize input to 224x224. To use in fully convolutional mode, set spatial_squeeze to false. The LRN layers have been removed and change the initializers from random_normal_initializer to xavier_initializer. Args: inputs: a tensor of size [batch_size, height, width, channels]. num_classes: number of predicted classes. is_training: whether or not the model is being trained. dropout_keep_prob: the probability that activations are kept in the dropout layers during training. spatial_squeeze: whether or not should squeeze the spatial dimensions of the outputs. Useful to remove unnecessary dimensions for classification. scope: Optional scope for the variables. Returns: the last op containing the log predictions and end_points dict. """ with variable_scope.variable_scope(scope, 'alexnet_v2', [inputs]) as sc: end_points_collection = sc.original_name_scope + '_end_points' # Collect outputs for conv2d, fully_connected and max_pool2d. with arg_scope( [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d], outputs_collections=[end_points_collection]): net = layers.conv2d( inputs, 64, [11, 11], 4, padding='VALID', scope='conv1') net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool1') net = layers.conv2d(net, 192, [5, 5], scope='conv2') net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool2') net = layers.conv2d(net, 384, [3, 3], scope='conv3') net = layers.conv2d(net, 384, [3, 3], scope='conv4') net = layers.conv2d(net, 256, [3, 3], scope='conv5') net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool5') # Use conv2d instead of fully_connected layers. with arg_scope( [layers.conv2d], weights_initializer=trunc_normal(0.005), biases_initializer=init_ops.constant_initializer(0.1)): net = layers.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6') net = layers_lib.dropout( net, dropout_keep_prob, is_training=is_training, scope='dropout6') net = layers.conv2d(net, 4096, [1, 1], scope='fc7') net = layers_lib.dropout( net, dropout_keep_prob, is_training=is_training, scope='dropout7') net = layers.conv2d( net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, biases_initializer=init_ops.zeros_initializer(), scope='fc8') # Convert end_points_collection into a end_point dict. end_points = utils.convert_collection_to_dict(end_points_collection) if spatial_squeeze: net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed') end_points[sc.name + '/fc8'] = net return net, end_points
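# Hedged usage sketch for alexnet_v2 above: classification mode expects
# 224x224 inputs, as the docstring notes. The `tf` import and the slim-style
# aliases used by the function are assumed to be available in this module.
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits, end_points = alexnet_v2(images, num_classes=1000, is_training=False)
# logits: [batch, 1000] after the spatial squeeze; end_points maps layer scope
# names to the corresponding intermediate activations.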
def optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None): """Given loss and parameters for optimizer, returns a training op. Args: loss: Tensor, 0 dimensional. global_step: Tensor, step counter for each update. learning_rate: float or Tensor, magnitude of update per each training step. optimizer: string, class or optimizer instance, used as trainer. string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. class should be sub-class of tf.Optimizer that implements `compute_gradients` and `apply_gradients` functions. optimizer instance should be instantion of tf.Optimizer sub-class and have `compute_gradients` and `apply_gradients` functions. gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this value. gradient_multipliers: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. clip_gradients: float or `None`, clips gradients by this value. moving_average_decay: Deprecated. float or None, takes into account previous loss to make learning smoother due to outliers. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay functions. For example: tf.train.exponential_decay. update_ops: list of update `Operation`s to execute at each step. If `None`, uses elements of UPDATE_OPS collection. variables: list of variables to optimize or `None` to use all trainable variables. name: The name for this operation is used to scope operations and summaries. summaries: List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES. Returns: Training op. Raises: ValueError: if optimizer is wrong type. """ with vs.variable_op_scope([loss, global_step], name, "OptimizeLoss"): # Update ops take UPDATE_OPS collection if not provided. if update_ops is None: update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) # Make sure update ops are ran before computing loss. if update_ops: with ops.control_dependencies(update_ops): barrier = control_flow_ops.no_op(name="update_barrier") loss = control_flow_ops.with_dependencies([barrier], loss) # Moving average of the loss with decay. # TODO(b/30439864): moving_average_decay should be removed. if moving_average_decay is not None: logging.warn("'moving_average_decay' is deprecated. Please use " "tensorboard's builtin averaging instead.") # Generate moving averages of the loss. loss_averages = train.ExponentialMovingAverage( moving_average_decay, name="avg") loss_averages_op = loss_averages.apply([loss]) logging_ops.scalar_summary("loss/mean", loss_averages.average(loss)) loss = control_flow_ops.with_dependencies([loss_averages_op], loss) # Learning rate variable, with possible decay. if (isinstance(learning_rate, ops.Tensor) and learning_rate.get_shape().ndims == 0): lr = learning_rate elif isinstance(learning_rate, float): lr = vs.get_variable( "learning_rate", [], trainable=False, initializer=init_ops.constant_initializer(learning_rate)) else: raise ValueError("Learning rate should be 0d Tensor or float. 
" "Got %s of type %s" % (str(learning_rate), str(type(learning_rate)))) if summaries is None: summaries = ["loss", "learning_rate"] if learning_rate_decay_fn is not None: lr = learning_rate_decay_fn(lr, global_step) if "learning_rate" in summaries: logging_ops.scalar_summary("learning_rate", lr) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( "Optimizer name should be one of [%s], you provided %s." % (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif isinstance(optimizer, type) and issubclass( optimizer, optimizer_.Optimizer): opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer else: raise ValueError("Unrecognized optimizer: should be string, " "subclass of Optimizer or instance of " "subclass of Optimizer. Got %s." % str(optimizer)) # All trainable variables, if specific variables are not specified. if variables is None: variables = vars_.trainable_variables() # Compute gradients. gradients = opt.compute_gradients(loss, variables) # Optionally add gradient noise. if gradient_noise_scale is not None: gradients = _add_scaled_noise_to_gradients(gradients, gradient_noise_scale) # Multiply some gradients. if gradient_multipliers is not None: gradients = _multiply_gradients(gradients, gradient_multipliers) # Optionally clip gradients by global norm. if clip_gradients is not None: gradients = _clip_gradients_by_norm(gradients, clip_gradients) # Add scalar summary for loss. if "loss" in summaries: logging_ops.scalar_summary("loss", loss) # Add histograms for variables, gradients and gradient norms. for gradient, variable in gradients: if isinstance(gradient, ops.IndexedSlices): grad_values = gradient.values else: grad_values = gradient if grad_values is not None: if "gradients" in summaries: logging_ops.histogram_summary(variable.name + "/gradients", grad_values) if "gradient_norm" in summaries: logging_ops.histogram_summary( variable.name + "/gradient_norm", clip_ops.global_norm([grad_values])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") # Make sure total_loss is valid. final_loss = array_ops.check_numerics(loss, "Loss is inf or nan") # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss) return train_tensor
def get_rnn_cell(cell_type, config): if (cell_type == 'rnn'): cell = tf.contrib.rnn.BasicRNNCell(config.layer_dim, dtype=config.dtype) elif (cell_type == 'multi_rnn'): cell = tf.nn.rnn_cell.MultiRNNCell([ tf.contrib.rnn.BasicRNNCell(config.layer_dim, dtype=config.dtype) for _ in range(4) ]) elif (cell_type == 'lstm'): cell = tf.contrib.rnn.BasicLSTMCell(config.layer_dim, dtype=config.dtype) elif (cell_type == 'multi_lstm'): cell = tf.nn.rnn_cell.MultiRNNCell([ tf.contrib.rnn.DropoutWrapper( tf.nn.rnn_cell.LSTMCell(config.layer_dim, dtype=config.dtype), output_keep_prob=config.dropout_keep_prob) for _ in range(2) ]) elif (cell_type == 'irnn'): cell = IRNNCell(config.layer_dim, dtype=config.dtype) elif (cell_type == 'multi_irnn'): cell = tf.nn.rnn_cell.MultiRNNCell( [IRNNCell(config.layer_dim, dtype=config.dtype) for _ in range(4)]) elif (cell_type == 'fast_weights'): cell = FastWeightCell(num_units=config.layer_dim, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, norm_shift=config.norm_shift, activation=config.fw_activation, dtype=config.dtype) elif (cell_type == 'multi_fw'): cell = tf.nn.rnn_cell.MultiRNNCell([ FastWeightCell(num_units=config.layer_dim, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, norm_shift=config.norm_shift, activation=tf.nn.relu, dtype=config.dtype, kernel_initializer=init_ops.constant_initializer( value=np.concatenate( (np.random.normal(loc=0.0, scale=0.001, size=(config.input_dim, config.layer_dim)), np.identity(config.layer_dim)), 0), dtype=config.dtype)) for _ in range(config.layers) ]) elif (cell_type == 'identity_fw'): cell = FastWeightCell( num_units=config.layer_dim, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, norm_shift=config.norm_shift, activation=tf.nn.tanh, dtype=config.dtype, kernel_initializer=init_ops.constant_initializer( value=np.concatenate((np.random.normal( loc=0.0, scale=0.001, size=(config.input_dim, config.layer_dim)), np.identity(config.layer_dim)), 0), dtype=config.dtype)) elif (cell_type == 'hybrid_front'): first_cell = FastWeightCell( num_units=config.layer_dim, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, norm_shift=config.norm_shift, activation=tf.nn.relu, dtype=config.dtype, kernel_initializer=init_ops.constant_initializer( value=np.concatenate((np.random.normal( loc=0.0, scale=0.001, size=(config.input_dim, config.layer_dim)), np.identity(config.layer_dim)), 0), dtype=config.dtype)) cell = tf.nn.rnn_cell.MultiRNNCell([ first_cell, IRNNCell(config.layer_dim), IRNNCell(config.layer_dim) ]) elif (cell_type == 'hybrid_back'): first_cell = FastWeightCell( num_units=config.layer_dim, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, norm_shift=config.norm_shift, activation=tf.nn.relu, dtype=config.dtype, kernel_initializer=init_ops.constant_initializer( value=np.concatenate((np.random.normal( loc=0.0, scale=0.001, size=(config.input_dim, config.layer_dim)), np.identity(config.layer_dim)), 0), dtype=config.dtype)) cell = tf.nn.rnn_cell.MultiRNNCell([ IRNNCell(config.layer_dim), IRNNCell(config.layer_dim), first_cell ]) elif (cell_type == 'dynamic_fast_weights'): cell = DynamicFastWeightCell(num_units=config.layer_dim, sequence_length=config.input_length, lam=config.fw_lambda, eta=config.fw_eta, layer_norm=config.fw_layer_norm, norm_gain=config.norm_gain, 
norm_shift=config.norm_shift, activation=config.fw_activation, batch_size=config.batchsize, num_inner_loops=config.fw_inner_loops, dtype=config.dtype) elif (cell_type == 'autoconceptor'): cell = Autoconceptor(num_units=config.layer_dim, alpha=config.c_alpha, lam=config.c_lambda, batchsize=config.batchsize, activation=config.c_activation, layer_norm=config.c_layer_norm, dtype=config.dtype) else: raise ValueError("Cell type not understood.") return cell
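# Hedged sketch of the configuration object get_rnn_cell above expects; the
# attribute names are read off the branches of the function, the values are
# purely illustrative.
class CellConfig(object):
    layer_dim = 64
    dtype = tf.float32
    dropout_keep_prob = 0.9
    layers = 2
    input_dim = 28
    input_length = 28
    batchsize = 32
    fw_lambda = 0.9
    fw_eta = 0.5
    fw_layer_norm = True
    norm_gain = 1.0
    norm_shift = 0.0
    fw_activation = tf.nn.relu
    fw_inner_loops = 1
    c_alpha = 10.0
    c_lambda = 0.01
    c_activation = tf.nn.tanh
    c_layer_norm = False

cell = get_rnn_cell('lstm', CellConfig())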
def fc(x, num_units_out): return layers.Dense( num_units_out, kernel_initializer=init_ops.constant_initializer(0.1), bias_initializer=init_ops.constant_initializer(0.0))(x)
def __init__(self, params, tree_num, training): self.tree = variable_scope.get_variable( name=self.get_tree_name('tree', tree_num), dtype=dtypes.int32, shape=[params.max_nodes, 2], initializer=init_ops.constant_initializer(-2)) self.tree_thresholds = variable_scope.get_variable( name=self.get_tree_name('tree_thresholds', tree_num), shape=[params.max_nodes], initializer=init_ops.constant_initializer(-1.0)) self.end_of_tree = variable_scope.get_variable( name=self.get_tree_name('end_of_tree', tree_num), dtype=dtypes.int32, initializer=constant_op.constant([1])) self.start_epoch = variable_scope.get_variable( name=self.get_tree_name('start_epoch', tree_num), dtype=dtypes.int32, shape=[params.max_nodes], initializer=init_ops.constant_initializer(0)) if training: self.node_to_accumulator_map = variable_scope.get_variable( name=self.get_tree_name('node_to_accumulator_map', tree_num), shape=[params.max_nodes], dtype=dtypes.int32, initializer=init_ops.constant_initializer(-1)) self.accumulator_to_node_map = variable_scope.get_variable( name=self.get_tree_name('accumulator_to_node_map', tree_num), shape=[params.max_fertile_nodes], dtype=dtypes.int32, initializer=init_ops.constant_initializer(-1)) self.candidate_split_features = variable_scope.get_variable( name=self.get_tree_name('candidate_split_features', tree_num), shape=[ params.max_fertile_nodes, params.num_splits_to_consider ], dtype=dtypes.int32, initializer=init_ops.constant_initializer(-1)) self.candidate_split_thresholds = variable_scope.get_variable( name=self.get_tree_name('candidate_split_thresholds', tree_num), shape=[ params.max_fertile_nodes, params.num_splits_to_consider ], initializer=init_ops.constant_initializer(0.0)) # Statistics shared by classification and regression. self.node_sums = variable_scope.get_variable( name=self.get_tree_name('node_sums', tree_num), shape=[params.max_nodes, params.num_output_columns], initializer=init_ops.constant_initializer(0.0)) if training: self.candidate_split_sums = variable_scope.get_variable( name=self.get_tree_name('candidate_split_sums', tree_num), shape=[ params.max_fertile_nodes, params.num_splits_to_consider, params.num_output_columns ], initializer=init_ops.constant_initializer(0.0)) self.accumulator_sums = variable_scope.get_variable( name=self.get_tree_name('accumulator_sums', tree_num), shape=[params.max_fertile_nodes, params.num_output_columns], initializer=init_ops.constant_initializer(-1.0)) # Regression also tracks second order stats. if params.regression: self.node_squares = variable_scope.get_variable( name=self.get_tree_name('node_squares', tree_num), shape=[params.max_nodes, params.num_output_columns], initializer=init_ops.constant_initializer(0.0)) self.candidate_split_squares = variable_scope.get_variable( name=self.get_tree_name('candidate_split_squares', tree_num), shape=[ params.max_fertile_nodes, params.num_splits_to_consider, params.num_output_columns ], initializer=init_ops.constant_initializer(0.0)) self.accumulator_squares = variable_scope.get_variable( name=self.get_tree_name('accumulator_squares', tree_num), shape=[ params.max_fertile_nodes, params.num_output_columns ], initializer=init_ops.constant_initializer(-1.0)) else: self.node_squares = constant_op.constant( 0.0, name=self.get_tree_name('node_squares', tree_num)) self.candidate_split_squares = constant_op.constant( 0.0, name=self.get_tree_name('candidate_split_squares', tree_num)) self.accumulator_squares = constant_op.constant( 0.0, name=self.get_tree_name('accumulator_squares', tree_num))
def _my_linear(args, output_size, cell_init="random", bias=True, bias_start=0.0, scope=None): """BK: added a new option "cell_init" to _linear. args: It is assumed to take the form of [inputs, state]. state can be r*state in case of GRU and h in case of BasicLSTM. output_size: It is usually the same as the state size, or n_units, but in case of LSTM it is 4*n_units. I will assume output_size is always an integer multiple of state size. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. arg_sizes = [] shapes = [a.get_shape().as_list() for a in args] for shape in shapes: if len(shape) != 2: raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) if not shape[1]: raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) else: arg_sizes += [shape[1]] dtype = [a.dtype for a in args][0] #Check if output_size is an integer multiple of state size. if (output_size % arg_sizes[1]) != 0: raise ValueError("output_size must be an integer multiple of n_units.") r = int( output_size / arg_sizes[1] ) #Even though both numerator and denominator are integers, their ratio is float type. So, we need to cast it to int. total_arg_size = sum(arg_sizes) # Now the computation. with vs.variable_scope(scope or "Linear"): if cell_init == "random": with vs.variable_scope(scope or "random"): matrix = vs.get_variable("Matrix", [total_arg_size, output_size], dtype=dtype) if len(args) == 1: res = math_ops.matmul(args[0], matrix) else: res = math_ops.matmul(array_ops.concat(1, args), matrix) if not bias: return res bias_term = vs.get_variable( "Bias", [output_size], dtype=dtype, initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) return res + bias_term elif cell_init == "identity": with vs.variable_scope(scope or "identity"): #Below is an obsolete code that didn't concatenate matrices to make computations faster. # inputs_matrix = vs.get_variable("Inputs_Matrix", [arg_sizes[0], output_size], dtype=dtype) # if arg_sizes[1] == output_size: # init = tf.constant_initializer(np.identity(output_size)) # state_matrix = vs.get_variable("State_Matrix", [output_size, output_size], initializer=init, dtype=dtype) # else: # raise ValueError("state size and output size don't match.") # #Both inputs_result and state_results are tensors of shape (n_batch, output_size) # inputs_result = math_ops.matmul(args[0], inputs_matrix) # state_result = math_ops.matmul(args[1], state_matrix) # res = inputs_result + state_result list_id = [np.identity(arg_sizes[1]) for _ in range(r)] concat_id = np.concatenate(list_id, 1) # (arg_sizes[1], output_size) epsilon = np.sqrt(6.0 / (arg_sizes[0] + output_size)) xavier_part = np.random.uniform(-epsilon, epsilon, (arg_sizes[0], output_size)) total_init_matrix = np.concatenate( [xavier_part, concat_id], 0) # (arg_sizes[0]+arg_sizes[1], output_size) init = tf.constant_initializer(total_init_matrix) my_matrix = vs.get_variable("my_matrix", [total_arg_size, output_size], initializer=init, dtype=dtype) if len(args) == 1: res = math_ops.matmul(args[0], my_matrix) else: res = math_ops.matmul(array_ops.concat(1, args), my_matrix) if not bias: return res bias_term = vs.get_variable( "Bias", [output_size], dtype=dtype, initializer=init_ops.constant_initializer(bias_start, dtype=dtype)) return res + bias_term
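# Compact NumPy illustration of the "identity" initialization that _my_linear
# builds above: a Xavier-uniform block for the input weights stacked on top of
# r copies of the identity for the recurrent weights (IRNN-style). The sizes
# below are examples only.
import numpy as np
n_input, n_units, r = 3, 4, 1                     # output_size = r * n_units
epsilon = np.sqrt(6.0 / (n_input + r * n_units))
xavier_part = np.random.uniform(-epsilon, epsilon, (n_input, r * n_units))
identity_part = np.concatenate([np.identity(n_units)] * r, axis=1)
total_init_matrix = np.concatenate([xavier_part, identity_part], axis=0)
assert total_init_matrix.shape == (n_input + n_units, r * n_units)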
def _line_sep(args, output_size, bias, bias_initializer=None, kernel_initializer=None):
  """Computes the four LSTM gate pre-activations with separate recurrent weights.

  `args` is expected to be [x, h]; `output_size` is assumed to be 4 * h_size.
  Returns the (i, j, f, o) pre-activations as a 4-tuple.
  """
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("`args` must be specified")
  if not nest.is_sequence(args):
    args = [args]

  # Calculate the total size of arguments on dimension 1.
  total_arg_size = 0
  shapes = [a.get_shape() for a in args]
  for shape in shapes:
    if shape.ndims != 2:
      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
    if shape[1].value is None:
      raise ValueError("linear expects shape[1] to be provided for shape %s, "
                       "but saw %s" % (shape, shape[1]))
    else:
      total_arg_size += shape[1].value

  dtype = [a.dtype for a in args][0]

  # Now the computation.
  scope = vs.get_variable_scope()
  with vs.variable_scope(scope) as outer_scope:
    [x, h] = args
    x_size = x.get_shape().as_list()[1]
    h_size = h.get_shape().as_list()[1]
    # Input-to-hidden weights for all four gates at once, plus separate
    # hidden-to-hidden weights per gate.
    W_xh = tf.get_variable('W_xh', [x_size, h_size * 4], initializer=kernel_initializer)
    W_ih = tf.get_variable('W_ih', [h_size, h_size], initializer=kernel_initializer)
    W_jh = tf.get_variable('W_jh', [h_size, h_size], initializer=kernel_initializer)
    W_fh = tf.get_variable('W_fh', [h_size, h_size], initializer=kernel_initializer)
    W_oh = tf.get_variable('W_oh', [h_size, h_size], initializer=kernel_initializer)

    xh = tf.matmul(x, W_xh)
    ih = tf.matmul(h, W_ih) + xh[:, :h_size]
    jh = tf.matmul(h, W_jh) + xh[:, h_size:h_size * 2]
    fh = tf.matmul(h, W_fh) + xh[:, h_size * 2:h_size * 3]
    oh = tf.matmul(h, W_oh) + xh[:, h_size * 3:]

    if not bias:
      return ih, jh, fh, oh
    with vs.variable_scope(outer_scope) as inner_scope:
      inner_scope.set_partitioner(None)
      if bias_initializer is None:
        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
      biases = vs.get_variable(
          _BIAS_VARIABLE_NAME, [output_size], dtype=dtype,
          initializer=bias_initializer)
      # Split the single bias vector into the four per-gate slices and add
      # them to the corresponding pre-activations.
      b_i, b_j, b_f, b_o = array_ops.split(biases, 4, axis=0)
      return (nn_ops.bias_add(ih, b_i), nn_ops.bias_add(jh, b_j),
              nn_ops.bias_add(fh, b_f), nn_ops.bias_add(oh, b_o))
def LastValueQuantize(inputs, per_channel=False, init_min=-6.0, init_max=6.0, vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES, name_prefix='LastValueQuant', reuse=None, is_training=True, num_bits=8, narrow_range=False): """Adds a layer that collects quantization ranges as last input ranges. LastValueQuantize creates variables called 'min' and 'max', representing the interval used for quantization and clamping. Args: inputs: a tensor containing values to be quantized. per_channel: (Optional) a boolean specifying whether to use different quantization ranges per output channel. init_min: a float scalar, the initial value for variable min. init_max: a float scalar, the initial value for variable max. vars_collection: (Optional) collection where to store variables for quantization interval ends. name_prefix: name_prefix for created nodes. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. is_training: Whether the op is applied to a training or eval graph. num_bits: Number of bits to use for quantization, must be between 2 and 8. narrow_range: Whether to use the narrow quantization range [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1]. Returns: a tensor containing quantized values. """ with variable_scope.variable_scope(None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope: scope.set_partitioner(None) input_shape = inputs.get_shape() input_dim = len(input_shape) if per_channel: # Only support quantizing 1-, 2- and 4-dimensional tensors. assert input_dim in [1, 2, 4 ], ('Expected 1D, 2D or 4D input, was: %s in ' ' scope: %s' % (input_shape, name_prefix)) min_max_shape = [input_shape[-1]] else: min_max_shape = [] min_var = model_variable( 'min', shape=min_max_shape, initializer=init_ops.constant_initializer(init_min), collections=[vars_collection], trainable=False) max_var = model_variable( 'max', shape=min_max_shape, initializer=init_ops.constant_initializer(init_max), collections=[vars_collection], trainable=False) if not is_training: return _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range) if per_channel: if input_dim == 2: reduce_dims = [0] elif input_dim == 4: reduce_dims = [0, 1, 2] if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min(inputs, reduction_indices=reduce_dims, name='BatchMin') else: batch_min = inputs else: batch_min = math_ops.reduce_min(inputs, name='BatchMin') # TFLite requires that 0.0 if always in the [min; max] range. batch_min = math_ops.minimum(batch_min, 0.0) assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast') if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max(inputs, reduction_indices=reduce_dims, name='BatchMax') else: batch_max = inputs else: batch_max = math_ops.reduce_max(inputs, name='BatchMax') # TFLite requires that 0.0 if always in the [min; max] range. batch_max = math_ops.maximum(batch_max, 0.0) assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast') return _FakeQuantWithMinMaxVars(inputs, assign_min, assign_max, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range)
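# Hedged usage sketch for LastValueQuantize above, e.g. quantizing a conv
# weight tensor per output channel inside a training graph. The helper
# functions it calls (model_variable, _FakeQuantWithMinMaxVars) are assumed to
# be defined elsewhere in this module.
weights = tf.get_variable("conv_w", [3, 3, 16, 32])
weights_quant = LastValueQuantize(weights, per_channel=True, is_training=True,
                                  num_bits=8, name_prefix="conv_w_quant")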
def __init__(self, filters, dau_units, max_kernel_size, strides=1, data_format='channels_first', activation=None, use_bias=True, weight_initializer=init_ops.random_normal_initializer(stddev=0.1), mu1_initializer=None, mu2_initializer=None, sigma_initializer=None, bias_initializer=init_ops.zeros_initializer(), weight_regularizer=None, mu1_regularizer=None, mu2_regularizer=None, sigma_regularizer=None, bias_regularizer=None, activity_regularizer=None, weight_constraint=None, mu1_constraint=None, mu2_constraint=None, sigma_constraint=None, bias_constraint=None, trainable=True, mu_learning_rate_factor=500, dau_unit_border_bound=0.01, dau_sigma_trainable=False, name=None, **kwargs): super(DAUConv2dTF, self).__init__(trainable=trainable, name=name, activity_regularizer=activity_regularizer, **kwargs) self.rank = 2 self.filters = filters self.dau_units = utils.normalize_tuple(dau_units, self.rank, 'dau_components') self.max_kernel_size = max_kernel_size self.padding = np.floor(self.max_kernel_size/2.0) self.strides = strides self.data_format = utils.normalize_data_format(data_format) self.activation = activation self.use_bias = use_bias self.bias_initializer = bias_initializer self.bias_regularizer = bias_regularizer self.bias_constraint = bias_constraint self.weight_initializer = weight_initializer self.weight_regularizer = weight_regularizer self.weight_constraint = weight_constraint self.mu1_initializer = mu1_initializer self.mu1_regularizer = mu1_regularizer self.mu1_constraint = mu1_constraint self.mu2_initializer = mu2_initializer self.mu2_regularizer = mu2_regularizer self.mu2_constraint = mu2_constraint self.sigma_initializer = sigma_initializer self.sigma_regularizer = sigma_regularizer self.sigma_constraint = sigma_constraint if self.mu1_initializer is None: raise Exception("Must initialize MU1") if self.mu2_initializer is None: raise Exception("Must initialize MU2") if self.sigma_initializer is None: self.sigma_initializer=init_ops.constant_initializer(0.5) self.mu_learning_rate_factor = mu_learning_rate_factor self.input_spec = base.InputSpec(ndim=self.rank + 2) self.dau_unit_border_bound = dau_unit_border_bound self.num_dau_units_all = np.int32(np.prod(self.dau_units)) self.dau_weights = None self.dau_mu1 = None self.dau_mu2 = None self.dau_sigma = None self.dau_sigma_trainable = dau_sigma_trainable
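# Hedged construction sketch for DAUConv2dTF above. The mu1/mu2 initializers
# are mandatory (the constructor raises otherwise); uniform offsets inside the
# 9x9 kernel window are used here purely for illustration.
dau_layer = DAUConv2dTF(filters=32, dau_units=(2, 2), max_kernel_size=9,
                        mu1_initializer=init_ops.random_uniform_initializer(-3.0, 3.0),
                        mu2_initializer=init_ops.random_uniform_initializer(-3.0, 3.0),
                        activation=tf.nn.relu)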
def initialize_graph(self, input_statistics=None): super(StubTimeSeriesModel, self).initialize_graph( input_statistics=input_statistics) self.prior_var = variable_scope.get_variable( "prior", [], initializer=init_ops.constant_initializer(0.))
def overfeat(inputs, num_classes=1000, is_training=True, dropout_keep_prob=0.5, spatial_squeeze=True, scope='overfeat'): """Contains the model definition for the OverFeat network. The definition for the network was obtained from: OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and Yann LeCun, 2014 http://arxiv.org/abs/1312.6229 Note: All the fully_connected layers have been transformed to conv2d layers. To use in classification mode, resize input to 231x231. To use in fully convolutional mode, set spatial_squeeze to false. Args: inputs: a tensor of size [batch_size, height, width, channels]. num_classes: number of predicted classes. is_training: whether or not the model is being trained. dropout_keep_prob: the probability that activations are kept in the dropout layers during training. spatial_squeeze: whether or not should squeeze the spatial dimensions of the outputs. Useful to remove unnecessary dimensions for classification. scope: Optional scope for the variables. Returns: the last op containing the log predictions and end_points dict. """ with variable_scope.variable_scope(scope, 'overfeat', [inputs]) as sc: end_points_collection = sc.name + '_end_points' # Collect outputs for conv2d, fully_connected and max_pool2d with arg_scope( [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d], outputs_collections=end_points_collection): net = layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1') net = layers_lib.max_pool2d(net, [2, 2], scope='pool1') net = layers.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2') net = layers_lib.max_pool2d(net, [2, 2], scope='pool2') net = layers.conv2d(net, 512, [3, 3], scope='conv3') net = layers.conv2d(net, 1024, [3, 3], scope='conv4') net = layers.conv2d(net, 1024, [3, 3], scope='conv5') net = layers_lib.max_pool2d(net, [2, 2], scope='pool5') with arg_scope( [layers.conv2d], weights_initializer=trunc_normal(0.005), biases_initializer=init_ops.constant_initializer(0.1)): # Use conv2d instead of fully_connected layers. net = layers.conv2d(net, 3072, [6, 6], padding='VALID', scope='fc6') net = layers_lib.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6') net = layers.conv2d(net, 4096, [1, 1], scope='fc7') net = layers_lib.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7') net = layers.conv2d( net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, biases_initializer=init_ops.zeros_initializer(), scope='fc8') # Convert end_points_collection into a end_point dict. end_points = utils.convert_collection_to_dict( end_points_collection) if spatial_squeeze: net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed') end_points[sc.name + '/fc8'] = net return net, end_points
def testSequenceLoss(self): with self.test_session() as sess: with variable_scope.variable_scope( 'root', initializer=init_ops.constant_initializer(0.5)): batch_size = 2 sequence_length = 3 number_of_classes = 5 logits = [ constant_op.constant(i + 0.5, shape=[batch_size, number_of_classes]) for i in range(sequence_length) ] logits = array_ops.stack(logits, axis=1) targets = [ constant_op.constant(i, dtypes.int32, shape=[batch_size]) for i in range(sequence_length) ] targets = array_ops.stack(targets, axis=1) weights = [ constant_op.constant(1.0, shape=[batch_size]) for i in range(sequence_length) ] weights = array_ops.stack(weights, axis=1) average_loss_per_example = loss.sequence_loss( logits, targets, weights, average_across_timesteps=True, average_across_batch=True) res = sess.run(average_loss_per_example) self.assertAllClose(1.60944, res) average_loss_per_sequence = loss.sequence_loss( logits, targets, weights, average_across_timesteps=False, average_across_batch=True) res = sess.run(average_loss_per_sequence) compare_per_sequence = np.ones((sequence_length)) * 1.60944 self.assertAllClose(compare_per_sequence, res) average_loss_per_batch = loss.sequence_loss( logits, targets, weights, average_across_timesteps=True, average_across_batch=False) res = sess.run(average_loss_per_batch) compare_per_batch = np.ones((batch_size)) * 1.60944 self.assertAllClose(compare_per_batch, res) total_loss = loss.sequence_loss(logits, targets, weights, average_across_timesteps=False, average_across_batch=False) res = sess.run(total_loss) compare_total = np.ones( (batch_size, sequence_length)) * 1.60944 self.assertAllClose(compare_total, res)
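# The 1.60944 reference value above is just log(number_of_classes): the logits
# are identical across classes at every step, so the softmax cross-entropy is
# log(5) regardless of the target, and every averaging mode returns that value.
import numpy as np
assert np.isclose(np.log(5.0), 1.60944, atol=1e-4)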
def convolve_inputs(inputs, batch_size, height, width, channels, filters): W = get_variable('Weights', [1, 1, 1] + [channels, filters]) b = get_variable('Biases', [filters], initializer=constant_initializer(0.0)) y = conv3d(inputs, W, [1] * 5, 'SAME') + b return reshape(y, [batch_size, -1, height * width * filters])
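# Hedged usage sketch for convolve_inputs above; the from-style imports it uses
# (get_variable, constant_initializer, conv3d, reshape) are assumed to be in
# scope, and the shapes below are examples only.
frames = tf.placeholder(tf.float32, [2, 10, 16, 16, 3])  # [batch, time, h, w, c]
seq = convolve_inputs(frames, batch_size=2, height=16, width=16,
                      channels=3, filters=8)
# seq has shape [2, 10, 16 * 16 * 8]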
def _InsertCalibOp(context, name, producer, consumers, vars_collection=ops.GraphKeys.GLOBAL_VARIABLES, producer_scope=None, consumer_scope=None): """Inserts calibration ops between a producer op and (multiple) consumer ops. Args: context: Context where producer and consumer operations are nested. name: Name for the new calibration op within the context. producer: Producer operation of the pairs where calibration will be inserted. consumers: Consumer operations of the pairs. producer_scope: The restriction of producer scope. If not None, the new op will be inserted only when the producer is in this scope. consumer_scope: The restriction of consumer scope. If not None, the new op will be inserted only when all the consumers are in this scope. Raises: ValueError: When producer operation is not directly connected to the consumer operation. """ if producer_scope and not producer.name.startswith(producer_scope): logging.info( '_InsertCalibOp ignores context="%s" name="%s" ' 'because producer "%s" is not in scope "%s"', context, name, producer.name, producer_scope) return if consumer_scope: consumers_in_scope = [] for consumer in consumers: if consumer.name.startswith(consumer_scope): consumers_in_scope.append(consumer) else: logging.info( '_InsertCalibOp context="%s" name="%s" ignores ' 'consumer "%s" because it is not in scope "%s"', context, name, consumer.name, consumer_scope) return consumers = consumers_in_scope name_prefix = _AddContextToName(context, name) name_scope = ops.get_name_scope() if name_scope: name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/') inputs = producer.outputs[0] # Prevent ops from being modified multiple times. Bypass ops can sometimes # overlap between multiple matches, so we need to ensure that we don't # add duplicate calibration operations. #if _FollowedByFakeQuant(inputs): # return with variable_scope.variable_scope(None, default_name=name_prefix, values=[inputs]) as scope: # Currently no per channel. min_max_shape = [] vars_collections = [vars_collection] if vars_collection else [] min_var = _ModelVariable('min', shape=min_max_shape, initializer=init_ops.constant_initializer( float('inf')), collections=vars_collections, trainable=False) max_var = _ModelVariable( 'max', shape=min_max_shape, initializer=init_ops.constant_initializer(-float('inf')), collections=vars_collections, trainable=False) batch_min = math_ops.reduce_min(inputs, name='BatchMin') batch_max = math_ops.reduce_max(inputs, name='BatchMax') range_min = math_ops.minimum(batch_min, min_var, name=name_prefix + '/range_min') range_max = math_ops.maximum(batch_max, max_var, name=name_prefix + '/range_max') return range_min, range_max
def _batchnorm(self, input_x, scope, \ gamma_value, beta_value,\ moving_mean_value, moving_variance_value,\ is_training): """ Wrapper function for batch normalization. """ with variable_scope.variable_scope(scope): gamma_initial = init_ops.constant_initializer( gamma_value, dtypes.float32) gamma = gap_finetune.get_variable(\ name='gamma', shape=gamma_value.shape, dtype=dtypes.float32, initializer=gamma_initial, gap=self.gap, gap_vars=self.gap_vars) beta_initial = init_ops.constant_initializer( beta_value, dtypes.float32) beta = gap_finetune.get_variable(\ name='beta', shape=beta_value.shape, dtype=dtypes.float32, initializer=beta_initial, gap=self.gap, gap_vars=self.gap_vars) moving_mean_initial = init_ops.constant_initializer(\ moving_mean_value, dtypes.float32) moving_mean = gap_finetune.get_variable(\ name='moving_mean', shape=moving_mean_value.shape, dtype=dtypes.float32, initializer=moving_mean_initial, gap=self.gap, gap_vars=self.gap_vars) moving_variance_initial = init_ops.constant_initializer(\ moving_variance_value, dtypes.float32) moving_variance = gap_finetune.get_variable(\ name='moving_variance', shape=moving_variance_value.shape, dtype=dtypes.float32, initializer=moving_variance_initial, gap=self.gap, gap_vars=self.gap_vars) def mean_var_with_update(): mean, variance = nn_impl.moments(input_x, [0, 1, 2], name='moments') with ops.control_dependencies([\ moving_averages.assign_moving_average(\ moving_mean, mean, 0.9), moving_averages.assign_moving_average(\ moving_variance, variance, 0.9)]): return array_ops.identity(mean), array_ops.identity( variance) mean, variance = control_flow_ops.cond(is_training, \ mean_var_with_update, \ lambda: (moving_mean, moving_variance)) out = nn_impl.batch_normalization(input_x, mean, variance, beta, gamma, 0.001) return out
def _create_slots(self, var_list): for var in var_list: dtype = var.dtype.base_dtype init = init_ops.constant_initializer( self._initial_accumulator_value, dtype=dtype) self.add_slot(var, 'accumulator', init)
def alexnet_v2(inputs, is_training=True, dropout_keep_prob=0.5, scope='alexnet_v2'): """Modified version of AlexNet version 2 with a deconvolutional expanding path for semantic segmentation. Described in: http://arxiv.org/pdf/1404.5997v2.pdf Note: All the fully_connected layers have been transformed to conv2d layers. Args: inputs: a tensor of size [batch_size, 227, 227, 3]. is_training: whether or not the model is being trained. dropout_keep_prob: the probability that activations are kept in the dropout layers during training. scope: Optional scope for the variables. Returns: The last layer containing a segmentation map of an image. """ net = layers.conv2d(inputs, 96, [11, 11], 4, padding='VALID', scope='conv1') net = layers.conv2d(net, 192, 3, 2, padding='VALID', scope='pconv1') net = layers.conv2d(net, 192, [5, 5], padding='VALID', scope='conv2') net = layers.conv2d(net, 384, 3, 2, padding='VALID', scope='pconv2') net = layers.conv2d(net, 384, [3, 3], padding='VALID', scope='conv3') net = layers.conv2d(net, 384, [3, 3], padding='VALID', scope='conv4') net = layers.conv2d(net, 256, [3, 3], padding='VALID', scope='conv5') # Convolution net with arg_scope([layers.conv2d], weights_initializer=trunc_normal(0.005), biases_initializer=init_ops.constant_initializer(0.1)): net = layers.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6') net = layers_lib.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6') net = layers.conv2d(net, 4096, [1, 1], scope='fc7') net = layers_lib.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7') net = layers.conv2d( net, 2, [1, 1], # Prediction is either 'car' or 'background' for Carvana. padding='VALID', activation_fn=tf.nn.sigmoid, biases_initializer=init_ops.zeros_initializer(), scope='fc8') # Deconvolution net with arg_scope([layers.conv2d_transpose], padding='VALID', activation_fn=nn_ops.relu): net = layers.conv2d_transpose(net, 4096, 1, scope='convt9') net = layers.conv2d_transpose(net, 4096, 1, scope='convt10') net = layers.conv2d_transpose(net, 256, 5, scope='convt11') net = layers.conv2d_transpose(net, 384, 3, scope='convt12') net = layers.conv2d_transpose(net, 384, 3, scope='convt13') net = layers.conv2d_transpose(net, 384, 3, scope='convt14') net = layers.conv2d_transpose(net, 192, 3, 2, scope='convt15') net = layers.conv2d_transpose(net, 192, 5, scope='convt16') net = layers.conv2d_transpose(net, 96, 3, 2, scope='convt17') net = layers.conv2d_transpose(net, 2, 11, 4, activation_fn=tf.nn.sigmoid, scope='convt18') return net
def __init__(self, hparams, item, cluster, controller_id=0): """HierarchicalController class initializer. Args: hparams: All hyper-parameters. item: The metagraph to place. cluster: The cluster of hardware devices to optimize for. controller_id: the id of the controller in a multi-controller setup. """ super(HierarchicalController, self).__init__(item, cluster) self.ctrl_id = controller_id self.hparams = hparams if self.hparams.num_groups is None: self.num_groups = min(256, 20 * self.num_devices) else: self.num_groups = self.hparams.num_groups # creates self.op_embeddings and self.type_dict self.create_op_embeddings(verbose=False) # TODO(azalia) clean up embedding/group_embedding_size names self.group_emb_size = ( 2 * self.num_groups + len(self.type_dict) + self.hparams.max_num_outputs * self.hparams.max_output_size) self.embedding_size = self.group_emb_size self.initializer = init_ops.glorot_uniform_initializer( seed=self.hparams.seed) with variable_scope.variable_scope( self.hparams.name, initializer=self.initializer, reuse=variable_scope.AUTO_REUSE): # define parameters of feedforward variable_scope.get_variable("w_grouping_ff", [ 1 + self.hparams.max_num_outputs * self.hparams.max_output_size + self.hparams.adj_embed_dim, self.hparams.grouping_hidden_size ]) variable_scope.get_variable( "w_grouping_softmax", [self.hparams.grouping_hidden_size, self.num_groups]) if self.hparams.bi_lstm: variable_scope.get_variable("encoder_lstm_forward", [ self.embedding_size + self.hparams.hidden_size // 2, 2 * self.hparams.hidden_size ]) variable_scope.get_variable("encoder_lstm_backward", [ self.embedding_size + self.hparams.hidden_size // 2, 2 * self.hparams.hidden_size ]) variable_scope.get_variable( "device_embeddings", [self.num_devices, self.hparams.hidden_size]) variable_scope.get_variable( "decoder_lstm", [2 * self.hparams.hidden_size, 4 * self.hparams.hidden_size]) variable_scope.get_variable( "device_softmax", [2 * self.hparams.hidden_size, self.num_devices]) variable_scope.get_variable("device_go_embedding", [1, self.hparams.hidden_size]) variable_scope.get_variable( "encoder_forget_bias", shape=1, dtype=dtypes.float32, initializer=init_ops.constant_initializer( self.hparams.forget_bias_init)) variable_scope.get_variable( "decoder_forget_bias", shape=1, dtype=dtypes.float32, initializer=init_ops.constant_initializer( self.hparams.forget_bias_init)) variable_scope.get_variable( "attn_w_1", [self.hparams.hidden_size, self.hparams.hidden_size]) variable_scope.get_variable( "attn_w_2", [self.hparams.hidden_size, self.hparams.hidden_size]) variable_scope.get_variable("attn_v", [self.hparams.hidden_size, 1]) else: variable_scope.get_variable("encoder_lstm", [ self.embedding_size + self.hparams.hidden_size, 4 * self.hparams.hidden_size ]) variable_scope.get_variable( "device_embeddings", [self.num_devices, self.hparams.hidden_size]) variable_scope.get_variable( "decoder_lstm", [2 * self.hparams.hidden_size, 4 * self.hparams.hidden_size]) variable_scope.get_variable( "device_softmax", [2 * self.hparams.hidden_size, self.num_devices]) variable_scope.get_variable("device_go_embedding", [1, self.hparams.hidden_size]) variable_scope.get_variable( "encoder_forget_bias", shape=1, dtype=dtypes.float32, initializer=init_ops.constant_initializer( self.hparams.forget_bias_init)) variable_scope.get_variable( "decoder_forget_bias", shape=1, dtype=dtypes.float32, initializer=init_ops.constant_initializer( self.hparams.forget_bias_init)) variable_scope.get_variable( "attn_w_1", [self.hparams.hidden_size, 
self.hparams.hidden_size]) variable_scope.get_variable( "attn_w_2", [self.hparams.hidden_size, self.hparams.hidden_size]) variable_scope.get_variable("attn_v", [self.hparams.hidden_size, 1]) seq2seq_input_layer = array_ops.placeholder_with_default( array_ops.zeros([self.hparams.num_children, self.num_groups, self.group_emb_size], dtypes.float32), shape=(self.hparams.num_children, self.num_groups, self.group_emb_size)) self.seq2seq_input_layer = seq2seq_input_layer
def optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None, colocate_gradients_with_ops=False): """Given loss and parameters for optimizer, returns a training op. Various ways of passing optimizers, include: - string, name of the optimizer like 'SGD', 'Adam', see OPTIMIZER_CLS_NAMES for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - function, takes learning rate `Tensor` as argument and must return `Optimizer` instance. E.g. `optimize_loss(..., optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. Alternatively, if `learning_rate` is `None`, the function takes no arguments. E.g. `optimize_loss(..., learning_rate=None, optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - class, subclass of `Optimizer` that takes only one required argument - learning rate, such as AdamOptimizer, AdagradOptimizer. E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`. - object, instance of subclass of `Optimizer`. E.g., `optimizer_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. Args: loss: Tensor, 0 dimensional. global_step: Tensor, step counter for each update. learning_rate: float or Tensor, magnitude of update per each training step. optimizer: string, class or optimizer instance, used as trainer. string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. class should be sub-class of `tf.Optimizer` that implements `compute_gradients` and `apply_gradients` functions. optimizer instance should be instantiation of `tf.Optimizer` sub-class and have `compute_gradients` and `apply_gradients` functions. gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this value. gradient_multipliers: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. clip_gradients: float or `None`, clips gradients by this value. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay functions. For example: `tf.train.exponential_decay`. update_ops: list of update `Operation`s to execute at each step. If `None`, uses elements of UPDATE_OPS collection. The order of execution between `update_ops` and `loss` is non-deterministic. variables: list of variables to optimize or `None` to use all trainable variables. name: The name for this operation is used to scope operations and summaries. summaries: List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. Returns: Training op. Raises: ValueError: if optimizer is wrong type. """ with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): # Update ops take UPDATE_OPS collection if not provided. if update_ops is None: update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) # Make sure update ops are ran before computing loss. if update_ops: loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Learning rate variable, with possible decay. 
lr = None if learning_rate is not None: if (isinstance(learning_rate, ops.Tensor) and learning_rate.get_shape().ndims == 0): lr = learning_rate elif isinstance(learning_rate, float): lr = vs.get_variable( "learning_rate", [], trainable=False, initializer=init_ops.constant_initializer(learning_rate)) else: raise ValueError( "Learning rate should be 0d Tensor or float. " "Got %s of type %s" % (str(learning_rate), str(type(learning_rate)))) if summaries is None: summaries = ["loss", "learning_rate"] if learning_rate is not None and learning_rate_decay_fn is not None: lr = learning_rate_decay_fn(lr, global_step) if "learning_rate" in summaries: logging_ops.scalar_summary("learning_rate", lr) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): if lr is None: raise ValueError( "Learning rate is None, but should be specified if " "optimizer is string (%s)." % optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( "Optimizer name should be one of [%s], you provided %s." % (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif (isinstance(optimizer, type) and issubclass(optimizer, optimizer_.Optimizer)): if lr is None: raise ValueError( "Learning rate is None, but should be specified if " "optimizer is class (%s)." % optimizer) opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer elif callable(optimizer): if learning_rate is not None: opt = optimizer(lr) else: opt = optimizer() if not isinstance(opt, optimizer_.Optimizer): raise ValueError( "Unrecognized optimizer: function should return " "subclass of Optimizer. Got %s." % str(opt)) else: raise ValueError( "Unrecognized optimizer: should be string, " "subclass of Optimizer, instance of " "subclass of Optimizer or function with one argument. " "Got %s." % str(optimizer)) # All trainable variables, if specific variables are not specified. if variables is None: variables = vars_.trainable_variables() # Compute gradients. gradients = opt.compute_gradients( loss, variables, colocate_gradients_with_ops=colocate_gradients_with_ops) # Optionally add gradient noise. if gradient_noise_scale is not None: gradients = _add_scaled_noise_to_gradients(gradients, gradient_noise_scale) # Multiply some gradients. if gradient_multipliers is not None: gradients = _multiply_gradients(gradients, gradient_multipliers) # Optionally clip gradients by global norm. if clip_gradients is not None: gradients = _clip_gradients_by_norm(gradients, clip_gradients) # Add scalar summary for loss. if "loss" in summaries: logging_ops.scalar_summary("loss", loss) # Add histograms for variables, gradients and gradient norms. for gradient, variable in gradients: if isinstance(gradient, ops.IndexedSlices): grad_values = gradient.values else: grad_values = gradient if grad_values is not None: if "gradients" in summaries: logging_ops.histogram_summary(variable.name + "/gradients", grad_values) if "gradient_norm" in summaries: logging_ops.histogram_summary( variable.name + "/gradient_norm", clip_ops.global_norm([grad_values])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor
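# Hedged usage sketch for this newer optimize_loss: unlike the variant earlier
# in the file, it also accepts a callable optimizer, in which case
# learning_rate may be None and the callable takes no arguments. The toy loss
# below is invented for illustration.
x = tf.placeholder(tf.float32, [None, 3])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.get_variable("w_callable_demo", [3, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
global_step = tf.train.get_or_create_global_step()
train_op = optimize_loss(
    loss, global_step, learning_rate=None,
    optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))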
N.TimeDelayedConv(n_new_features=512, n_time_context=5), N.TimeDelayedConv(n_new_features=512, n_time_context=5), N.TimeDelayedConv(n_new_features=512, n_time_context=7), N.Dense(512), N.BatchNorm(), N.Dense(1500), N.BatchNorm(), N.StatsPool(axes=1, output_mode='concat'), N.Flatten(outdim=2), N.Dense(512, name="LatentOutput"), N.BatchNorm(), N.Dense(512), N.BatchNorm(), N.Dense(n_speakers, activation=K.linear, b_init=init_ops.constant_initializer(value=0)) ], debug=1) # ====== create outputs ====== # y_logit = x_vec(X) y_proba = tf.nn.softmax(y_logit) z = K.ComputationGraph(y_proba).get(roles=N.Dense, scope='LatentOutput', beginning_scope=False)[0] print('Latent space:', ctext(z, 'cyan')) # ====== create loss ====== # ce = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=y_logit) acc = K.metrics.categorical_accuracy(y_true=y, y_pred=y_proba) # ====== params and optimizing ====== # updates = K.optimizers.Adam(lr=0.0001, name='XAdam').minimize( loss=ce,
clockwork_mask = tf.constant(mask_tril, dtype=tf.float32, name="mask")
#clockwork_mask = tf.constant(mask_triu, dtype=tf.float32, name="mask")

# define parameters
with tf.variable_scope("input"):
    input_w = tf.get_variable("i_w", [n_input, n_hidden])
    input_b = tf.get_variable("i_b", [n_hidden])
with tf.variable_scope("hidden"):
    hidden_w = tf.get_variable("h_w", [n_hidden, n_hidden])
    hidden_b = tf.get_variable("h_b", [n_hidden])
with tf.variable_scope("bias_all"):
    bias_all = tf.get_variable("b_all", [n_hidden],
                               initializer=init_ops.constant_initializer(0.0))

output_w = {
    'out_w': tf.Variable(tf.random_normal([n_hidden, n_classes]))
    #'out_w': tf.get_variable("o_w",[n_hidden, n_classes])
}
output_b = {
    'out_b': tf.Variable(tf.random_normal([n_classes]))
    #'out_b': tf.get_variable("o_b",[n_classes])
}

# Construct network
#def RNN(x,state):
def RNN(x, output_w, output_b):
    x = tf.transpose(x, [1, 0, 2])
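# ---------------------------------------------------------------------------
# Hedged sketch (assumption): `mask_tril` / `mask_triu` are defined earlier in
# the original script and are not shown here. In clockwork-style RNNs such a
# mask is typically a 0/1 (block-)triangular matrix that is multiplied into
# the recurrent weight matrix to disable the disallowed hidden-to-hidden
# connections. A minimal illustration of building and applying a lower
# triangular mask (all `demo_*` names are hypothetical):
import numpy as np
import tensorflow as tf

n_hidden_demo = 8  # illustrative size only
demo_mask_tril = np.tril(np.ones((n_hidden_demo, n_hidden_demo),
                                 dtype=np.float32))
demo_mask = tf.constant(demo_mask_tril, dtype=tf.float32, name="demo_mask")

# Element-wise product keeps only the connections permitted by the mask.
demo_hidden_w = tf.get_variable("h_w_demo", [n_hidden_demo, n_hidden_demo])
demo_masked_w = tf.multiply(demo_mask, demo_hidden_w)
# ---------------------------------------------------------------------------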
def linear(args, output_size, bias, bias_initializer=None,
           kernel_initializer=None):
  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch x n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.

  Returns:
    A 2D Tensor with shape [batch x output_size] equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

  Raises:
    ValueError: if some of the arguments have unspecified or wrong shape.
  """
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("`args` must be specified")
  if not nest.is_sequence(args):
    args = [args]

  # Calculate the total size of arguments on dimension 1.
  total_arg_size = 0
  shapes = [a.get_shape() for a in args]
  for shape in shapes:
    if shape.ndims != 2:
      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
    if shape[1].value is None:
      raise ValueError("linear expects shape[1] to be provided for shape %s, "
                       "but saw %s" % (shape, shape[1]))
    else:
      total_arg_size += shape[1].value

  dtype = [a.dtype for a in args][0]

  # Now the computation.
  scope = vs.get_variable_scope()
  with vs.variable_scope(scope) as outer_scope:
    weights = vs.get_variable(
        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
        dtype=dtype,
        initializer=kernel_initializer)
    if len(args) == 1:
      res = math_ops.matmul(args[0], weights)
    else:
      res = math_ops.matmul(array_ops.concat(args, 1), weights)
    if not bias:
      return res
    with vs.variable_scope(outer_scope) as inner_scope:
      inner_scope.set_partitioner(None)
      if bias_initializer is None:
        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
      biases = vs.get_variable(
          _BIAS_VARIABLE_NAME, [output_size],
          dtype=dtype,
          initializer=bias_initializer)
    return nn_ops.bias_add(res, biases)
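# ---------------------------------------------------------------------------
# Hedged usage sketch: `linear` above concatenates its inputs along the
# feature axis and multiplies by a single weight matrix, the pattern RNN cells
# use for their gate computations. Assuming `linear` and the module-level
# names it depends on (_WEIGHTS_VARIABLE_NAME, _BIAS_VARIABLE_NAME, vs, etc.)
# are in scope, a call combining an input and a hidden state might look like:
import tensorflow as tf

with tf.variable_scope("linear_demo",
                       initializer=tf.constant_initializer(0.5)):
  x = tf.zeros([1, 3])   # batch x input_size
  h = tf.zeros([1, 2])   # batch x num_units
  out = linear([x, h], output_size=2, bias=True)  # result shape [1, 2]
# ---------------------------------------------------------------------------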
def build(self, input_shape):
  """Create variables of the Cudnn RNN.

  It can be called manually before `__call__()` or automatically through
  `__call__()`. In the former case, subsequent `__call__()`s will skip
  creating variables.

  Args:
    input_shape: network input tensor shape, a python list or a TensorShape
      object with 3 dimensions.

  Raises:
    ValueError: if input_shape has wrong dimension or unknown 3rd dimension.
  """
  if self.built:
    return

  input_shape = tensor_shape.TensorShape(input_shape)
  if input_shape.ndims != 3:
    raise ValueError("Expecting input_shape with 3 dims, got %d" %
                     input_shape.ndims)
  if input_shape[-1].value is None:
    raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                     "should be defined. Found `None`.")
  self._input_size = input_shape[-1].value
  self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size})

  self._set_scope(None)

  # Not using base class `add_variable()` since it calls `tf.get_variable()`
  # with a callable initializer, whereas here the initializer is a tensor.
  # The difference is mandated to support forward-compatibility with Cudnn.
  with vs.variable_scope(
      self._scope,
      reuse=self.built,
      custom_getter=self._update_trainable_weights):
    if self._kernel_initializer is None:
      self._kernel_initializer = init_ops.glorot_uniform_initializer(
          seed=self._seed, dtype=self._plain_dtype)
    if self._bias_initializer is None:
      self._bias_initializer = init_ops.constant_initializer(
          0.0, dtype=self._plain_dtype)

    weights = [
        self._kernel_initializer(sp, dtype=self._plain_dtype)
        for sp in self.canonical_weight_shapes
    ]
    biases = [
        self._bias_initializer(sp, dtype=self._plain_dtype)
        for sp in self.canonical_bias_shapes
    ]
    opaque_params_t = self._canonical_to_opaque(weights, biases)

    if vs.get_variable_scope().partitioner is not None:
      logging.warn(
          "Partitioner is not supported for Cudnn RNN layer variables, using "
          "it will create forward-compatibility issues with future "
          "CUDA/CuDNN generations.")
    # Initialize opaque params with a tensor.
    self.kernel = vs.get_variable(
        "opaque_kernel", dtype=self._plain_dtype,
        initializer=opaque_params_t, validate_shape=False)
  # Create saveable in the outer scope of the cudnn subgraph, such that
  # alternative subgraph with platform-independent rnn cells can load the
  # checkpoints directly.
  if not (self.built or vs.get_variable_scope().reuse is True):
    self._create_saveable()
  self.built = True
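# ---------------------------------------------------------------------------
# Hedged usage sketch: `build()` above belongs to the Cudnn RNN layers in
# tf.contrib.cudnn_rnn. Assuming a TF1.x build with CUDA/cuDNN available,
# variable creation is normally triggered implicitly by the first call on a
# time-major [time, batch, input_size] input; the layer sizes below are
# illustrative only.
import tensorflow as tf

lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=2, num_units=128)
inputs = tf.zeros([20, 32, 64])   # [time, batch, input_size]
outputs, states = lstm(inputs)    # build() runs here, creating the
                                  # opaque_kernel variable described above
# ---------------------------------------------------------------------------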