def model_fn():
  """Builds partitioned variables `x` and `y` and checks op/variable devices.

  Returns:
    A tuple `(x_add, y_add)` holding the results of the two `assign_add`
    calls on the partitioned variables.
  """
  increment = constant_op.constant([3.0, 5.0])

  # The device scope is ignored for variables but not for normal ops.
  with ops.device('/job:worker/task:0'):
    x = variable_scope.get_variable(
        'x',
        initializer=constant_op.constant([10.0, 20.0]),
        aggregation=variable_scope.VariableAggregation.SUM,
        partitioner=partitioner)
    x_add = x.assign_add(increment, name='x_add')
    # Partition `i` of x is expected on ps task `i`, colocated with its
    # update op.
    for index, shard in enumerate(x):
      expected_device = '/job:ps/task:%d' % index
      self.assertEqual(shard.device, expected_device)
      self.assertEqual(shard.device, x_add[index].device)

  # The colocate_vars_with can override the distribution's device.
  with d.colocate_vars_with(x_add[0]):
    y = variable_scope.get_variable(
        'y',
        initializer=constant_op.constant([20.0, 10.0]),
        aggregation=variable_scope.VariableAggregation.SUM,
        partitioner=partitioner)
  y_updates = [array_ops.identity(x_add[0]), array_ops.identity(x_add[1])]
  y_add = y.assign_add(y_updates)

  # Every partition of y must sit on ps task 0, next to x_add[0].
  for index, shard in enumerate(y):
    self.assertEqual(shard.device, '/job:ps/task:0')
    self.assertEqual(y_add[index].device, shard.device)
    self.assertEqual(shard.device, x_add[0].device)

  return x_add, y_add
def testInitFromCheckpoint(self):
  """Checks `init_from_checkpoint` with name, scope and variable mappings."""
  checkpoint_dir = self.get_temp_dir()
  with self.test_session() as session:
    v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)

  # New graph and session.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
    with variable_scope.variable_scope("some_scope"):
      my1 = variable_scope.get_variable("my1", [1, 10])
      with variable_scope.variable_scope("some_other_scope"):
        my2 = variable_scope.get_variable("my2", [10, 10])
        with variable_scope.variable_scope("other_useful_scope"):
          my4 = variable_scope.get_variable("var4", [9, 9])
    my3 = variable_scope.get_variable("my3", [100, 100])

    # First mapping: one variable by name plus a whole scope prefix.
    scope_assignments = {
        "var1": "some_scope/my1",
        "useful_scope/": "some_scope/some_other_scope/other_useful_scope/",
    }
    checkpoint_utils.init_from_checkpoint(checkpoint_dir, scope_assignments)

    # Second mapping: one variable by name and one by variable object.
    variable_assignments = {
        "var2": "some_scope/some_other_scope/my2",
        "var3": my3,
    }
    checkpoint_utils.init_from_checkpoint(checkpoint_dir, variable_assignments)

    session.run(variables.global_variables_initializer())
    self.assertAllEqual(my1.eval(session), v1)
    self.assertAllEqual(my2.eval(session), v2)
    self.assertAllEqual(my3.eval(session), v3)
    self.assertAllEqual(my4.eval(session), v4)

    # Check that tensors are not explicitly in the graph.
    graph_def_text = str(session.graph.as_graph_def())
    self.assertLess(len(graph_def_text), 29000)
def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name):
  """Dense layer with quantized outputs.

  The matmul result and the bias-added result are each passed through
  `_Quantize` with the same `quantization_range`.

  Args:
    x: input to the dense layer
    num_inputs: number of input columns of x
    num_outputs: number of output columns
    quantization_range: the min/max range for quantization
    name: name of the variable scope

  Returns:
    The output of the layer.
  """
  with variable_scope.variable_scope(name):
    weights = variable_scope.get_variable(
        'kernel',
        shape=[num_inputs, num_outputs],
        dtype=dtypes.float32,
        initializer=keras.initializers.glorot_uniform())
    offsets = variable_scope.get_variable(
        'bias',
        shape=[num_outputs],
        dtype=dtypes.float32,
        initializer=keras.initializers.zeros())
    projected = _Quantize(math_ops.matmul(x, weights), quantization_range)
    shifted = _Quantize(nn.bias_add(projected, offsets), quantization_range)
  return shifted
def register_option2quants(self, damping):
  """Registers the three "option 2" variables for the given damping value.

  Calls `register_cov_dt1()` first. If `damping` has no entry yet in
  `self._option2quants_by_damping`, three non-trainable variables are created
  under `self._var_scope` and stored there as a `(Pmat, Kmat, mu)` tuple.

  Args:
    damping: the damping value; used as the dictionary key and, through
      `scalar_or_tensor_to_string`, in the variable names.
  """
  self.register_cov_dt1()
  if damping in self._option2quants_by_damping:
    return

  # It's questionable as to whether we should initialize with stuff like
  # this at all.  Ideally these values should never be used until they are
  # updated at least once.
  suffix = scalar_or_tensor_to_string(damping)

  def _new_quantity(name_template, initializer, shape):
    return variable_scope.get_variable(
        name_template.format(suffix),
        initializer=initializer,
        shape=shape,
        trainable=False,
        dtype=self._dtype)

  with variable_scope.variable_scope(self._var_scope):
    # Note that the first variable is stored under the name "Lmat_damp...".
    p_mat = _new_quantity("Lmat_damp{}", inverse_initializer, self._cov_shape)
    k_mat = _new_quantity("Kmat_damp{}", inverse_initializer, self._cov_shape)
    mu = _new_quantity(
        "mu_damp{}", init_ops.ones_initializer, self._vec_shape)

  self._option2quants_by_damping[damping] = (p_mat, k_mat, mu)
def _auc_hist_accumulate(hist_true, hist_false, nbins, collections):
  """Accumulate histograms in new variables.

  Args:
    hist_true: histogram of scores for records labeled True.
    hist_false: histogram of scores for records labeled False.
    nbins: length of the accumulator variables.
    collections: collections passed to `get_variable` for both accumulators.

  Returns:
    A tuple `(hist_true_acc, hist_false_acc, update_op)` where `update_op`
    groups the two `assign_add` operations.
  """
  scope_inputs = [hist_true, hist_false]
  with variable_scope.variable_op_scope(scope_inputs, None, 'hist_accumulate'):

    def _running_total(var_name, hist):
      # Non-trainable, zero-initialized accumulator with `hist`'s dtype.
      return variable_scope.get_variable(
          var_name,
          initializer=array_ops.zeros_initializer([nbins], dtype=hist.dtype),
          collections=collections,
          trainable=False)

    # Holds running total histogram of scores for records labeled True.
    hist_true_acc = _running_total('hist_true_acc', hist_true)
    # Holds running total histogram of scores for records labeled False.
    hist_false_acc = _running_total('hist_false_acc', hist_false)

    add_true = hist_true_acc.assign_add(hist_true)
    add_false = hist_false_acc.assign_add(hist_false)
    update_op = control_flow_ops.group(add_true, add_false, name='update_op')

    return hist_true_acc, hist_false_acc, update_op
def _between_graph_with_monitored_session(self, strategy):
  """Runs one train step in a `MonitoredSession` and checks the variables.

  Args:
    strategy: not used by this function.
  """
  worker_context = distribute_coordinator_context.get_current_worker_context()
  self.assertTrue(worker_context is not None)

  # TODO(yuefengz): investigate why not using resource variable will make
  # the test flaky.
  with ops.device("/job:ps/task:0"):
    x = variable_scope.get_variable("xx", initializer=10.0, use_resource=True)
  with ops.device("/job:ps/task:1"):
    y = variable_scope.get_variable("yy", initializer=20.0, use_resource=True)

  train_op = control_flow_ops.group([x.assign_add(2.0), y.assign_sub(2.0)])

  # The monitored session will run init or ready ops.
  with monitored_session.MonitoredSession() as sess:
    sess.run(train_op)

    # Synchronize workers after one step to make sure they all have finished
    # training.
    if not worker_context.has_barrier:
      self._barrier.wait()
    else:
      worker_context.wait_for_other_workers()

    x_val, y_val = sess.run([x, y])

    self.assertEqual(x_val, 16.0)
    self.assertEqual(y_val, 14.0)
    if x_val == 16.0 and y_val == 14.0:
      with self._lock:
        self._result_correct += 1
def __call__(self, x, states_prev, scope=None):
  """Long short-term memory cell (LSTM).

  Args:
    x: input tensor; must have rank 2 with a known second dimension.
    states_prev: a pair `(cs_prev, h_prev)`.
    scope: optional variable scope; defaults to `self._names["scope"]`.

  Returns:
    A tuple `(h, (cs, h))` built from the outputs of `_lstm_block_cell`.

  Raises:
    ValueError: if the second dimension of `x` is not set, or if
      `states_prev` does not have length 2.
  """
  with vs.variable_scope(scope or self._names["scope"]):
    x_shape = x.get_shape().with_rank(2)
    input_size = x_shape[1]
    # Indexing a shape may yield either a plain int/None or a `Dimension`
    # wrapper, depending on the TensorFlow version. A wrapper object can be
    # truthy even when its value is unknown, so test the unwrapped value.
    if not getattr(input_size, "value", input_size):
      raise ValueError("Expecting x_shape[1] to be set: %s" % str(x_shape))
    if len(states_prev) != 2:
      raise ValueError("Expecting states_prev to be a tuple with length 2.")

    w = vs.get_variable(self._names["W"], [input_size + self._num_units,
                                           self._num_units * 4])
    b = vs.get_variable(
        self._names["b"], [w.get_shape().with_rank(2)[1]],
        initializer=init_ops.constant_initializer(0.0))

    if self._use_peephole:
      wci = vs.get_variable(self._names["wci"], [self._num_units])
      wco = vs.get_variable(self._names["wco"], [self._num_units])
      wcf = vs.get_variable(self._names["wcf"], [self._num_units])
    else:
      # Without peepholes a single zero vector is shared by all three weights.
      wci = wco = wcf = array_ops.zeros([self._num_units])

    (cs_prev, h_prev) = states_prev
    (_, cs, _, _, _, _, h) = _lstm_block_cell(
        x,
        cs_prev,
        h_prev,
        w,
        b,
        wci=wci,
        wco=wco,
        wcf=wcf,
        forget_bias=self._forget_bias,
        use_peephole=self._use_peephole)

    return (h, (cs, h))
def testInvalidGlobalStep(self):
  """Checks that `optimize_loss` rejects three kinds of bad `global_step`."""
  with ops.Graph().as_default() as g, self.test_session(graph=g):
    x = array_ops.placeholder(dtypes.float32, [])
    var = variable_scope.get_variable(
        "test", [], initializer=init_ops.constant_initializer(10))
    loss = math_ops.abs(var * x)

    # Each bad global step is created inside its `assertRaises` block so an
    # error raised while creating it is attributed exactly as before.

    # A constant tensor instead of a variable.
    with self.assertRaises(AttributeError):
      constant_step = constant_op.constant(43, dtype=dtypes.int64)
      optimizers_lib.optimize_loss(
          loss,
          global_step=constant_step,
          learning_rate=0.1,
          optimizer="SGD")

    # A float64 scalar variable.
    with self.assertRaises(TypeError):
      float_step = variable_scope.get_variable(
          "global_step", [],
          trainable=False,
          dtype=dtypes.float64,
          initializer=init_ops.constant_initializer(
              0.0, dtype=dtypes.float64))
      optimizers_lib.optimize_loss(
          loss,
          global_step=float_step,
          learning_rate=0.1,
          optimizer="SGD")

    # An int64 variable of shape [1].
    with self.assertRaises(ValueError):
      vector_step = variable_scope.get_variable(
          "global_step", [1],
          trainable=False,
          dtype=dtypes.int64,
          initializer=init_ops.constant_initializer(
              [0], dtype=dtypes.int64))
      optimizers_lib.optimize_loss(
          loss,
          global_step=vector_step,
          learning_rate=0.1,
          optimizer="SGD")
def build(self, input_shape):
  """Creates the `kernel` and, if `use_bias` is set, the `bias` variable.

  Args:
    input_shape: shape of the layer input; converted to a `TensorShape`.

  Raises:
    ValueError: if the rank is unknown, the rank is below 2, or the last
      dimension is undefined.
  """
  shape = tensor_shape.TensorShape(input_shape)
  if shape.ndims is None:
    raise ValueError('Inputs to `Dense` should have known rank.')
  if len(shape) < 2:
    raise ValueError('Inputs to `Dense` should have rank >= 2.')
  last_dim = shape[-1].value
  if last_dim is None:
    raise ValueError('The last dimension of the inputs to `Dense` '
                     'should be defined. Found `None`.')

  # Note that we set `trainable=True` because this is a trainable
  # weight of the layer. If the layer is not trainable
  # (self.trainable = False), the variable will not be added to
  # tf.trainable_variables(), and self.trainable_weights will be empty.
  self.kernel = vs.get_variable(
      'kernel',
      shape=[last_dim, self.units],
      initializer=self.kernel_initializer,
      regularizer=self.kernel_regularizer,
      dtype=self.dtype,
      trainable=True)

  if not self.use_bias:
    self.bias = None
    return
  self.bias = vs.get_variable(
      'bias',
      shape=[self.units,],
      initializer=self.bias_initializer,
      regularizer=self.bias_regularizer,
      dtype=self.dtype,
      trainable=True)
def testAllowsReuseWithoutPartitioner(self):
  """A partitioned variable can be fetched again from a reuse-only scope."""
  partitioned_scope = variable_scope.variable_scope(
      "scope0", partitioner=axis0_into2_partitioner)
  with partitioned_scope:
    created = variable_scope.get_variable("name0", shape=(3, 1, 1))

  # Re-enter the scope with reuse=True and no partitioner.
  with variable_scope.variable_scope("scope0", reuse=True):
    fetched = variable_scope.get_variable("name0")

  self.assertEqual(created, fetched)
def _testPartitionConcatenatesAlongCorrectAxis(self, use_resource):
  """Checks partition shapes for variables split along axis 0 and axis 1.

  Args:
    use_resource: forwarded to the enclosing `variable_scope`.
  """

  def _part_axis_0(**unused_kwargs):
    return (2, 1, 1)

  def _part_axis_1(**unused_kwargs):
    return (1, 2, 1)

  full_shape = (2, 2, 2)
  with variable_scope.variable_scope("root", use_resource=use_resource):
    v0 = variable_scope.get_variable(
        "n0", shape=full_shape, partitioner=_part_axis_0)
    v1 = variable_scope.get_variable(
        "n1", shape=full_shape, partitioner=_part_axis_1)

  self.assertEqual(v0.get_shape(), (2, 2, 2))
  self.assertEqual(v1.get_shape(), (2, 2, 2))

  # Split along axis 0: each of the first two parts is (1, 2, 2).
  v0_parts = list(v0)
  self.assertEqual(v0_parts[0].get_shape(), (1, 2, 2))
  self.assertEqual(v0_parts[1].get_shape(), (1, 2, 2))

  # Split along axis 1: each of the first two parts is (2, 1, 2).
  v1_parts = list(v1)
  self.assertEqual(v1_parts[0].get_shape(), (2, 1, 2))
  self.assertEqual(v1_parts[1].get_shape(), (2, 1, 2))
def testInitFromNonInitializer(self):
  """Compares default-initialized and zeros-initialized partitioned variables."""
  with self.test_session() as sess:
    # Test various dtypes with zeros initializer as following:
    types = [
        dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.uint16, dtypes.int32,
        dtypes.int64, dtypes.bool
    ]

    # A distinct variable name per dtype keeps the variables apart.
    for index, dtype in enumerate(types):
      default_init_var = variable_scope.get_variable(
          name="x%d" % index,
          shape=(3, 4),
          dtype=dtype,
          partitioner=axis0_into2_partitioner)
      zeros_init_var = variable_scope.get_variable(
          name="y%d" % index,
          shape=(6, 4),
          dtype=dtype,
          partitioner=axis0_into2_partitioner,
          initializer=init_ops.zeros_initializer(dtype=dtype))

      variables_lib.global_variables_initializer().run()
      # After partitioning, each variable is evaluated as a list of parts.
      default_values = sess.run(list(default_init_var))
      zeros_values = sess.run(list(zeros_init_var))

      self.assertAllEqual(default_values, zeros_values)
def testReturnsExistingConcatenatedValueIfReuse(self):
  """With reuse enabled, `get_variable` returns the existing variable."""
  partitioned_scope = variable_scope.variable_scope(
      "scope0", partitioner=axis0_into2_partitioner)
  with partitioned_scope:
    first = variable_scope.get_variable("name0", shape=(3, 1, 1))
    # Switch the current scope to reuse mode, then request the same name.
    variable_scope.get_variable_scope().reuse_variables()
    second = variable_scope.get_variable("name0", shape=(3, 1, 1))
    self.assertEqual(first, second)
def testVarOpScopeReuseParam(self):
  """Checks variable names and name scopes when re-entering a scope."""
  with self.test_session():
    # First pass: variables are created under "outer".
    with variable_scope.variable_scope("outer") as outer:
      with variable_scope.variable_scope("tower", "default", []):
        tower_w = variable_scope.get_variable("w", [])
        self.assertEqual(tower_w.name, "outer/tower/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "outer/tower/scope2/")
      with variable_scope.variable_scope(None, "default", []):
        default_w = variable_scope.get_variable("w", [])
        self.assertEqual(default_w.name, "outer/default/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "outer/default/scope2/")

    # Second pass: re-enter the captured scope. Variable names keep the
    # "outer" prefix while name scopes are expected under "outer_1".
    with variable_scope.variable_scope(outer) as outer:
      with variable_scope.variable_scope("tower", "default", reuse=True):
        tower_w = variable_scope.get_variable("w", [])
        self.assertEqual(tower_w.name, "outer/tower/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "outer_1/tower/scope2/")
      outer.reuse_variables()
      with variable_scope.variable_scope(None, "default", []):
        default_w = variable_scope.get_variable("w", [])
        self.assertEqual(default_w.name, "outer/default/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "outer_1/default/scope2/")
def testGetGlobalVariables(self):
  """`scope.global_variables()` lists only variables created in the scope."""
  with self.test_session():
    # Created outside "foo"; the assertion below expects it to be absent.
    variable_scope.get_variable("a", [])
    with variable_scope.variable_scope("foo") as scope:
      variable_scope.get_variable("b", [])
      names_in_scope = [v.name for v in scope.global_variables()]
      self.assertEqual(names_in_scope, ["foo/b:0"])
def _GenerateTestInputs(self):
  """Generates seeded random inputs and their sharded variable values.

  Returns:
    A tuple `(weights, biases, hidden_acts, sharded_weights_v,
    sharded_biases_v)`. The first three are float32 numpy arrays; the last two
    are the evaluated partitions of the sharded weight and bias variables.
  """
  np.random.seed(0)
  # The order of the three draws determines the generated values.
  weights = np.random.randn(self._num_classes, self._dim).astype(np.float32)
  biases = np.random.randn(self._num_classes).astype(np.float32)
  hidden_acts = np.random.randn(self._batch_size,
                                self._dim).astype(np.float32)

  with ops.Graph().as_default() as g:

    def _sharded_variable(name, values):
      return variable_scope.get_variable(
          name,
          partitioner=partitioned_variables.fixed_size_partitioner(
              self._num_shards),
          initializer=constant_op.constant(values))

    sharded_weights = _sharded_variable("w", weights)
    sharded_biases = _sharded_variable("b", biases)

    with self.test_session(graph=g) as sess:
      variables.global_variables_initializer().run()
      fetches = [list(sharded_weights), list(sharded_biases)]
      sharded_weights_v, sharded_biases_v = sess.run(fetches)

  return weights, biases, hidden_acts, sharded_weights_v, sharded_biases_v
def testVarOpScope(self):
  """Checks variable names and name scopes for named and default scopes."""
  with self.test_session():
    with ops.name_scope("scope1"):
      with variable_scope.variable_scope("tower", "default", []):
        created = variable_scope.get_variable("w", [])
        self.assertEqual(created.name, "tower/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "scope1/tower/scope2/")
      # Entering "tower" again: creating "w" a second time must fail, and the
      # name scope is expected to be uniquified to "tower_1".
      with variable_scope.variable_scope("tower", "default", []):
        with self.assertRaises(ValueError):
          variable_scope.get_variable("w", [])
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "scope1/tower_1/scope2/")

    with ops.name_scope("scope2"):
      # With no explicit scope name the default name is used, then uniquified.
      with variable_scope.variable_scope(None, "default", []):
        created = variable_scope.get_variable("w", [])
        self.assertEqual(created.name, "default/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "scope2/default/scope2/")
      with variable_scope.variable_scope(None, "default", []):
        created = variable_scope.get_variable("w", [])
        self.assertEqual(created.name, "default_1/w:0")
        with ops.name_scope("scope2") as name_prefix:
          self.assertEqual(name_prefix, "scope2/default_1/scope2/")
def testTraining(self):
  """Tests a gradient descent step for a simple model."""
  initial_w = np.array([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=np.float32)
  initial_b = np.array([2, 3], dtype=np.float32)

  with self.test_session() as session:
    with self.test_scope():
      with variable_scope.variable_scope("ascope", use_resource=True):
        w = variable_scope.get_variable(
            "w",
            shape=[4, 2],
            dtype=dtypes.float32,
            initializer=init_ops.constant_initializer(initial_w))
        b = variable_scope.get_variable(
            "b",
            shape=[2],
            dtype=dtypes.float32,
            initializer=init_ops.constant_initializer(initial_b))

        x = array_ops.placeholder(dtypes.float32, shape=[1, 4])
        y = math_ops.matmul(x, w) + b
        loss = math_ops.reduce_sum(y)
        train = GradientDescentOptimizer(0.1).minimize(loss)

    session.run(variables.global_variables_initializer())
    feed = {x: np.array([[7, 3, 5, 9]], dtype=np.float32)}
    session.run(train, feed)
    vw, vb = session.run([w, b])

    # Expected values after one step with learning rate 0.1.
    expected_w = np.array(
        [[0.3, 1.3], [2.7, 3.7], [4.5, 5.5], [6.1, 7.1]], dtype=np.float32)
    expected_b = np.array([1.9, 2.9], dtype=np.float32)
    self.assertAllClose(expected_w, vw, rtol=1e-4)
    self.assertAllClose(expected_b, vb, rtol=1e-4)
def testRegisterSingleParamRegisteredInTuple(self):
  """Registering `x` when `(x, y)` already has a block keeps the old block."""
  x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
  y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))

  collection = layer_collection.LayerCollection()
  # Pre-populate a block keyed by the tuple, then register only `x`.
  collection.fisher_blocks = {(x, y): '1'}
  collection.register_block(x, 'foo')

  registered = set(collection.get_blocks())
  self.assertEqual(set(['1']), registered)
def __call__(self, x, h_prev, scope=None):
  """GRU cell.

  Args:
    x: input tensor; must have rank 2 with a known second dimension.
    h_prev: previous state; must have rank 2 and a second dimension equal to
      `self._cell_size`.
    scope: optional variable scope; defaults to the class name.

  Returns:
    A tuple `(new_h, new_h)`.

  Raises:
    ValueError: if the input size or the cell size is unknown, or if the cell
      size of `h_prev` does not match `self._cell_size`.
  """
  with vs.variable_scope(scope or type(self).__name__):
    input_size = x.get_shape().with_rank(2)[1]

    # Indexing a shape may yield either a plain int/None or a `Dimension`
    # wrapper, depending on the TensorFlow version. Unwrap before the `None`
    # test so that an unknown size is detected in both cases.
    if getattr(input_size, "value", input_size) is None:
      raise ValueError("Expecting input_size to be set.")

    # Check cell_size == state_size from h_prev. The `None` check comes
    # first: placed after the comparison it could never be reached.
    cell_size = h_prev.get_shape().with_rank(2)[1]
    if getattr(cell_size, "value", cell_size) is None:
      raise ValueError("cell_size from `h_prev` should not be None.")
    if cell_size != self._cell_size:
      raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
                       (self._cell_size, cell_size))

    w_ru = vs.get_variable("w_ru", [input_size + self._cell_size,
                                    self._cell_size * 2])
    b_ru = vs.get_variable(
        "b_ru", [self._cell_size * 2],
        initializer=init_ops.constant_initializer(1.0))
    w_c = vs.get_variable("w_c",
                          [input_size + self._cell_size, self._cell_size])
    b_c = vs.get_variable(
        "b_c", [self._cell_size],
        initializer=init_ops.constant_initializer(0.0))

    _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
    # Only the last of the four op outputs (the new state) is used.
    _, _, _, new_h = _gru_block_cell(
        x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c)

    return new_h, new_h
def _annotated_graph(self):
  """Builds a three-layer conv graph whose ops carry `_recompute_hint` attrs.

  Returns:
    A tuple `(graph, init_op, train_op)`.
  """
  graph = ops.Graph()
  with graph.as_default():
    random_seed.set_random_seed(2)
    activation = variable_scope.get_variable(
        name='start', shape=[1, 2, 2, 5])
    conv_filter = variable_scope.get_variable(
        name='filter', shape=[5, 5, 5, 5])

    def _set_hint(tensor, flag):
      # Tags the op producing `tensor` with the given hint value.
      tensor.op._set_attr(
          '_recompute_hint', attr_value_pb2.AttrValue(i=flag))

    for layer_number in range(3):
      with variable_scope.variable_scope('layer_{}'.format(layer_number)):
        after_conv = nn.conv2d(activation, conv_filter, [1, 1, 1, 1], 'SAME')
        activation = 2. * after_conv
        # The value of the attribute does not matter; just that the key
        # exists in the op's attributes.
        _set_hint(activation, 1)
        activation += 5.
        _set_hint(activation, 0)
        activation = nn.relu(activation)
        _set_hint(activation, 1)

    loss = math_ops.reduce_mean(activation)
    train_op = train.AdamOptimizer(0.001).minimize(loss)
    init_op = variables.global_variables_initializer()
  return graph, init_op, train_op
def testPartitionConcatenatesAlongCorrectAxis(self):
  """Checks partition tensor shapes for splits along axis 0 and axis 1."""

  def _part_axis_0(**unused_kwargs):
    return (2, 1, 1)

  def _part_axis_1(**unused_kwargs):
    return (1, 2, 1)

  with variable_scope.variable_scope("root"):
    v0 = variable_scope.get_variable(
        "n0", shape=(2, 2, 2), partitioner=_part_axis_0)
    v1 = variable_scope.get_variable(
        "n1", shape=(2, 2, 2), partitioner=_part_axis_1)

  self.assertEqual(v0.get_shape(), (2, 2, 2))
  self.assertEqual(v1.get_shape(), (2, 2, 2))

  def _partition_shape(tensor_name):
    # Looks the partition up by tensor name in the default graph.
    return ops.get_default_graph().get_tensor_by_name(tensor_name).get_shape()

  # "n0" is split along axis 0.
  self.assertEqual(_partition_shape("root/n0/part_0:0"), (1, 2, 2))
  self.assertEqual(_partition_shape("root/n0/part_1:0"), (1, 2, 2))

  # "n1" is split along axis 1.
  self.assertEqual(_partition_shape("root/n1/part_0:0"), (2, 1, 2))
  self.assertEqual(_partition_shape("root/n1/part_1:0"), (2, 1, 2))
def testErrorConditions(self):
  """Checks the error types raised for invalid warm-start arguments."""

  def _partitioned_x():
    return variable_scope.get_variable(
        "x",
        shape=[4, 1],
        initializer=ones(),
        partitioner=lambda shape, dtype: [2, 1])

  self.assertRaises(ValueError, ws_util._WarmStartSettings, None)
  x = _partitioned_x()

  # List of PartitionedVariable is invalid type when warmstarting with vocab.
  self.assertRaises(TypeError, ws_util._warmstart_var_with_vocab, [x], "/tmp",
                    5, "/tmp", "/tmp")

  # Keys of type other than FeatureColumn.
  self.assertRaises(TypeError, ws_util._warmstart, {"StringType": x},
                    ws_util._WarmStartSettings("/tmp"))

  # Unused variable names raises ValueError.
  with ops.Graph().as_default():
    with self.test_session() as sess:
      x = _partitioned_x()
      self._write_checkpoint(sess)

    unused_vocab_settings = ws_util._WarmStartSettings(
        self.get_temp_dir(),
        var_name_to_vocab_info={"y": ws_util._VocabInfo("", 1, 0, "")})
    self.assertRaises(ValueError, ws_util._warmstart, unused_vocab_settings)

    unused_name_settings = ws_util._WarmStartSettings(
        self.get_temp_dir(), var_name_to_prev_var_name={"y": "y2"})
    self.assertRaises(ValueError, ws_util._warmstart, unused_name_settings)
def build(self, input_shape):
  """Creates the `kernel` and, if `use_bias` is set, the `bias` variable.

  Args:
    input_shape: shape of the layer input; must have `self.rank + 2` entries
      and a defined entry on the channel axis.

  Raises:
    ValueError: if `input_shape` has the wrong rank or its channel dimension
      is `None`.
  """
  expected_rank = self.rank + 2
  if len(input_shape) != expected_rank:
    # Build one message string. A stray comma used to turn the shape into a
    # second exception argument and left no separator after the rank.
    raise ValueError('Inputs should have rank ' + str(expected_rank) +
                     '. Received input shape: ' + str(input_shape))

  # The channel axis is 1 for 'channels_first' and the last axis otherwise.
  if self.data_format == 'channels_first':
    channel_axis = 1
  else:
    channel_axis = -1
  if input_shape[channel_axis] is None:
    raise ValueError('The channel dimension of the inputs '
                     'should be defined. Found `None`.')
  input_dim = input_shape[channel_axis]
  kernel_shape = self.kernel_size + (input_dim, self.filters)

  self.kernel = vs.get_variable('kernel',
                                shape=kernel_shape,
                                initializer=self.kernel_initializer,
                                regularizer=self.kernel_regularizer,
                                trainable=True,
                                dtype=self.dtype)
  if self.use_bias:
    self.bias = vs.get_variable('bias',
                                shape=(self.filters,),
                                initializer=self.bias_initializer,
                                regularizer=self.bias_regularizer,
                                trainable=True,
                                dtype=self.dtype)
  else:
    self.bias = None
def testWarmStartMoreSettingsNoPartitioning(self):
  """Warm-starts a linear model with a regex, vocab remap and renamed var."""
  # Create old and new vocabs for sparse column "sc_vocab".
  prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                      "old_vocab")
  new_vocab_path = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")

  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    variable_scope.get_variable(
        "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
    sc_keys_weights = variable_scope.get_variable(
        "some_other_name", shape=[4, 1], initializer=rand())
    variable_scope.get_variable(
        "linear_model/sc_vocab/weights",
        initializer=[[0.5], [1.], [2.], [3.]])
    self._write_checkpoint(sess)
    prev_keys_val = sess.run(sc_keys_weights)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model(all_linear_cols,
                                             partitioner=None)
    vocab_info = ws_util._VocabInfo(
        new_vocab=sc_vocab.vocabulary_file,
        new_vocab_size=sc_vocab.vocabulary_size,
        num_oov_buckets=sc_vocab.num_oov_buckets,
        old_vocab=prev_vocab_path
    )
    sc_vocab_var_name = ws_util._infer_var_name(cols_to_vars[sc_vocab])
    sc_keys_var_name = ws_util._infer_var_name(cols_to_vars[sc_keys])
    ws_settings = ws_util._WarmStartSettings(
        self.get_temp_dir(),
        vars_to_warmstart=".*(sc_keys|sc_vocab).*",
        var_name_to_vocab_info={sc_vocab_var_name: vocab_info},
        var_name_to_prev_var_name={sc_keys_var_name: "some_other_name"})
    ws_util._warmstart(ws_settings)

    sess.run(variables.global_variables_initializer())
    # Verify weights were correctly warmstarted.  Var corresponding to
    # sc_hash should not be warm-started.  Var corresponding to sc_vocab
    # should be correctly warmstarted after vocab remapping.
    expected_values = {
        sc_keys: [prev_keys_val],
        sc_hash: [np.zeros([15, 1])],
        sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
    }
    self._assert_cols_to_vars(cols_to_vars, expected_values, sess)
def testRestoreOnAssign(self):
  """A variable attached after `restore()` is restored on the next run."""
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

  # Save a checkpoint holding var1 == 4 and var2 == 8.
  save_graph = ops.Graph()
  with save_graph.as_default(), self.test_session(save_graph):
    first = checkpointable.Checkpointable()
    first.var1 = variable_scope.get_variable(
        name="outside_var", initializer=0.)
    first.var2 = variable_scope.get_variable(
        name="blah", initializer=0.)
    self.evaluate(first.var1.assign(4.))
    self.evaluate(first.var2.assign(8.))
    saver = checkpointable_utils.CheckpointableSaver(first)
    save_path = saver.save(checkpoint_prefix)

  restore_graph = ops.Graph()
  with restore_graph.as_default(), self.test_session(restore_graph):
    second = checkpointable.Checkpointable()
    second.var2 = variable_scope.get_variable(
        name="blah", initializer=0.)
    status = checkpointable_utils.CheckpointableSaver(
        second).restore(save_path)
    # Created after restore() and not yet attached to `second`.
    recreated_var1 = variable_scope.get_variable(
        name="outside_var", initializer=0.)
    status.run_restore_ops()
    self.assertEqual(8., self.evaluate(second.var2))

    self.evaluate(recreated_var1.assign(-2.))
    self.assertEqual(-2., self.evaluate(recreated_var1))

    # Attaching the variable and running the restore ops again is expected
    # to bring back the saved value.
    second.var1 = recreated_var1
    status.run_restore_ops()
    self.assertEqual(4., self.evaluate(recreated_var1))
def batch_normalize(tensor_in,
                    epsilon=1e-5,
                    convnet=False,
                    decay=0.9,
                    scale_after_normalization=True):
  """Batch Normalization

  Args:
    tensor_in: input Tensor, 4D shape:
               [batch, in_height, in_width, in_depth].
    epsilon : A float number to avoid being divided by 0.
    convnet: Whether this is for convolutional net use. If this is True,
             moments will sum across axis [0, 1, 2]. Otherwise, only [0].
    decay: decay rate for exponential moving average.
    scale_after_normalization: Whether to scale after normalization.

  Returns:
    The result of `nn.batch_norm_with_global_normalization` on `tensor_in`.
  """
  shape = tensor_in.get_shape().as_list()

  with vs.variable_scope("batch_norm"):
    gamma = vs.get_variable(
        "gamma", [shape[-1]],
        initializer=init_ops.random_normal_initializer(1.0, 0.02))
    beta = vs.get_variable(
        "beta", [shape[-1]], initializer=init_ops.constant_initializer(0.0))
    ema = moving_averages.ExponentialMovingAverage(decay=decay)

    moment_axes = [0, 1, 2] if convnet else [0]
    assign_mean, assign_var = nn.moments(tensor_in, moment_axes)
    ema_assign_op = ema.apply([assign_mean, assign_var])
    ema_mean = ema.average(assign_mean)
    ema_var = ema.average(assign_var)

    def update_mean_var():
      """Internal function that updates mean and variance during training"""
      with ops.control_dependencies([ema_assign_op]):
        return (array_ops_.identity(assign_mean),
                array_ops_.identity(assign_var))

    def use_moving_averages():
      return (ema_mean, ema_var)

    # The branch is chosen by the contents of the "IS_TRAINING" collection.
    is_training = array_ops_.squeeze(ops.get_collection("IS_TRAINING"))
    mean, variance = control_flow_ops.cond(
        is_training, update_mean_var, use_moving_averages)

    return nn.batch_norm_with_global_normalization(
        tensor_in,
        mean,
        variance,
        beta,
        gamma,
        epsilon,
        scale_after_normalization=scale_after_normalization)
def testOptimizerInit(self):
  """Constructs a `KfacOptimizer` on a tiny fully connected model."""
  with ops.Graph().as_default():
    layers = lc.LayerCollection()

    inputs = array_ops.ones((2, 1)) * 2
    weights_val = np.ones((1, 1), dtype=np.float32) * 3.
    weights = variable_scope.get_variable(
        'w', initializer=array_ops.constant(weights_val))
    bias = variable_scope.get_variable(
        'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
    output = math_ops.matmul(inputs, weights) + bias

    layers.register_fully_connected((weights, bias), inputs, output)

    logits = math_ops.tanh(output)
    targets = array_ops.constant([[0.], [1.]])
    cross_entropy = nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=targets)
    output = math_ops.reduce_mean(cross_entropy)

    layers.register_categorical_predictive_distribution(logits)

    # Only construction is exercised; the optimizer object is not used.
    optimizer.KfacOptimizer(
        0.1,
        0.2,
        0.3,
        layers,
        momentum=0.5,
        momentum_type='regular')
def _project_input(self, inputs, c_prev, m_prev, with_c):
  """Fills in c_prev and m_prev with projected input, for input dimensions.

  Args:
    inputs: `None`, a single rank-2 tensor that is split evenly along axis 1
      (one piece per entry of `self._config.inputs`), or a tuple holding one
      tensor per entry.
    c_prev: mapping updated in place with the projected cell inputs (only
      when `with_c` is true).
    m_prev: mapping updated in place with the projected memory inputs.
    with_c: whether to also create and apply the `project_c_*` variables.

  Raises:
    ValueError: if `inputs` is a tuple whose length differs from the number
      of configured inputs.
  """
  conf = self._config
  if inputs is None:
    return

  if isinstance(inputs, tuple):
    # A tuple has no `get_shape()` or `dtype` of its own, so it must be
    # handled before any tensor-only attribute is touched. Previously this
    # path failed with AttributeError. No width check is applied to tuples.
    if not conf.inputs:
      return
    if len(conf.inputs) != len(inputs):
      raise ValueError("Expect inputs as a tuple of {} "
                       "tensors".format(len(conf.inputs)))
    input_splits = inputs
    dtype = inputs[0].dtype
  else:
    if not (inputs.get_shape().with_rank(2)[1].value > 0 and
            len(conf.inputs) > 0):
      return
    input_splits = array_ops.split(
        value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
    dtype = inputs.dtype

  # The width of the first piece is used for every projection matrix.
  input_sz = input_splits[0].get_shape().with_rank(2)[1].value

  for i, j in enumerate(conf.inputs):
    input_project_m = vs.get_variable(
        'project_m_{}'.format(j), [input_sz, conf.num_units], dtype=dtype)
    m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)

    if with_c:
      input_project_c = vs.get_variable(
          'project_c_{}'.format(j), [input_sz, conf.num_units], dtype=dtype)
      c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)
def weighted_moving_average(value,
                            decay,
                            weight,
                            truediv=True,
                            collections=None,
                            name=None):
  """Compute the weighted moving average of `value`.

  Conceptually, the weighted moving average is:
    `moving_average(value * weight) / moving_average(weight)`,
  where a moving average updates by the rule
    `new_value = decay * old_value + (1 - decay) * update`
  Internally, this Op keeps moving average variables of both `value * weight`
  and `weight`.

  Args:
    value: A numeric `Tensor`.
    decay: A float `Tensor` or float value.  The moving average decay.
    weight:  `Tensor` that keeps the current value of a weight.
      Shape should be able to multiply `value`.
    truediv:  Boolean, if `True`, dividing by `moving_average(weight)` is
      floating point division.  If `False`, use division implied by dtypes.
    collections:  List of graph collections keys to add the internal variables
      `value * weight` and `weight` to.
      Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
    name: Optional name of the returned operation.
      Defaults to "WeightedMovingAvg".

  Returns:
    An Operation that updates and returns the weighted moving average.
  """
  # Unlike assign_moving_average, the weighted moving average doesn't modify
  # user-visible variables. It is the ratio of two internal variables, which
  # are moving averages of the updates.  Thus, the signature of this function
  # is quite different than assign_moving_average.
  if collections is None:
    collections = [ops.GraphKeys.GLOBAL_VARIABLES]

  scope_values = [value, weight, decay]
  with variable_scope.variable_scope(name, "WeightedMovingAvg",
                                     scope_values) as scope:

    def _zero_variable(var_name, like):
      # Non-trainable variable initialized to zeros with `like`'s shape/dtype.
      return variable_scope.get_variable(
          var_name,
          initializer=init_ops.zeros_initializer(
              like.get_shape(), dtype=like.dtype),
          trainable=False,
          collections=collections)

    value_x_weight_var = _zero_variable("value_x_weight", value)
    weight_var = _zero_variable("weight", weight)

    numerator = assign_moving_average(
        value_x_weight_var, value * weight, decay, zero_debias=False)
    denominator = assign_moving_average(
        weight_var, weight, decay, zero_debias=False)

    divide = math_ops.truediv if truediv else math_ops.div
    return divide(numerator, denominator, name=scope.name)
def weighted_moving_average(value,
                            decay,
                            weight,
                            truediv=True,
                            collections=None,
                            name=None):
  """Compute the weighted moving average of `value`.

  Conceptually, the weighted moving average is:
    `moving_average(value * weight) / moving_average(weight)`,
  where a moving average updates by the rule
    `new_value = decay * old_value + (1 - decay) * update`
  Internally, this Op keeps moving average variables of both `value * weight`
  and `weight`.

  Args:
    value: A numeric `Tensor`.
    decay: A float `Tensor` or float value.  The moving average decay.
    weight:  `Tensor` that keeps the current value of a weight.
      Shape should be able to multiply `value`.
    truediv:  Boolean, if `True`, dividing by `moving_average(weight)` is
      floating point division.  If `False`, use division implied by dtypes.
    collections:  List of graph collections keys to add the internal variables
      `value * weight` and `weight` to.
      Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
    name: Optional name of the returned operation.
      Defaults to "WeightedMovingAvg".

  Returns:
    An Operation that updates and returns the weighted moving average.
  """
  # Unlike assign_moving_average, the weighted moving average doesn't modify
  # user-visible variables. It is the ratio of two internal variables, which
  # are moving averages of the updates.  Thus, the signature of this function
  # is quite different than assign_moving_average.
  if collections is None:
    collections = [ops.GraphKeys.GLOBAL_VARIABLES]

  scope_values = [value, weight, decay]
  with variable_scope.variable_scope(name, "WeightedMovingAvg",
                                     scope_values) as scope:

    def _zero_variable(var_name, like):
      # Non-trainable variable initialized to zeros with `like`'s shape/dtype.
      return variable_scope.get_variable(
          var_name,
          shape=like.get_shape(),
          dtype=like.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False,
          collections=collections)

    value_x_weight_var = _zero_variable("value_x_weight", value)
    weight_var = _zero_variable("weight", weight)

    numerator = assign_moving_average(
        value_x_weight_var, value * weight, decay, zero_debias=False)
    denominator = assign_moving_average(
        weight_var, weight, decay, zero_debias=False)

    divide = math_ops.truediv if truediv else math_ops.divide
    return divide(numerator, denominator, name=scope.name)
def variable_scoped_function():
  """Return the `get_variable` result for "dummy" (shape `[1]`, zeros)."""
  zeros = init_ops.zeros_initializer()
  dummy = variable_scope.get_variable("dummy", shape=[1], initializer=zeros)
  return dummy
def call(self, inputs):
  """Request the 2x2 zero-initialized 'my_call_var' and echo `inputs`.

  Args:
    inputs: Value handed back to the caller unchanged.

  Returns:
    `inputs`, untouched.
  """
  zeros = init_ops.zeros_initializer()
  variable_scope.get_variable('my_call_var', shape=[2, 2], initializer=zeros)
  return inputs
def build(self, input_shape):
  """Store the 2x2 zero-initialized variable 'my_var' on `self.my_var`.

  Args:
    input_shape: Unused by this implementation.
  """
  zeros = init_ops.zeros_initializer()
  my_var = variable_scope.get_variable(
      'my_var', shape=[2, 2], initializer=zeros)
  self.my_var = my_var
def legacy_convolution2d(x,
                         num_output_channels,
                         kernel_size,
                         activation_fn=None,
                         stride=(1, 1),
                         padding='SAME',
                         weight_init=initializers.xavier_initializer_conv2d(),
                         bias_init=standard_ops.zeros_initializer,
                         name=None,
                         weight_collections=(ops.GraphKeys.WEIGHTS,),
                         bias_collections=(ops.GraphKeys.BIASES,),
                         output_collections=(ops.GraphKeys.ACTIVATIONS,),
                         trainable=True,
                         weight_regularizer=None,
                         bias_regularizer=None):
  """Adds the parameters for a conv2d layer and returns the output.

  A neural network convolution layer is generally defined as
  `y = f(conv2d(w, x) + b)`, where `f` is given by `activation_fn`, `conv2d`
  is `tf.nn.conv2d` and `x` has shape `[batch, height, width, channels]`. The
  output of this op is of shape
  `[batch, out_height, out_width, num_output_channels]`, where `out_width` and
  `out_height` are determined by the `padding` argument. See `conv2D` for
  details.

  This op creates `w` and optionally `b`. Bias can be disabled by setting
  `bias_init` to `None`.

  The variable creation is compatible with `tf.variable_scope` and so can be
  reused with `tf.variable_scope` or `tf.make_template`.

  Most of the details of variable creation can be controlled by specifying the
  initializers (`weight_init` and `bias_init`) and which collections to place
  the created variables in (`weight_collections` and `bias_collections`). Both
  variables are always added to the `VARIABLES` collection as well.

  Per-layer regularization can be specified through `weight_regularizer` and
  `bias_regularizer`, which are passed to the weights and bias variables
  respectively.

  Args:
    x: A 4-D input `Tensor`.
    num_output_channels: The number of output channels (i.e. the size of the
      last dimension of the output).
    kernel_size: A length 2 `list` or `tuple` containing the kernel size.
    activation_fn: A function that requires a single Tensor that is applied as
      a non-linearity.
    stride: A length 2 `list` or `tuple` specifying the stride of the sliding
      window across the image.
    padding: A `string` from: "SAME", "VALID". The type of padding algorithm
      to use.
    weight_init: An optional initialization. If not specified, uses Xavier
      initialization (see `tf.learn.xavier_initializer`).
    bias_init: An initializer for the bias, defaults to 0. Set to `None` in
      order to disable bias.
    name: The name for this operation is used to name operations and to find
      variables. If specified it must be unique for this scope, otherwise a
      unique name starting with "convolution2d" will be created. See
      `tf.variable_op_scope` for details.
    weight_collections: List of graph collections to which weights are added.
    bias_collections: List of graph collections to which biases are added.
    output_collections: List of graph collections to which outputs are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    weight_regularizer: A regularizer like the result of `l1_regularizer` or
      `l2_regularizer`. Used for weights.
    bias_regularizer: A regularizer like the result of `l1_regularizer` or
      `l2_regularizer`. Used for biases.

  Returns:
    The result of applying a 2-D convolutional layer.

  Raises:
    ValueError: If `kernel_size` or `stride` are not length 2.
  """
  # TODO(ptucker) redirect to convolution2d, passing `weight_collections` and
  # `bias_collections` through `variables_collections` and adding the outputs
  # to `output_collections`.
  with variable_scope.variable_op_scope([x], name, 'convolution2d'):
    num_input_channels = x.get_shape().dims[3].value

    # Report the offending *length*: formatting the sequence itself with `%d`
    # raises TypeError and would mask the documented ValueError.
    if len(kernel_size) != 2:
      raise ValueError(
          'kernel_size must be length 2: %d ' % len(kernel_size))
    if len(stride) != 2:
      raise ValueError('stride must be length 2: %d' % len(stride))

    # conv2d expects a stride entry for the batch and channel dimensions too.
    stride = [1, stride[0], stride[1], 1]
    shape = [
        kernel_size[0], kernel_size[1], num_input_channels,
        num_output_channels
    ]
    dtype = x.dtype.base_dtype

    # Variables always land in VARIABLES in addition to the caller's choice.
    weight_collections = set(
        list(weight_collections or []) + [ops.GraphKeys.VARIABLES])
    w = variable_scope.get_variable('weights',
                                    shape=shape,
                                    dtype=dtype,
                                    initializer=weight_init,
                                    collections=weight_collections,
                                    regularizer=weight_regularizer,
                                    trainable=trainable)
    y = nn.conv2d(x, w, stride, padding)

    if bias_init is not None:
      bias_collections = set(
          list(bias_collections or []) + [ops.GraphKeys.VARIABLES])
      b = variable_scope.get_variable('bias',
                                      shape=[num_output_channels],
                                      dtype=dtype,
                                      initializer=bias_init,
                                      collections=bias_collections,
                                      regularizer=bias_regularizer,
                                      trainable=trainable)
      y = nn.bias_add(y, b)

    return _apply_activation(y, activation_fn, output_collections)
def legacy_fully_connected(x,
                           num_output_units,
                           activation_fn=None,
                           weight_init=initializers.xavier_initializer(),
                           bias_init=init_ops.zeros_initializer,
                           name=None,
                           weight_collections=(ops.GraphKeys.WEIGHTS,),
                           bias_collections=(ops.GraphKeys.BIASES,),
                           output_collections=(ops.GraphKeys.ACTIVATIONS,),
                           trainable=True,
                           weight_regularizer=None,
                           bias_regularizer=None):
  """Adds the parameters for a fully connected layer and returns the output.

  A fully connected layer is generally defined as a matrix multiply:
  `y = f(w * x + b)` where `f` is given by `activation_fn`. If
  `activation_fn` is `None`, the result of `y = w * x + b` is returned.

  If `x` has shape `[dim_0, dim_1, ..., dim_n]` with more than 2 dimensions
  (`n > 1`), the matrix multiply is repeated along the leading dimensions. The
  result `r` has shape `[dim_0, ..., dim_{n-1}, num_output_units]`, where
  `r[i_0, ..., i_{n-1}, k] = sum_j x[i_0, ..., i_{n-1}, j] * w[j, k]`. This is
  done by reshaping `x` to 2-D `[dim_0 * ... * dim_{n-1}, dim_n]` before the
  matrix multiply and reshaping the product back afterwards.

  This op creates `w` and optionally `b`. Bias (`b`) can be disabled by
  setting `bias_init` to `None`.

  The variable creation is compatible with `tf.variable_scope` and so can be
  reused with `tf.variable_scope` or `tf.make_template`.

  Most of the details of variable creation can be controlled by specifying the
  initializers (`weight_init` and `bias_init`) and in which collections to
  place the created variables (`weight_collections` and `bias_collections`;
  note that the variables are always added to the `VARIABLES` collection). The
  output of the layer can be placed in custom collections using
  `output_collections`. The collections arguments default to `WEIGHTS`,
  `BIASES` and `ACTIVATIONS`, respectively.

  A per layer regularization can be specified by setting `weight_regularizer`
  and `bias_regularizer`, which are applied to the weights and biases
  respectively, and whose output is added to the `REGULARIZATION_LOSSES`
  collection.

  Args:
    x: The input `Tensor`.
    num_output_units: The size of the output.
    activation_fn: A function that requires a single Tensor that is applied as
      a non-linearity. If None is used, do not apply any activation.
    weight_init: An optional weight initialization, defaults to
      `xavier_initializer`.
    bias_init: An initializer for the bias, defaults to 0. Set to `None` in
      order to disable bias.
    name: The name for this operation is used to name operations and to find
      variables. If specified it must be unique for this scope, otherwise a
      unique name starting with "fully_connected" will be created. See
      `tf.variable_op_scope` for details.
    weight_collections: List of graph collections to which weights are added.
    bias_collections: List of graph collections to which biases are added.
    output_collections: List of graph collections to which outputs are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    weight_regularizer: A regularizer like the result of `l1_regularizer` or
      `l2_regularizer`. Used for weights.
    bias_regularizer: A regularizer like the result of `l1_regularizer` or
      `l2_regularizer`. Used for biases.

  Returns:
    The output of the fully connected layer.

  Raises:
    ValueError: if x has rank less than 2 or if its last dimension is not set.
  """
  # TODO(ptucker) redirect to fully_connected, passing `weight_collections`
  # and `bias_collections` through `variables_collections` and adding the
  # outputs to `output_collections`.
  with variable_scope.variable_op_scope([x], name, 'fully_connected'):
    static_dims = x.get_shape().dims
    if static_dims is None:
      raise ValueError('dims of x must be known but is None')
    rank = len(static_dims)
    if rank < 2:
      raise ValueError('rank of x must be at least 2 not: %d' % rank)
    in_units = static_dims[-1].value
    if in_units is None:
      raise ValueError('last dimension of x must be known but is None')
    base_dtype = x.dtype.base_dtype
    needs_flatten = rank > 2

    # Variables always land in VARIABLES in addition to the caller's choice.
    kernel_collections = (
        set(weight_collections or []) | {ops.GraphKeys.VARIABLES})
    kernel = variable_scope.get_variable('weights',
                                         shape=[in_units, num_output_units],
                                         dtype=base_dtype,
                                         initializer=weight_init,
                                         collections=kernel_collections,
                                         regularizer=weight_regularizer,
                                         trainable=trainable)

    # Collapse every leading dimension so a single matmul covers all rows.
    if needs_flatten:
      flat_x = array_ops.reshape(x, [-1, in_units])
    else:
      flat_x = x
    y = standard_ops.matmul(flat_x, kernel)

    if bias_init is not None:
      offset_collections = (
          set(bias_collections or []) | {ops.GraphKeys.VARIABLES})
      offset = variable_scope.get_variable('bias',
                                           shape=[num_output_units],
                                           dtype=base_dtype,
                                           initializer=bias_init,
                                           collections=offset_collections,
                                           regularizer=bias_regularizer,
                                           trainable=trainable)
      y = nn.bias_add(y, offset)

    if needs_flatten:
      # Restore the leading dimensions dynamically, then re-attach whatever
      # is known statically about the shape.
      dynamic_dims = array_ops.unpack(array_ops.shape(x))[:-1]
      dynamic_dims.append(num_output_units)
      y = array_ops.reshape(y, array_ops.pack(dynamic_dims))
      y.set_shape(x.get_shape().as_list()[:-1] + [num_output_units])

    return _apply_activation(y, activation_fn, output_collections)
def _zero_debias(strategy, unbiased_var, value, decay):
  """Compute the delta required for a debiased Variable.

  All exponential moving averages initialized with Tensors are initialized to
  0, and therefore are biased to 0. Variables initialized to 0 and used as
  EMAs are similarly biased. This function creates the debias update amount
  according to a scale factor, as in (Kingma et al., 2015).

  To demonstrate the bias that results from 0-initialization, take an EMA that
  was initialized to `0` with decay `b`. After `t` timesteps of seeing the
  constant `c`, the variable has the following value:

  ```
    EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
        = c*(1 - b^t)
  ```

  To have the true value `c`, we would divide by the scale factor `1 - b^t`.

  In order to perform debiasing, we use two shadow variables. One keeps track
  of the biased estimate, and the other keeps track of the number of updates
  that have occurred.

  Args:
    strategy: `Strategy` used to create and update variables.
    unbiased_var: A Variable representing the current value of the unbiased
      EMA.
    value: A Tensor representing the most recent value.
    decay: A Tensor representing `1-decay` for the EMA.

  Returns:
    The result of `strategy.extended.update` applied to `unbiased_var`.
    Computing it also updates the two shadow variables ("biased" and
    "local_step") and assigns the debiased value to `unbiased_var`.

  References:
    Adam - A Method for Stochastic Optimization:
      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
  """
  # The shadow variables live in a scope named after the variable itself,
  # with the trailing ":0" stripped from its name.
  with variable_scope.variable_scope(unbiased_var.name[:-len(":0")],
                                     values=[unbiased_var, value, decay]):
    with ops.init_scope():
      biased_initializer = init_ops.zeros_initializer()
      local_step_initializer = init_ops.zeros_initializer()

    def _maybe_get_unique(name):
      """Get name for a unique variable, if not `reuse=True`."""
      if variable_scope.get_variable_scope().reuse:
        return name
      vs_vars = [
          x.op.name
          for x in variable_scope.get_variable_scope().global_variables()
      ]
      full_name = variable_scope.get_variable_scope().name + "/" + name
      if full_name not in vs_vars:
        return name
      # The plain name is taken: probe "_1", "_2", ... until one is free.
      idx = 1
      while full_name + ("_%d" % idx) in vs_vars:
        idx += 1
      return name + ("_%d" % idx)

    with strategy.extended.colocate_vars_with(unbiased_var):
      # Biased running estimate; same shape and dtype as the tracked variable.
      biased_var = variable_scope.get_variable(
          _maybe_get_unique("biased"),
          initializer=biased_initializer,
          shape=unbiased_var.get_shape(),
          dtype=unbiased_var.dtype,
          trainable=False)
      # Scalar count of updates applied so far, used as the exponent below.
      local_step = variable_scope.get_variable(
          _maybe_get_unique("local_step"),
          shape=[],
          dtype=unbiased_var.dtype,
          initializer=local_step_initializer,
          trainable=False)

  def update_fn(v, value, biased_var, local_step):
    """Advance both shadow variables and assign the debiased value to `v`."""
    update_biased = state_ops.assign_sub(biased_var,
                                         (biased_var - value) * decay)
    update_local_step = local_step.assign_add(1)

    # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
    bias_factor = 1 - math_ops.pow(1.0 - decay, update_local_step)
    return state_ops.assign(v, update_biased / bias_factor,
                            name=ops.get_name_scope() + "/")

  return strategy.extended.update(unbiased_var, update_fn,
                                  args=(value, biased_var, local_step))
def _linear(args, output_size, bias, bias_initializer=None,
            kernel_initializer=None):
  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch x n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.

  Returns:
    A 2D Tensor with shape [batch x output_size] equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

  Raises:
    ValueError: if some of the arguments has unspecified or wrong shape.
  """
  if args is None:
    raise ValueError("`args` must be specified")
  if nest.is_sequence(args):
    if not args:
      raise ValueError("`args` must be specified")
  else:
    args = [args]

  # Every argument must be 2-D with a statically known second dimension; the
  # kernel's first dimension is the sum of those widths.
  shapes = [arg.get_shape() for arg in args]
  total_arg_size = 0
  for shape in shapes:
    if shape.ndims != 2:
      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
    width = shape[1]
    if width.value is None:
      raise ValueError("linear expects shape[1] to be provided for shape %s, "
                       "but saw %s" % (shape, width))
    total_arg_size += width.value

  dtype = args[0].dtype

  # Now the computation.
  current_scope = vs.get_variable_scope()
  with vs.variable_scope(current_scope) as outer_scope:
    kernel = vs.get_variable(_WEIGHTS_VARIABLE_NAME,
                             [total_arg_size, output_size],
                             dtype=dtype,
                             initializer=kernel_initializer)
    if len(args) == 1:
      stacked = args[0]
    else:
      stacked = array_ops.concat(args, 1)
    projected = math_ops.matmul(stacked, kernel)
    if not bias:
      return projected

    # The bias is created with the partitioner cleared on its scope.
    with vs.variable_scope(outer_scope) as inner_scope:
      inner_scope.set_partitioner(None)
      if bias_initializer is None:
        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
      offsets = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size],
                                dtype=dtype,
                                initializer=bias_initializer)
    return nn_ops.bias_add(projected, offsets)
def optimize_loss(loss, global_step, learning_rate, optimizer,
                  gradient_noise_scale=None, clip_gradients=None,
                  moving_average_decay=0.9, learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training
      step.
    optimizer: string, class or optimizer instance, used as trainer.
      A string should be the name of an optimizer, like 'SGD', 'Adam',
      'Adagrad'; the full list is in the OPTIMIZER_CLS_NAMES constant.
      A class should be a sub-class of tf.Optimizer that implements
      `compute_gradients` and `apply_gradients` functions.
      An instance should be an instantiation of a tf.Optimizer sub-class and
      have `compute_gradients` and `apply_gradients` functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by
      this value.
    clip_gradients: float or None, clips gradients by this value.
    moving_average_decay: float or None, takes into account previous loss to
      make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
      Tensors, returns Tensor. Can be used to implement any learning rate
      decay functions. For example: tf.train.exponential_decay.
    variables: list of variables to optimize or None.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Track an exponential moving average of the loss and make the loss depend
  # on its update op.
  if moving_average_decay is not None:
    loss_ema = train.ExponentialMovingAverage(moving_average_decay,
                                              name="avg")
    loss_ema_update = loss_ema.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_ema.average(loss))
    loss = control_flow_ops.with_dependencies([loss_ema_update], loss)

  # Resolve the learning rate: a 0-d Tensor is used as is, a Python float is
  # wrapped in a non-trainable variable, anything else is rejected.
  is_scalar_tensor = (isinstance(learning_rate, ops.Tensor) and
                      len(learning_rate.get_shape()) == 0)
  if is_scalar_tensor:
    step_size = learning_rate
  elif isinstance(learning_rate, float):
    step_size = vs.get_variable(
        "learning_rate", [],
        trainable=False,
        initializer=init_ops.constant_initializer(learning_rate))
  else:
    raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
                     str(learning_rate))
  if learning_rate_decay_fn is not None:
    step_size = learning_rate_decay_fn(step_size, global_step)

  # Resolve the optimizer from a name, a class, or a ready-made instance.
  if isinstance(optimizer, six.string_types):
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError(
          "Optimizer name should be one of [%s], you provided %s." %
          (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
    optimizer_cls = OPTIMIZER_CLS_NAMES[optimizer]
    opt = optimizer_cls(learning_rate=step_size)
  elif isinstance(optimizer, type) and issubclass(optimizer,
                                                  optimizer_.Optimizer):
    opt = optimizer(learning_rate=step_size)
  elif isinstance(optimizer, optimizer_.Optimizer):
    opt = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string, "
                     "subclass of Optimizer or instance of "
                     "subclass of Optimizer. Got %s." % str(optimizer))

  # Fall back to every trainable variable when none were given.
  if variables is None:
    variables = vars_.trainable_variables()

  grads_and_vars = opt.compute_gradients(loss, variables)

  if gradient_noise_scale is not None:
    grads_and_vars = _add_scaled_noise_to_gradients(grads_and_vars,
                                                    gradient_noise_scale)

  if clip_gradients is not None:
    raw_grads, grad_vars = zip(*grads_and_vars)
    clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, clip_gradients)
    grads_and_vars = list(zip(clipped_grads, grad_vars))

  logging_ops.scalar_summary("loss", loss)

  # Histograms for each variable, its gradient and the gradient's norm.
  for grad, var in grads_and_vars:
    dense_grad = grad.values if isinstance(grad, ops.IndexedSlices) else grad
    if dense_grad is None:
      continue
    logging_ops.histogram_summary(var.name, var)
    logging_ops.histogram_summary(var.name + "/gradients", dense_grad)
    logging_ops.histogram_summary(var.name + "/gradient_norm",
                                  clip_ops.global_norm([dense_grad]))

  apply_op = opt.apply_gradients(grads_and_vars,
                                 global_step=global_step,
                                 name="train")
  # Fail loudly on a non-finite loss, and tie the returned tensor to the
  # gradient update.
  checked_loss = array_ops.check_numerics(loss, "Loss is inf or nan")
  return control_flow_ops.with_dependencies([apply_op], checked_loss)
def __call__(self, inputs, state, scope=None):
  """Run one step of MemoryLSTM.

  Args:
    inputs: input Tensor; must be rank 2 with a statically known second
      dimension.
    state: a 4-element sequence, unpacked as
      `(a, h_prev_summary, c_tape_prev, h_tape_prev)`. The two tapes are
      reshaped to `[-1, attn_length, num_units]` internally. The incoming `a`
      is not read; it is replaced by the first value returned from
      `self._attention`.
    scope: VariableScope for the created subgraph; defaults to the class
      name.

  Returns:
    A tuple `(h, new_state)`:
    - `h` is the cell output computed from the output gate and the new cell
      value `c`.
    - `new_state` is `(a, new_h_summary, new_c_tape, new_h_tape)`, where the
      tapes have their oldest entry dropped, the new `c` / `h` appended, and
      are flattened back to `[-1, attn_length * num_units]`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
  """
  # NOTE(review): the extent of the variable scope below was reconstructed;
  # confirm whether the tape bookkeeping is meant to sit inside it.
  (a, h_prev_summary, c_tape_prev, h_tape_prev) = state

  dtype = inputs.dtype
  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
  with vs.variable_scope(scope or type(self).__name__,
                         initializer=self._initializer):  # "LSTMCell"
    # One fused weight matrix covering all four gates.
    concat_w = rnn_cell._get_concat_variable(
        "W", [input_size.value + self._num_units, 4 * self._num_units],
        dtype, 1)

    b = vs.get_variable("Bias", shape=[4 * self._num_units],
                        initializer=array_ops.zeros_initializer, dtype=dtype)

    # reshape tape to 3D
    c_tape_prev = array_ops.reshape(c_tape_prev,
                                    [-1, self._attn_length, self._num_units])
    h_tape_prev = array_ops.reshape(h_tape_prev,
                                    [-1, self._attn_length, self._num_units])

    # The summaries returned here stand in for the previous cell / hidden
    # state in the gate equations below.
    a, new_c_summary, new_h_summary = self._attention(inputs, h_prev_summary,
                                                      c_tape_prev, h_tape_prev)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = array_ops.concat(1, [inputs, new_h_summary])
    lstm_matrix = tf.nn.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
    i, j, f, o = array_ops.split(1, 4, lstm_matrix)

    # Diagonal connections
    if self._use_peepholes:
      w_f_diag = vs.get_variable(
          "W_F_diag", shape=[self._num_units], dtype=dtype)
      w_i_diag = vs.get_variable(
          "W_I_diag", shape=[self._num_units], dtype=dtype)
      w_o_diag = vs.get_variable(
          "W_O_diag", shape=[self._num_units], dtype=dtype)

    if self._use_peepholes:
      c = (sigmoid(f + self._forget_bias + w_f_diag * new_c_summary) *
           new_c_summary +
           sigmoid(i + w_i_diag * new_c_summary) * self._activation(j))
    else:
      c = (sigmoid(f + self._forget_bias) * new_c_summary +
           sigmoid(i) * self._activation(j))

    if self._cell_clip is not None:
      c = tf.clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

    if self._use_peepholes:
      h = sigmoid(o + w_o_diag * c) * self._activation(c)
    else:
      h = sigmoid(o) * self._activation(c)

  # remove old value
  new_h_tape = array_ops.slice(h_tape_prev, [0, 1, 0], [-1, -1, -1])
  new_c_tape = array_ops.slice(c_tape_prev, [0, 1, 0], [-1, -1, -1])
  # append the new c and h to the tape
  new_c_tape = array_ops.concat(1, [new_c_tape, array_ops.expand_dims(c, 1)])
  new_h_tape = array_ops.concat(1, [new_h_tape, array_ops.expand_dims(h, 1)])
  # flatten the tape to 2D
  new_c_tape = array_ops.reshape(new_c_tape,
                                 [-1, self._attn_length * self._num_units])
  new_h_tape = array_ops.reshape(new_h_tape,
                                 [-1, self._attn_length * self._num_units])

  new_state = (a, new_h_summary, new_c_tape, new_h_tape)
  return h, new_state
def __call__(self, inputs, state, scope=None):
  """Run one step of LSTM.

  Args:
    inputs: input Tensor, 2D, batch x num_units.
    state: if `state_is_tuple` is False, this must be a state Tensor,
      `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
      tuple of state Tensors, both `2-D`, with column sizes `c_state` and
      `m_state`.
    scope: VariableScope for the created subgraph; defaults to "LSTMCell".

  Returns:
    A tuple containing:
    - A `2-D, [batch x output_dim]`, Tensor representing the output of the
      LSTM after reading `inputs` when previous state was `state`.
      Here output_dim is:
         num_proj if num_proj was set,
         num_units otherwise.
    - Tensor(s) representing the new state of LSTM after reading `inputs` when
      the previous state was `state`.  Same type and shape(s) as `state`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
  """
  # Without a projection, the output width is the number of units.
  num_proj = self._num_units if self._num_proj is None else self._num_proj

  if self._state_is_tuple:
    (c_prev, m_prev) = state
  else:
    # Concatenated state: the first `num_units` columns hold c, the next
    # `num_proj` columns hold m.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

  dtype = inputs.dtype
  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError(
        "Could not infer input size from inputs.get_shape()[-1]")
  with vs.variable_scope(scope or type(self).__name__,
                         initializer=self._initializer):  # "LSTMCell"
    # One fused weight matrix covering all four gates.
    concat_w = _get_concat_variable(
        "W", [input_size.value + num_proj, 4 * self._num_units],
        dtype, self._num_unit_shards)

    b = vs.get_variable("B", shape=[4 * self._num_units],
                        initializer=init_ops.zeros_initializer, dtype=dtype)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = array_ops.concat(1, [inputs, m_prev])
    lstm_matrix = nn_ops.bias_add(
        math_ops.matmul(cell_inputs, concat_w), b)
    i, j, f, o = array_ops.split(1, 4, lstm_matrix)

    # Diagonal connections
    if self._use_peepholes:
      w_f_diag = vs.get_variable("W_F_diag", shape=[self._num_units],
                                 dtype=dtype)
      w_i_diag = vs.get_variable("W_I_diag", shape=[self._num_units],
                                 dtype=dtype)
      w_o_diag = vs.get_variable("W_O_diag", shape=[self._num_units],
                                 dtype=dtype)

    if self._use_peepholes:
      c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
           sigmoid(i + w_i_diag * c_prev) * self._activation(j))
    else:
      c = (sigmoid(f + self._forget_bias) * c_prev +
           sigmoid(i) * self._activation(j))

    if self._cell_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
      # pylint: enable=invalid-unary-operand-type

    if self._use_peepholes:
      m = sigmoid(o + w_o_diag * c) * self._activation(c)
    else:
      m = sigmoid(o) * self._activation(c)

    # Optional projection of the output, itself optionally clipped.
    if self._num_proj is not None:
      concat_w_proj = _get_concat_variable(
          "W_P", [self._num_units, self._num_proj],
          dtype, self._num_proj_shards)

      m = math_ops.matmul(m, concat_w_proj)
      if self._proj_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
        # pylint: enable=invalid-unary-operand-type

  # Hand the state back in the same layout it was received in.
  new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
               else array_ops.concat(1, [c, m]))
  return m, new_state
def testInitFromPartitionVar(self):
  """Checks `init_from_checkpoint` with partitioned variables.

  Three cases are covered against the checkpoint written by
  `_create_partition_checkpoints`: a variable created with
  `min_slice_size=8 << 10`, a variable created with a different
  `min_slice_size` (so a different partition count), and a list of partition
  variables passed directly as the assignment target.
  """
  checkpoint_dir = self.get_temp_dir()
  with self.cached_session() as session:
    v1 = _create_partition_checkpoints(session, checkpoint_dir)

  # New graph and session.
  with ops.Graph().as_default() as g:
    with self.session(graph=g) as session:
      with variable_scope.variable_scope("some_scope"):
        my1 = variable_scope.get_variable(
            name="my1",
            shape=[100, 100],
            initializer=init_ops.zeros_initializer(),
            partitioner=partitioned_variables.min_max_variable_partitioner(
                max_partitions=5, axis=0, min_slice_size=8 << 10))
        my1_var_list = my1._get_variable_list()

      # Create another variable with different partitions than the variable in
      # the checkpoint.
      with variable_scope.variable_scope("some_other_scope"):
        my2 = variable_scope.get_variable(
            name="var1",
            shape=[100, 100],
            initializer=init_ops.zeros_initializer(),
            partitioner=partitioned_variables.min_max_variable_partitioner(
                max_partitions=5, axis=0, min_slice_size=16 << 10))
        my2_var_list = my2._get_variable_list()

      checkpoint_utils.init_from_checkpoint(
          checkpoint_dir, {
              "scope/var1": "some_scope/my1",
              "scope/": "some_other_scope/"
          })

      session.run(variables.global_variables_initializer())
      my1_values = session.run(my1_var_list)
      self.assertAllEqual(my1_values, v1)
      my2_values = session.run(my2_var_list)
      # Verify we created different number of partitions.
      # (`assertNotEquals` is a deprecated alias removed in Python 3.12.)
      self.assertNotEqual(len(my2_values), len(v1))
      # Verify the values were correctly initialized in spite of different
      # partitions.
      full_my2_values = np.concatenate(my2_values, axis=0)
      full_v1_values = np.concatenate(v1, axis=0)
      self.assertAllEqual(full_my2_values, full_v1_values)

  # New graph and session.
  with ops.Graph().as_default() as g:
    with self.session(graph=g) as session:
      with variable_scope.variable_scope("some_scope"):
        my1 = variable_scope.get_variable(
            name="my1",
            shape=[100, 100],
            initializer=init_ops.truncated_normal_initializer(0.5),
            partitioner=partitioned_variables.min_max_variable_partitioner(
                max_partitions=5, axis=0, min_slice_size=8 << 10))
        my1_var_list = my1._get_variable_list()

      # The assignment target may also be the list of partition variables.
      checkpoint_utils.init_from_checkpoint(
          checkpoint_dir, {
              "scope/var1": my1_var_list,
          })

      session.run(variables.global_variables_initializer())
      my1_values = session.run(my1_var_list)
      self.assertAllEqual(my1_values, v1)
def testResourceCountsAreCorrect(self):
  """Trains a two-layer model and asserts the report lists no transfers.

  After five training steps, and again after explicitly fetching `w1` and
  `b1`, the report is asserted to contain no host-to-device and no
  device-to-host event names.
  """

  def _const_init(values):
    """Constant initializer built from `values` as a float32 array."""
    return init_ops.constant_initializer(np.array(values, dtype=np.float32))

  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      with variable_scope.variable_scope("vs", use_resource=True):
        w1 = variable_scope.get_variable(
            "w1",
            shape=[4, 2],
            dtype=np.float32,
            initializer=_const_init([[1, 2], [3, 4], [5, 6], [7, 8]]))
        b1 = variable_scope.get_variable(
            "b1",
            shape=[2],
            dtype=np.float32,
            trainable=False,
            initializer=_const_init([2, 3]))
        w2 = variable_scope.get_variable(
            "w2",
            shape=[2, 2],
            dtype=np.float32,
            initializer=_const_init([[1, 2], [3, 4]]))
        b2 = variable_scope.get_variable(
            "b2",
            shape=[2],
            dtype=np.float32,
            trainable=False,
            initializer=_const_init([2, 3]))

      x = array_ops.placeholder(np.float32, shape=[1, 4])
      hidden = math_ops.matmul(x, w1) + b1
      y = math_ops.matmul(hidden, w2) + b2

      loss = math_ops.reduce_sum(y)
      optimizer = gradient_descent.GradientDescentOptimizer(0.1)
      train = optimizer.minimize(loss)

    report = tu.ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())

    report.reset()

    # Five training steps, alternating between two fixed input rows.
    first_row = [[7, 3, 5, 9]]
    second_row = [[1, 2, 3, 4]]
    for row in (first_row, second_row, first_row, second_row, first_row):
      sess.run([train, loss], {x: np.array(row, dtype=np.float32)})

    report.parse_log()
    report.assert_host_to_device_event_names([])
    report.assert_device_to_host_event_names([])

    # Explicitly fetch the first set of weights and biases
    sess.run([w1, b1])

    report.parse_log()
    report.assert_host_to_device_event_names([])
    report.assert_device_to_host_event_names([])
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import test
from tensorflow.python.tpu import tpu_feed


def create_test_xla_compile_context():
  """Build an `xla.XLACompileContext` with a fresh name and a no-op pivot.

  The name comes from `unique_name('computation')` on the default graph; the
  pivot is a `no_op` named `<computation_name>/pivot`.

  Returns:
    The newly constructed `xla.XLACompileContext`.
  """
  computation_name = ops.get_default_graph().unique_name('computation')
  pivot = control_flow_ops.no_op(name=computation_name + '/pivot')
  return xla.XLACompileContext(name=computation_name, pivot=pivot)


# Resource variable that is assigned to from inside a compile context below.
a = variable_scope.get_variable(name='variable_a',
                                use_resource=True,
                                initializer=1)

# Assign to the variable between Enter() and Exit() of a compile context.
context = create_test_xla_compile_context()
context.Enter()
a.assign(2)
context.Exit()


@def_function.function
def func():
  """Assign 2 to `a` inside a fresh compile context; return the assign result."""
  context = create_test_xla_compile_context()
  context.Enter()
  o = a.assign(2)
  context.Exit()
  return o
def function_with_create(trainable):
  """Creates a `variables.Variable` as a side effect, then returns "dummy".

  Args:
    trainable: Forwarded as the `trainable` flag of the side-effect variable.

  Returns:
    The `get_variable` result for "dummy" (shape `[1]`, zero-initialized).
  """
  # Created only for its side effect; the object itself is discarded.
  variables.Variable(0, trainable=trainable)
  zeros = init_ops.zeros_initializer()
  dummy = variable_scope.get_variable("dummy", shape=[1], initializer=zeros)
  return dummy
def initialize_graph(self, input_statistics=None):
  """Run the parent graph initialization, then create the scalar "prior".

  Args:
    input_statistics: Forwarded unchanged to the parent `initialize_graph`.
  """
  parent = super(StubTimeSeriesModel, self)
  parent.initialize_graph(input_statistics=input_statistics)
  zero_init = init_ops.constant_initializer(0.)
  self.prior_var = variable_scope.get_variable(
      "prior", shape=[], initializer=zero_init)
def test_no_variable_sharing(self):
  """Creates a non-trainable resource variable named "step_size"."""
  # float32 scalar initial value for the step size.
  initial_step_size = np.array(1e-5, np.float32)
  variable_scope.get_variable(
      name="step_size",
      initializer=initial_step_size,
      use_resource=True,
      trainable=False)
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  LARS_nu=None,
                  LARS_epsilon=1.0/16384.0,
                  loss_scale=1.0):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, a global
      clipping is applied to prevent the norm of the gradient from exceeding
      this value. Alternatively, a callable can be provided e.g.:
      adaptive_clipping. This callable takes a `list` of
      `(gradients, variables)` `tuple`s and returns the same thing with the
      gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    LARS_nu: If not None (and a float), LARS re-scaling will be applied
      https://arxiv.org/pdf/1708.03888.pdf with nu=LARS_nu
    LARS_epsilon: If either weight or gradient norm is zero, this will be
      returned as local LR
    loss_scale: float. When different from 1.0, gradients are computed on
      `loss_scale * loss` and then multiplied by `1.0 / loss_scale`, so the
      gradients passed on to the later steps are unscaled.

  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = contrib_framework.get_global_step()
  else:
    contrib_framework.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are run before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Learning rate variable, with possible decay.
    lr = None
    if learning_rate is not None:
      if (isinstance(learning_rate, ops.Tensor) and
          learning_rate.get_shape().ndims == 0):
        lr = learning_rate
      elif isinstance(learning_rate, float):
        if learning_rate < 0.0:
          # The value must be interpolated with `%`; passing it as a second
          # argument would leave the "%s" placeholder unformatted.
          raise ValueError("Invalid learning_rate %s." % learning_rate)
        lr = vs.get_variable(
            "learning_rate", [],
            trainable=False,
            initializer=init_ops.constant_initializer(learning_rate))
      else:
        raise ValueError("Learning rate should be 0d Tensor or float. "
                         "Got %s of type %s" % (str(learning_rate),
                                                str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate", "global_gradient_norm"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    if learning_rate is not None and learning_rate_decay_fn is not None:
      if global_step is None:
        raise ValueError("global_step is required for learning_rate_decay_fn.")
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if learning_rate is not None:
        opt = optimizer(lr)
      else:
        opt = optimizer()
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients (on the scaled loss when loss scaling is enabled).
    gradients = opt.compute_gradients(
        loss if loss_scale == 1.0 else loss_scale * loss,
        variables,
        colocate_gradients_with_ops=colocate_gradients_with_ops)

    # Undo the loss scaling on the gradients.
    if loss_scale != 1.0:
      gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

    # LARS gradient re-scaling
    if LARS_nu is not None and isinstance(LARS_nu, float):
      for idx, (g, v) in enumerate(gradients):
        if g is None:
          # No gradient for this variable: there is nothing to re-scale and a
          # norm cannot be taken of None.
          continue
        v_norm = linalg_ops.norm(tensor=v, ord=2)
        g_norm = linalg_ops.norm(tensor=g, ord=2)
        lars_local_lr = control_flow_ops.cond(
            pred=math_ops.logical_and(
                math_ops.not_equal(v_norm, array_ops.constant(0.0)),
                math_ops.not_equal(g_norm, array_ops.constant(0.0))),
            true_fn=lambda: LARS_nu * v_norm / g_norm,
            false_fn=lambda: LARS_epsilon)
        gradients[idx] = (math_ops.scalar_mul(lars_local_lr, g), v)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
      summary.scalar("global_norm/gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Optionally clip gradients by global norm.
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add scalar summary for loss.
    if "loss" in summaries:
      summary.scalar("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if grad_values is not None:
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, grad_values)
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))

    if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                       "gradient_norm" in summaries):
      summary.scalar("global_norm/clipped_gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
def _eunn_param(hidden_size, capacity=2, fft=False, comp=True):
    """Create the EUNN rotation parameters and the per-layer coefficient arrays.

    Variables `theta_*` (and `phi_*` / `omega` when `comp` is true) are created
    with a uniform initializer on [-pi, pi]. Their cosines and sines are
    arranged into one row of length `hidden_size` per layer.

    Args:
        hidden_size: Width of the hidden state; every coefficient row has this
            length.
        capacity: Number of layers. Ignored when `fft` is true, in which case
            it is replaced by `ceil(log2(hidden_size))`.
        fft: If true, use the FFT-style layout with a single flat `theta_0`
            (and `phi_0`) variable; otherwise use the tunable layout with
            separate `*_0` and `*_1` variables for alternating layers.
        comp: If true, build complex-valued coefficients and the `omega`
            phase variable; otherwise coefficients are real.

    Returns:
        A tuple `(diag_vec, off_vec, diag, capacity)` where `diag_vec` and
        `off_vec` are `TensorArray`s unstacked along the layer axis (cosine and
        sine coefficients respectively), `diag` is
        `complex(cos(omega), sin(omega))` when `comp` is true and `None`
        otherwise, and `capacity` is the number of layers actually used.
    """
    theta_phi_initializer = init_ops.random_uniform_initializer(-np.pi, np.pi)
    if fft:
        capacity = int(np.ceil(np.log2(hidden_size)))

        diag_list_0 = []
        off_list_0 = []

        # First pass: count how many rotation angles all layers need in total.
        varsize = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2 ** size)) * (2 ** (size - 1))
            extra_size = max(0, (hidden_size % (2 ** size)) - (2 ** (size - 1)))
            varsize += normal_size + extra_size

        params_theta = vs.get_variable("theta_0", [varsize],
                                       initializer=theta_phi_initializer)
        cos_theta = math_ops.cos(params_theta)
        sin_theta = math_ops.sin(params_theta)

        if comp:
            params_phi = vs.get_variable("phi_0", [varsize],
                                         initializer=theta_phi_initializer)
            cos_phi = math_ops.cos(params_phi)
            sin_phi = math_ops.sin(params_phi)

            cos_list_0 = math_ops.complex(cos_theta,
                                          array_ops.zeros_like(cos_theta))
            cos_list_1 = math_ops.complex(math_ops.multiply(cos_theta, cos_phi),
                                          math_ops.multiply(cos_theta, sin_phi))
            sin_list_0 = math_ops.complex(sin_theta,
                                          array_ops.zeros_like(sin_theta))
            sin_list_1 = math_ops.complex(-math_ops.multiply(sin_theta, cos_phi),
                                          -math_ops.multiply(sin_theta, sin_phi))

        # Second pass: slice each layer's angles out of the flat variable.
        # `last` is the running offset into it.
        last = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2 ** size)) * (2 ** (size - 1))
            extra_size = max(0, (hidden_size % (2 ** size)) - (2 ** (size - 1)))
            # Number of entries not covered by the normal and extra pairs;
            # they get cos = 1 and sin = 0.
            pad_size = hidden_size - 2*normal_size - 2*extra_size

            if comp:
                cos_list_normal = array_ops.concat(
                    [array_ops.slice(cos_list_0, [last], [normal_size]),
                     array_ops.slice(cos_list_1, [last], [normal_size])], 0)
                sin_list_normal = array_ops.concat(
                    [array_ops.slice(sin_list_0, [last], [normal_size]),
                     -array_ops.slice(sin_list_1, [last], [normal_size])], 0)
                last += normal_size

                cos_list_extra = array_ops.concat(
                    [array_ops.slice(cos_list_0, [last], [extra_size]),
                     math_ops.complex(tf.ones([pad_size]), tf.zeros([pad_size])),
                     array_ops.slice(cos_list_1, [last], [extra_size])], 0)
                sin_list_extra = array_ops.concat(
                    [array_ops.slice(sin_list_0, [last], [extra_size]),
                     math_ops.complex(tf.zeros([pad_size]), tf.zeros([pad_size])),
                     -array_ops.slice(sin_list_1, [last], [extra_size])], 0)
                last += extra_size

            else:
                cos_list_normal = array_ops.slice(cos_theta, [last], [normal_size])
                cos_list_normal = array_ops.concat(
                    [cos_list_normal, cos_list_normal], 0)
                cos_list_extra = array_ops.slice(
                    cos_theta, [last+normal_size], [extra_size])
                cos_list_extra = array_ops.concat(
                    [cos_list_extra, tf.ones([pad_size]), cos_list_extra], 0)

                sin_list_normal = array_ops.slice(sin_theta, [last], [normal_size])
                sin_list_normal = array_ops.concat(
                    [sin_list_normal, -sin_list_normal], 0)
                sin_list_extra = array_ops.slice(
                    sin_theta, [last+normal_size], [extra_size])
                sin_list_extra = array_ops.concat(
                    [sin_list_extra, tf.zeros([pad_size]), -sin_list_extra], 0)

                last += normal_size + extra_size

            if normal_size != 0:
                cos_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(cos_list_normal,
                                          [-1, 2*normal_size//(2**size)])),
                    [-1])
                sin_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(sin_list_normal,
                                          [-1, 2*normal_size//(2**size)])),
                    [-1])

            cos_list = array_ops.concat([cos_list_normal, cos_list_extra], 0)
            sin_list = array_ops.concat([sin_list_normal, sin_list_extra], 0)
            diag_list_0.append(cos_list)
            off_list_0.append(sin_list)

        diag_vec = array_ops.stack(diag_list_0, 0)
        off_vec = array_ops.stack(off_list_0, 0)

    else:
        # Layers alternate between the "_0" and "_1" parameter sets;
        # capacity_a counts the former and capacity_b the latter.
        capacity_b = capacity//2
        capacity_a = capacity - capacity_b

        hidden_size_a = hidden_size//2
        hidden_size_b = (hidden_size-1)//2

        params_theta_0 = vs.get_variable("theta_0", [capacity_a, hidden_size_a],
                                         initializer=theta_phi_initializer)
        cos_theta_0 = array_ops.reshape(math_ops.cos(params_theta_0),
                                        [capacity_a, -1, 1])
        sin_theta_0 = array_ops.reshape(math_ops.sin(params_theta_0),
                                        [capacity_a, -1, 1])

        params_theta_1 = vs.get_variable("theta_1", [capacity_b, hidden_size_b],
                                         initializer=theta_phi_initializer)
        cos_theta_1 = array_ops.reshape(math_ops.cos(params_theta_1),
                                        [capacity_b, -1, 1])
        sin_theta_1 = array_ops.reshape(math_ops.sin(params_theta_1),
                                        [capacity_b, -1, 1])

        if comp:
            params_phi_0 = vs.get_variable("phi_0", [capacity_a, hidden_size_a],
                                           initializer=theta_phi_initializer)
            cos_phi_0 = array_ops.reshape(math_ops.cos(params_phi_0),
                                          [capacity_a, -1, 1])
            sin_phi_0 = array_ops.reshape(math_ops.sin(params_phi_0),
                                          [capacity_a, -1, 1])

            cos_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_0, math_ops.multiply(cos_theta_0, cos_phi_0)], 2),
                [capacity_a, -1])
            cos_list_0_im = array_ops.reshape(
                array_ops.concat(
                    [array_ops.zeros_like(cos_theta_0),
                     math_ops.multiply(cos_theta_0, sin_phi_0)], 2),
                [capacity_a, -1])
            if hidden_size_a*2 != hidden_size:
                cos_list_0_re = array_ops.concat(
                    [cos_list_0_re, tf.ones([capacity_a, 1])], 1)
                cos_list_0_im = array_ops.concat(
                    [cos_list_0_im, tf.zeros([capacity_a, 1])], 1)
            cos_list_0 = math_ops.complex(cos_list_0_re, cos_list_0_im)

            sin_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_0, - math_ops.multiply(sin_theta_0, cos_phi_0)], 2),
                [capacity_a, -1])
            sin_list_0_im = array_ops.reshape(
                array_ops.concat(
                    [array_ops.zeros_like(sin_theta_0),
                     - math_ops.multiply(sin_theta_0, sin_phi_0)], 2),
                [capacity_a, -1])
            if hidden_size_a*2 != hidden_size:
                sin_list_0_re = array_ops.concat(
                    [sin_list_0_re, tf.zeros([capacity_a, 1])], 1)
                sin_list_0_im = array_ops.concat(
                    [sin_list_0_im, tf.zeros([capacity_a, 1])], 1)
            sin_list_0 = math_ops.complex(sin_list_0_re, sin_list_0_im)

            params_phi_1 = vs.get_variable("phi_1", [capacity_b, hidden_size_b],
                                           initializer=theta_phi_initializer)
            cos_phi_1 = array_ops.reshape(math_ops.cos(params_phi_1),
                                          [capacity_b, -1, 1])
            sin_phi_1 = array_ops.reshape(math_ops.sin(params_phi_1),
                                          [capacity_b, -1, 1])

            cos_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_1, math_ops.multiply(cos_theta_1, cos_phi_1)], 2),
                [capacity_b, -1])
            cos_list_1_re = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1_re], 1)
            cos_list_1_im = array_ops.reshape(
                array_ops.concat(
                    [array_ops.zeros_like(cos_theta_1),
                     math_ops.multiply(cos_theta_1, sin_phi_1)], 2),
                [capacity_b, -1])
            cos_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), cos_list_1_im], 1)
            if hidden_size_b*2 != hidden_size-1:
                cos_list_1_re = array_ops.concat(
                    [cos_list_1_re, tf.ones([capacity_b, 1])], 1)
                cos_list_1_im = array_ops.concat(
                    [cos_list_1_im, tf.zeros([capacity_b, 1])], 1)
            cos_list_1 = math_ops.complex(cos_list_1_re, cos_list_1_im)

            sin_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_1, -math_ops.multiply(sin_theta_1, cos_phi_1)], 2),
                [capacity_b, -1])
            sin_list_1_re = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_re], 1)
            sin_list_1_im = array_ops.reshape(
                array_ops.concat(
                    [array_ops.zeros_like(sin_theta_1),
                     -math_ops.multiply(sin_theta_1, sin_phi_1)], 2),
                [capacity_b, -1])
            sin_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_im], 1)
            if hidden_size_b*2 != hidden_size-1:
                sin_list_1_re = array_ops.concat(
                    [sin_list_1_re, tf.zeros([capacity_b, 1])], 1)
                sin_list_1_im = array_ops.concat(
                    [sin_list_1_im, tf.zeros([capacity_b, 1])], 1)
            sin_list_1 = math_ops.complex(sin_list_1_re, sin_list_1_im)
        else:
            cos_list_0 = array_ops.reshape(
                array_ops.concat([cos_theta_0, cos_theta_0], 2), [capacity_a, -1])
            sin_list_0 = array_ops.reshape(
                array_ops.concat([sin_theta_0, -sin_theta_0], 2), [capacity_a, -1])
            if hidden_size_a*2 != hidden_size:
                cos_list_0 = array_ops.concat(
                    [cos_list_0, tf.ones([capacity_a, 1])], 1)
                sin_list_0 = array_ops.concat(
                    [sin_list_0, tf.zeros([capacity_a, 1])], 1)

            cos_list_1 = array_ops.reshape(
                array_ops.concat([cos_theta_1, cos_theta_1], 2), [capacity_b, -1])
            cos_list_1 = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1], 1)
            sin_list_1 = array_ops.reshape(
                array_ops.concat([sin_theta_1, -sin_theta_1], 2), [capacity_b, -1])
            sin_list_1 = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1], 1)
            if hidden_size_b*2 != hidden_size-1:
                # The unpaired trailing unit must pass through unchanged
                # (cos = 1, sin = 0), matching the complex branch and
                # cos_list_0 above. Padding the cosine with zeros would wipe
                # that unit instead.
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.ones([capacity_b, 1])], 1)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([capacity_b, 1])], 1)

        if capacity_b != capacity_a:
            # Odd capacity: add one dummy "_1" row so both sets have
            # capacity_a rows; it is sliced off again below.
            if comp:
                cos_list_1 = array_ops.concat(
                    [cos_list_1,
                     math_ops.complex(tf.zeros([1, hidden_size]),
                                      tf.zeros([1, hidden_size]))], 0)
                sin_list_1 = array_ops.concat(
                    [sin_list_1,
                     math_ops.complex(tf.zeros([1, hidden_size]),
                                      tf.zeros([1, hidden_size]))], 0)
            else:
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.zeros([1, hidden_size])], 0)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([1, hidden_size])], 0)

        # Concatenating along axis 1 and reshaping interleaves the rows of the
        # two sets: row 0 of "_0", row 0 of "_1", row 1 of "_0", ...
        diag_vec = tf.reshape(tf.concat([cos_list_0, cos_list_1], 1),
                              [capacity_a*2, hidden_size])
        off_vec = tf.reshape(tf.concat([sin_list_0, sin_list_1], 1),
                             [capacity_a*2, hidden_size])

        if capacity_b != capacity_a:
            diag_vec = tf.slice(diag_vec, [0, 0], [capacity, hidden_size])
            off_vec = tf.slice(off_vec, [0, 0], [capacity, hidden_size])

    def _toTensorArray(elems):
        # Unstack `elems` along axis 0 into a fixed-size TensorArray.
        elems = ops.convert_to_tensor(elems)
        n = array_ops.shape(elems)[0]
        elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
                                                dynamic_size=False,
                                                infer_shape=True,
                                                clear_after_read=False)
        elems_ta = elems_ta.unstack(elems)
        return elems_ta

    diag_vec = _toTensorArray(diag_vec)
    off_vec = _toTensorArray(off_vec)
    if comp:
        omega = vs.get_variable("omega", [hidden_size],
                                initializer=theta_phi_initializer)
        diag = math_ops.complex(math_ops.cos(omega), math_ops.sin(omega))
    else:
        diag = None

    return diag_vec, off_vec, diag, capacity
def internally_var_scoped_function(scope_name):
  """Returns the "dummy" variable obtained inside the scope `scope_name`.

  Args:
    scope_name: Passed to `variable_scope.variable_scope` to open the scope.

  Returns:
    The result of `variable_scope.get_variable` for "dummy" with shape `[1]`
    and a zeros initializer.
  """
  with variable_scope.variable_scope(scope_name):
    dummy = variable_scope.get_variable(
        "dummy", shape=[1], initializer=init_ops.zeros_initializer())
  return dummy
def __call__(self, inputs, state, scope=None):
  """Run one step of LSTM.

  In this variant the previous output `m_prev` is first passed through the
  "projection" linear map and then through the "antiprojection" linear map
  before being added to the input contribution of the gates.

  Args:
    inputs: input Tensor, 2D, batch x num_units.
    state: if `state_is_tuple` is False, this must be a state Tensor,
      `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
      tuple of state Tensors, both `2-D`, with column sizes `c_state` and
      `m_state`.
    scope: VariableScope for the created subgraph; defaults to "lstm_cell".

  Returns:
    A tuple containing:

    - A `2-D, [batch x output_dim]`, Tensor representing the output of the
      LSTM after reading `inputs` when previous state was `state`.
      Here output_dim is:
         num_proj if num_proj was set,
         num_units otherwise.
    - Tensor(s) representing the new state of LSTM after reading `inputs` when
      the previous state was `state`.  Same type and shape(s) as `state`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
  """
  if self._state_is_tuple:
    (c_prev, m_prev) = state
  else:
    # Concatenated state layout: [c | m], each slice `num_units` wide.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, self._num_units])

  dtype = inputs.dtype
  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError(
        "Could not infer input size from inputs.get_shape()[-1]")
  with _checked_scope(self, scope or "lstm_cell",
                      initializer=self._initializer,
                      reuse=self._reuse) as unit_scope:
    if self._num_unit_shards is not None:
      unit_scope.set_partitioner(
          partitioned_variables.fixed_size_partitioner(
              self._num_unit_shards))
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    input_contributions = _linear([inputs], 4 * self._num_units, bias=True)
    # NOTE(review): `self._num_proj` is used unconditionally here, while the
    # output path below treats `None` as a valid value -- confirm that this
    # cell is only constructed with `num_proj` set.
    with tf.variable_scope('projection'):
      mprev_projected = _linear([m_prev], self._num_proj, bias=False)
    with tf.variable_scope('antiprojection'):
      mprev_contributions = _linear([mprev_projected], 4 * self._num_units, bias=False)
    lstm_matrix = input_contributions + mprev_contributions
    i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)

    # Diagonal connections
    if self._use_peepholes:
      # Re-enter the cell scope so the partitioner can be cleared for the
      # peephole weight vectors.
      with vs.variable_scope(unit_scope) as projection_scope:
        if self._num_unit_shards is not None:
          projection_scope.set_partitioner(None)
        w_f_diag = vs.get_variable("w_f_diag", shape=[self._num_units], dtype=dtype)
        w_i_diag = vs.get_variable("w_i_diag", shape=[self._num_units], dtype=dtype)
        w_o_diag = vs.get_variable("w_o_diag", shape=[self._num_units], dtype=dtype)

    # New cell state: forget-gated previous state plus input-gated candidate.
    if self._use_peepholes:
      c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
           sigmoid(i + w_i_diag * c_prev) * self._activation(j))
    else:
      c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
           self._activation(j))

    if self._cell_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
      # pylint: enable=invalid-unary-operand-type

    if self._use_peepholes:
      m = sigmoid(o + w_o_diag * c) * self._activation(c)
    else:
      m = sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
      # reuse=True re-opens the "projection" scope entered above; presumably
      # this makes `_linear` share the weights created for `mprev_projected`
      # -- verify against `_linear`.
      with vs.variable_scope("projection", reuse=True) as proj_scope:
        if self._num_proj_shards is not None:
          proj_scope.set_partitioner(
              partitioned_variables.fixed_size_partitioner(
                  self._num_proj_shards))
        out = _linear(m, self._num_proj, bias=False)

      if self._proj_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        out = clip_ops.clip_by_value(out, -self._proj_clip, self._proj_clip)
        # pylint: enable=invalid-unary-operand-type
    else:
      out = m

  # The stored state keeps the unprojected `m`; only the returned output is
  # projected.
  new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
               array_ops.concat([c, m], 1))
  return out, new_state
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
  """RNN decoder with pointer net for the sequence-to-sequence model.

  At every step the cell is run on a linear combination of the step input and
  the (all-zero) attention vector, and the new cell state is scored against
  every attention position:

    x = linear([input, attns])
    _, new_state = cell(x, prev_state)
    output = sum(v * tanh(W * attention_states + linear(new_state)))

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size]. Both
      attn_length and attn_size must be statically known.
    cell: rnn_cell.RNNCell defining the cell function and size.
    feed_prev: If True, every step after the first ignores its entry of
      `decoder_inputs` and is instead fed the softmax(previous output)-weighted
      sum of all `decoder_inputs` (with gradients stopped). This reshapes the
      stacked inputs to [-1, attn_length, input_size], so it requires
      len(decoder_inputs) == attn_length.
    dtype: The dtype to use for the initial attention vector
      (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "point_decoder".

  Returns:
    A tuple `(outputs, states, inps)`:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x attn_length] holding the unnormalized pointer
        scores over the attention positions for each step.
      states: A list of length len(decoder_inputs) + 1 whose first item is
        `initial_state` and whose remaining items are the cell state after
        each step.
      inps: The list of inputs generated from previous outputs; empty unless
        `feed_prev` is True.

  Raises:
    ValueError: If `decoder_inputs` is empty, or if shape[1] or shape[2] of
      `attention_states` is not statically known.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  # Slice [1:3] so that BOTH attn_length and attn_size are validated; both are
  # used as static Python values below.
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())

  with vs.variable_scope(scope or "point_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    input_size = decoder_inputs[0].get_shape()[1].value
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])

    attention_vec_size = attn_size  # Size of query vectors for attention.
    k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    v = vs.get_variable("AttnV", [attention_vec_size])

    states = [initial_state]

    def attention(query):
      """Point on hidden using hidden_features and query."""
      with vs.variable_scope("Attention"):
        y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Unnormalized scores v^T * tanh(...); the softmax is applied by the
        # consumer (see the feed_prev branch below).
        s = math_ops.reduce_sum(
            v * math_ops.tanh(hidden_features + y), [2, 3])
        return s

    outputs = []
    batch_attn_size = array_ops.stack([batch_size, attn_size])
    attns = array_ops.zeros(batch_attn_size, dtype=dtype)
    attns.set_shape([None, attn_size])
    inps = []
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        vs.get_variable_scope().reuse_variables()

      if feed_prev and i > 0:
        # `output` is the score tensor produced by the previous iteration.
        inp = tf.stack(decoder_inputs)
        inp = tf.transpose(inp, perm=[1, 0, 2])
        inp = tf.reshape(inp, [-1, attn_length, input_size])
        inp = tf.reduce_sum(
            inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
        inp = tf.stop_gradient(inp)
        inps.append(inp)

      # Merge input and previous attentions into one vector of the right size.
      x = core_rnn_cell_impl._linear([inp, attns], cell.output_size, True)
      # Run the RNN. Only the new state is used; it is the attention query.
      _, new_state = cell(x, states[-1])
      states.append(new_state)
      # Run the attention mechanism.
      output = attention(new_state)

      outputs.append(output)

  return outputs, states, inps
def __call__(self, inputs, state, scope=None):
    """Run one step of the HM-GRU cell.

    Args:
        inputs: Tuple `(h_bottom, z_bottom, h_top_prev)` holding values that
            come from other layers.
        state: Tuple `(h_prev, z_prev)` from the previous time step of this
            layer.
        scope: Optional variable scope; defaults to the class name.

    Returns:
        A tuple `(h_new, HmGruStateTuple(h_new, z_new))`.
    """
    # Values from other layers.
    h_bottom, z_bottom, h_top_prev = inputs
    # Values from the previous time step of this layer.
    h_prev, z_prev = state

    # The GRU "z gate" is called the "o gate" here because z already means the
    # boundary indicator in HM-LSTM. The shared matrices produce r, o and the
    # single boundary logit; the candidate state g is computed separately
    # because its inputs have to be multiplied by r first.
    num_rows = 2 * self._num_units + 1

    recurrent_dim = h_prev.get_shape()[1]
    bottom_dim = h_bottom.get_shape()[1]

    with vs.variable_scope(scope or type(self).__name__):
        # Matrix U_l^l
        w_recurrent = vs.get_variable(
            "U_curr", [recurrent_dim, num_rows], dtype=tf.float32)
        # Matrix U_{l+1}^l
        # NOTE(review): this is sized from h_bottom but multiplied with
        # h_top_prev below -- confirm both always share the same width.
        w_top = vs.get_variable(
            "U_top", [bottom_dim, num_rows], dtype=tf.float32)
        # Matrix W_{l-1}^l
        w_bottom = vs.get_variable(
            "W_bottom", [bottom_dim, num_rows], dtype=tf.float32)
        # b_l
        gate_bias = vs.get_variable("bias", [num_rows], dtype=tf.float32)

        from_recurrent = tf.matmul(h_prev, w_recurrent)
        from_top = z_prev * tf.matmul(h_top_prev, w_top)
        from_bottom = z_bottom * tf.matmul(h_bottom, w_bottom)
        gate_logits = from_recurrent + from_top + from_bottom + gate_bias

        # Column layout of gate_logits: [r | o | boundary logit].
        reset_logits = tf.slice(gate_logits, [0, 0], [-1, self._num_units])
        output_logits = tf.slice(
            gate_logits, [0, self._num_units], [-1, self._num_units])
        boundary_logit = tf.slice(
            gate_logits, [0, 2 * self._num_units], [-1, 1])

        r = tf.sigmoid(reset_logits)
        o = tf.sigmoid(output_logits)

        # This is the stochastic neuron.
        # TODO: for slope annealing, set pass_through to True,
        # stochastic_tensor to False and supply slope_tensor.
        z_new = binary_wrapper(
            boundary_logit,
            pass_through=False,
            stochastic_tensor=tf.constant(True),
            slope_tensor=None)

        # Candidate state (c_tilda, called g here).
        # Matrix U_l^l (for just g)
        w_g_recurrent = vs.get_variable(
            "U_g_curr", [recurrent_dim, self._num_units], dtype=tf.float32)
        # Matrix U_{l+1}^l (for just g)
        w_g_top = vs.get_variable(
            "U_g_top", [bottom_dim, self._num_units], dtype=tf.float32)
        # Matrix W_{l-1}^l (for just g)
        w_g_bottom = vs.get_variable(
            "W_g_bottom", [bottom_dim, self._num_units], dtype=tf.float32)
        # b_l (for just g)
        g_bias = vs.get_variable(
            "bias_g", [self._num_units], dtype=tf.float32)

        g_from_recurrent = tf.matmul(r * h_prev, w_g_recurrent)
        g_from_top = z_prev * tf.matmul(r * h_top_prev, w_g_top)
        g_from_bottom = z_bottom * tf.matmul(r * h_bottom, w_g_bottom)
        g = tf.tanh(g_from_recurrent + g_from_top + g_from_bottom + g_bias)

        # Select one of COPY / UPDATE / FLUSH from the boundary indicators.
        no_prev_boundary = tf.equal(z_prev, tf.zeros_like(z_prev))
        no_bottom_boundary = tf.equal(z_bottom, tf.zeros_like(z_bottom))
        copy_mask = tf.to_float(
            tf.logical_and(no_prev_boundary, no_bottom_boundary))
        bottom_boundary = tf.cast(z_bottom, tf.bool)
        update_mask = tf.to_float(
            tf.logical_and(no_prev_boundary, bottom_boundary))
        flush_mask = z_prev

        # TODO put this behind a test flag
        # tf.assert_equal(tf.reduce_sum(copy_mask + update_mask + flush_mask),
        #                 tf.reduce_sum(tf.ones_like(flush_mask)))

        h_flush = o * g
        h_update = (tf.ones_like(o) - o) * h_prev + h_flush

        h_new = (copy_mask * h_prev + update_mask * h_update +
                 flush_mask * h_flush)

    return h_new, HmGruStateTuple(h_new, z_new)
def __call__(self, input_, state, scope=None):
    """Compute a single LSTM step.

    Args:
        input_: 2-D input Tensor; it is concatenated with the previous output
            along axis 1 and multiplied by a weight matrix with
            `self.input_size + num_proj` rows.
        state: 2-D state Tensor whose columns are the previous cell state
            (`num_units` wide) followed by the previous output (`num_proj`
            wide, or `num_units` when no projection is configured).
        scope: VariableScope for the created subgraph; defaults to the class
            name ("LSTMCell").

    Returns:
        A tuple `(m, new_state)`:
        - `m`: the step output, projected to `num_proj` columns when a
          projection is configured, `num_units` columns otherwise.
        - `new_state`: the new cell state and `m` concatenated along axis 1.
    """
    units = self._num_units
    has_projection = self._num_proj is not None
    num_proj = self._num_proj if has_projection else units
    peepholes = self._use_peepholes

    # Unpack the flat state into cell state and previous output.
    c_prev = array_ops.slice(state, [0, 0], [-1, units])
    m_prev = array_ops.slice(state, [0, units], [-1, num_proj])

    dtype = input_.dtype
    with vs.variable_scope(scope or type(self).__name__):  # "LSTMCell"
        weight_shards = _get_sharded_variable(
            "W", [self.input_size + num_proj, 4 * units],
            self._initializer, dtype, self._num_unit_shards)
        gate_bias = vs.get_variable(
            "B", shape=[4 * units],
            initializer=array_ops.zeros_initializer, dtype=dtype)

        # One fused matmul yields all four gate pre-activations:
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [input_, m_prev])
        lstm_matrix = nn_ops.bias_add(
            _matmul_with_sharded_variable(cell_inputs, weight_shards),
            gate_bias)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        if peepholes:
            # Diagonal (peephole) connections from the cell state.
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[units],
                initializer=self._initializer, dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[units],
                initializer=self._initializer, dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[units],
                initializer=self._initializer, dtype=dtype)
            forget_gate = sigmoid(f + 1 + w_f_diag * c_prev)
            kept = forget_gate * c_prev
            input_gate = sigmoid(i + w_i_diag * c_prev)
            c = kept + input_gate * tanh(j)
        else:
            # The constant 1 is added to the forget-gate pre-activation.
            c = sigmoid(f + 1) * c_prev + sigmoid(i) * tanh(j)

        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        if peepholes:
            m = sigmoid(o + w_o_diag * c) * tanh(c)
        else:
            m = sigmoid(o) * tanh(c)

        if has_projection:
            proj_shards = _get_sharded_variable(
                "W_P", [units, self._num_proj],
                self._initializer, dtype, self._num_proj_shards)
            m = _matmul_with_sharded_variable(m, proj_shards)

    return m, array_ops.concat(1, [c, m])
def decode_spectrum(encoded_spectrum, intensity_inputs, decoder_inputs_emb,
                    keep_conv, keep_dense, scope):
    """Build the decoding graph that combines an intensity model with an LSTM.

    An LSTM cell (stacked when `data_utils.num_layers > 1`, wrapped in
    dropout) is first fed `encoded_spectrum` from its zero state. Then, for
    every entry of `decoder_inputs_emb`, the matching entry of
    `intensity_inputs` is pushed through the intensity model, the LSTM is run
    for two steps (previous and current embedded input) starting from the
    spectrum state, and both results are merged into one logit tensor.

    Args:
        encoded_spectrum: Tensor used as the first LSTM input; its first
            dimension is taken as the batch size.
        intensity_inputs: sequence indexed in lockstep with
            `decoder_inputs_emb`; one intensity Tensor per decoding step.
        decoder_inputs_emb: sequence of embedded decoder inputs; each is
            multiplied by a `[embedding_size, num_units]` matrix.
        keep_conv: keep probability for dropout after the conv layers.
        keep_dense: keep probability for dropout after dense layers and on
            the LSTM cell's input and output.
        scope: variable scope under which all parameters are created.

    Returns:
        `(outputs, dense1_W_penalty)`: the list of per-step logits and the
        scaled L2 penalty on `dense1_weights`.

    NOTE(review): `dense1_W_penalty` and `intensity_output_projected` are only
    assigned on the joint-weight path (`data_utils.FLAGS.shared` false); with
    `shared` set the final `return` would hit an undefined name - confirm
    whether the shared-weight path is still supported.
    """
    #~ print("decode_spectrum()")
    single_cell = rnn_cell.BasicLSTMCell(num_units=data_utils.num_units,
                                         state_is_tuple=True)
    #~ single_cell = rnn_cell.BasicRNNCell(num_units=data_utils.num_units)
    #~ single_cell = rnn_cell.GRUCell(num_units=data_utils.num_units)
    if (data_utils.num_layers > 1):
        # The same cell object is repeated for every layer.
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [single_cell] * data_utils.num_layers)
    else:
        cell = single_cell
    cell = rnn_cell.DropoutWrapper(cell,
                                   input_keep_prob=keep_dense,
                                   output_keep_prob=keep_dense)

    with variable_scope.variable_scope(scope):
        # INTENSITY-Model Parameters
        # intensity input [128,27,2,10]
        #
        if (data_utils.FLAGS.shared):  # shared-weight
            dense1_input_size = data_utils.num_ion * data_utils.WINDOW_SIZE
            dense1_output_size = 1024
            #
            dense1_W = variable_scope.get_variable(
                name="dense1_W_0",
                shape=[dense1_input_size, dense1_output_size],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            dense1_B = variable_scope.get_variable(
                name="dense1_B_0",
                shape=[dense1_output_size],
                initializer=tf.constant_initializer(0.1))
            #
            dense_linear_W = variable_scope.get_variable(
                name="dense_linear_W", shape=[dense1_output_size, 1])
            #
            dense_linear_B = variable_scope.get_variable(
                name="dense_linear_B",
                shape=[1],
                initializer=tf.constant_initializer(0.1))
            #
        else:  # joint-weight
            # conv1: [128,8,20,26] >> [128,8,20,64] with kernel [1,3,26,64]
            conv1_weights = tf.get_variable(
                name="conv1_weights",
                shape=[1, 3, data_utils.vocab_size, 64],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            conv1_biases = tf.get_variable(
                name="conv1_biases",
                shape=[64],
                initializer=tf.constant_initializer(0.1))
            # conv2: [128,8,20,64] >> [128,8,20,64] with kernel [1,2,64,64]
            conv2_weights = tf.get_variable(
                name="conv2_weights",
                shape=[1, 2, 64, 64],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            conv2_biases = tf.get_variable(
                name="conv2_biases",
                shape=[64],
                initializer=tf.constant_initializer(0.1))
            # max_pool: [128,8,20,64] >> [128,8,10,64]
            # dense1: # 4D >> [128,512]
            # Flattened size after the stride-2 pooling along the window axis.
            dense1_input_size = data_utils.num_ion * (
                data_utils.WINDOW_SIZE // 2) * 64  # data_utils.vocab_size
            dense1_output_size = 512
            dense1_weights = tf.get_variable(
                "dense1_weights",
                shape=[dense1_input_size, dense1_output_size],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            dense1_biases = tf.get_variable(
                "dense1_biases",
                shape=[dense1_output_size],
                initializer=tf.constant_initializer(0.1))
            #
            # for testing
            # L2 penalty on the dense weights; returned to the caller.
            dense1_W_penalty = tf.mul(tf.nn.l2_loss(dense1_weights),
                                      data_utils.l2_loss_weight,
                                      name='dense1_W_penalty')
            # dense2: # [128,512] >> [128,512]
            #~ dense2_input_size = 512
            #~ dense2_output_size = 512
            #~ dense2_weights = tf.get_variable("dense2_weights",
            #~ shape=[dense2_input_size, dense2_output_size],
            #~ initializer=tf.uniform_unit_scaling_initializer(1.43))
            #~ dense2_biases = tf.get_variable("dense2_biases", shape=[dense2_output_size], initializer=tf.constant_initializer(0.1))
            # logit_linear: [128,512] >> [128,27]
            #~ linear_input_size = 512
            #~ linear_output_size = data_utils.vocab_size
            #~ linear_weights = tf.get_variable("linear_weights",
            #~ shape=[linear_input_size, linear_output_size])
            #~ linear_biases = tf.get_variable("linear_biases", shape=[linear_output_size], initializer=tf.constant_initializer(0.0))

        # LSTM-Intensity Connection-Model Parameters
        #
        #~ denseL_W = variable_scope.get_variable(name="denseL_W",shape=[data_utils.vocab_size,data_utils.vocab_size],
        #~ initializer=tf.uniform_unit_scaling_initializer(1.43))
        #~ denseI_W = variable_scope.get_variable(name="denseI_W",shape=[data_utils.vocab_size,data_utils.vocab_size],
        #~ initializer=tf.uniform_unit_scaling_initializer(1.43))
        #~ denseC_B = variable_scope.get_variable(name="denseC_B",shape=[data_utils.vocab_size],
        #~ initializer=tf.constant_initializer(0.1))
        # cat
        # Mixes the concatenated intensity and LSTM features (512 + 512).
        dense_concat_W = variable_scope.get_variable(
            name="dense_concat_W",
            shape=[512 + 512, 512],
            initializer=tf.uniform_unit_scaling_initializer(1.43))
        dense_concat_B = variable_scope.get_variable(
            name="dense_concat_B",
            shape=[512],
            initializer=tf.constant_initializer(0.1))

        # DECODING - SPECTRUM as Input 0
        with variable_scope.variable_scope("LSTM_cell"):
            #
            input0 = encoded_spectrum
            #
            batch_size = array_ops.shape(input0)[0]
            zero_state = cell.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)
            #
            #~ _, lstm_state = cell(inputs=input0,state=zero_state) # nobi
            # This first call creates the cell's weights; every later call
            # in the loop reuses them.
            _, lstm_state_0 = cell(inputs=input0, state=zero_state)  # nobi

        # DECODING - lstm_input_projected
        with variable_scope.variable_scope("LSTM_input_projected"):
            lstm_input_projected_W = variable_scope.get_variable(
                name="lstm_input_projected_W",
                shape=[data_utils.embedding_size, data_utils.num_units])
            #
            lstm_input_projected_B = variable_scope.get_variable(
                name="lstm_input_projected_B",
                shape=[data_utils.num_units],
                initializer=tf.constant_initializer(0.1))

        # DECODING LOOP
        # nobi
        outputs = []
        AA_1 = decoder_inputs_emb[0]  # padding [AA_1, AA_2, ?] with GO/EOS
        for i, AA_2 in enumerate(decoder_inputs_emb):
            # nobi
            # From the second step on, the `_linear` projections below must
            # reuse the variables created during the first step.
            if (i > 0):  # to-do-later: bring variable definitions out of the loop
                variable_scope.get_variable_scope().reuse_variables()

            # INTENSITY-Model
            candidate_intensity = intensity_inputs[i]  # [128,27,2,10]
            #
            if (data_utils.FLAGS.shared):  # shared-weight
                candidate_intensity_reshape = tf.reshape(
                    candidate_intensity,
                    shape=[-1, dense1_input_size])  # [128*27,2*10]
                #
                layer_dense1_input = candidate_intensity_reshape
                #
                layer_dense1 = tf.nn.relu(
                    tf.matmul(layer_dense1_input, dense1_W) +
                    dense1_B)  # [128*27,1024]
                #
                layer_dense1_drop = tf.nn.dropout(layer_dense1, keep_dense)
                #
                layer_dense1_output = tf.matmul(
                    layer_dense1_drop,
                    dense_linear_W) + dense_linear_B  # [128*27,1]
                #
                # Intensity output
                intensity_output = tf.reshape(
                    layer_dense1_output,
                    shape=[-1, data_utils.vocab_size])  # [128,27]
                #
            else:  # joint-weight
                # image_batch: [128,26,8,20] >> [128,8,20,26]
                # This is a bug, should be fixed at the input processing later.
                image_batch = tf.transpose(
                    candidate_intensity, perm=[0, 2, 3, 1])  # [128,8,20,26]
                # conv1: [128,8,20,26] >> [128,8,20,64] with kernel [1,3,26,64]
                conv1 = tf.nn.relu(
                    tf.nn.conv2d(image_batch,
                                 conv1_weights,
                                 strides=[1, 1, 1, 1],
                                 padding='SAME') + conv1_biases)
                # conv2: [128,8,20,64] >> [128,8,20,64] with kernel [1,2,64,64]
                conv2 = tf.nn.relu(
                    tf.nn.conv2d(conv1,
                                 conv2_weights,
                                 strides=[1, 1, 1, 1],
                                 padding='SAME') + conv2_biases)
                conv2 = tf.nn.max_pool(conv2,
                                       ksize=[1, 1, 3, 1],
                                       strides=[1, 1, 2, 1],
                                       padding='SAME')  # [128,8,10,64]
                conv2 = tf.nn.dropout(conv2, keep_conv)
                # dense1: 4D >> [128,512]
                dense1_input = tf.reshape(
                    conv2, [-1, dense1_input_size])  # 2D flatten
                dense1 = tf.nn.relu(
                    tf.matmul(dense1_input, dense1_weights) +
                    dense1_biases)  # [128,512]
                # dense2: # [128,512] >> [128,512]
                #~ dense2 = tf.nn.relu(tf.matmul(dense1, dense2_weights) + dense2_biases) # [128,512]
                #~ dropout1 = tf.nn.dropout(dense2, keep_dense, name="dropout1")
                dropout1 = tf.nn.dropout(dense1, keep_dense, name="dropout1")
                # logit_linear: [128,512] >> [128,27]
                #~ intensity_output = tf.add(tf.matmul(dropout1, linear_weights), linear_biases) # [128,27]
                intensity_output = dropout1
                # Logits used when only the intensity model is enabled.
                intensity_output_projected = rnn_cell._linear(
                    intensity_output,
                    data_utils.vocab_size,  # [128,27]
                    bias=True,
                    bias_start=0.1,
                    scope="intensity_output_projected")

            # nobi
            # LSTM-Model
            AA_1_projected = tf.matmul(
                AA_1, lstm_input_projected_W) + lstm_input_projected_B
            AA_2_projected = tf.matmul(
                AA_2, lstm_input_projected_W) + lstm_input_projected_B
            #
            with variable_scope.variable_scope("LSTM_cell"):
                #
                variable_scope.get_variable_scope().reuse_variables()
                #
                # Every step restarts from the spectrum state and replays the
                # previous and the current input.
                _, lstm_state_1 = cell(inputs=AA_1_projected,
                                       state=lstm_state_0)
                lstm_output, _ = cell(inputs=AA_2_projected,
                                      state=lstm_state_1)
                #
                AA_1 = AA_2
            #
            lstm_output_projected = rnn_cell._linear(
                lstm_output,
                data_utils.vocab_size,  # [128,27]
                bias=True,
                bias_start=0.1,
                scope="lstm_output_projected")

            # LSTM-Intensity Connection-Model >> OUTPUT
            #
            if (data_utils.FLAGS.use_intensity and data_utils.FLAGS.use_lstm):
                #
                #~ output_logit = tf.nn.relu(tf.matmul(lstm_output_projected,denseL_W) +
                #~ tf.matmul(intensity_output_projected,denseI_W) +
                #~ denseC_B)
                #
                # cat
                concat = tf.concat(concat_dim=1,
                                   values=[intensity_output, lstm_output])
                concat_dense = tf.nn.relu(
                    tf.matmul(concat, dense_concat_W) + dense_concat_B)
                concat_drop = tf.nn.dropout(concat_dense, keep_dense)
                #
                output_logit = rnn_cell._linear(
                    concat_drop,
                    data_utils.vocab_size,  # [128,27]
                    bias=True,
                    bias_start=0.1,
                    scope="concat_output_projected")
            #
            elif (data_utils.FLAGS.use_intensity):
                # intensity only (without LSTM >> up to 10% loss, especially at AA-accuracy?)
                output_logit = intensity_output_projected
            #
            elif (data_utils.FLAGS.use_lstm):
                output_logit = lstm_output_projected
            #
            else:
                # Neither sub-model enabled: abort the whole process.
                print("ERROR: wrong LSTM-Intensity model specified!")
                sys.exit()
            #
            outputs.append(output_logit)

    return (outputs, dense1_W_penalty)
def __call__(self, inputs, state, scope=None):
    """Run one step of a hierarchical-multiscale LSTM layer.

    Args:
        inputs: tuple `(h_bottom, z_bottom, h_top_prev)`: hidden state and
            boundary signal from the layer below, and the previous hidden
            state of the layer above. The hidden states are used as 2-D
            matmul operands.
        state: tuple `(c_prev, h_prev, z_prev)` from this layer's previous
            time step.
        scope: optional name for the variable scope; defaults to the class
            name ("HmLstmCell").

    Returns:
        `(h_new, HmLstmStateTuple(c_new, h_new, z_new))`.
    """
    # vars from different layers.
    h_bottom, z_bottom, h_top_prev = inputs
    # vars from the previous time step on the same layer
    c_prev, h_prev, z_prev = state

    # Need enough rows in the shared matrix for f, i, o, g, z_stochastic_tilda
    num_rows = 4 * self._num_units + 1

    with vs.variable_scope(scope or type(self).__name__):  # "HmLstmCell"
        # Matrix U_l^l
        U_curr = vs.get_variable(
            "U_curr", [h_prev.get_shape()[1], num_rows], dtype=tf.float32)
        # Matrix U_{l+1}^l
        # This matrix multiplies h_top_prev, so its row count must follow the
        # width of the layer above. It used to be sized from h_bottom, which
        # only worked when both neighbouring layers had the same width.
        U_top = vs.get_variable(
            "U_top", [h_top_prev.get_shape()[1], num_rows], dtype=tf.float32)
        # Matrix W_{l-1}^l
        W_bottom = vs.get_variable(
            "W_bottom", [h_bottom.get_shape()[1], num_rows], dtype=tf.float32)
        # b_l
        bias = vs.get_variable("bias", [num_rows], dtype=tf.float32)

        # Recurrent, top-down and bottom-up contributions; the latter two are
        # gated by the boundary signals z_prev and z_bottom.
        s_curr = tf.matmul(h_prev, U_curr)
        s_top = z_prev * tf.matmul(h_top_prev, U_top)
        s_bottom = z_bottom * tf.matmul(h_bottom, W_bottom)
        gate_logits = s_curr + s_top + s_bottom + bias

        # Column layout of gate_logits: [ f | i | o | g | boundary logit ].
        f_logits = tf.slice(gate_logits, [0, 0], [-1, self._num_units])
        i_logits = tf.slice(gate_logits, [0, self._num_units],
                            [-1, self._num_units])
        o_logits = tf.slice(gate_logits, [0, 2 * self._num_units],
                            [-1, self._num_units])
        g_logits = tf.slice(gate_logits, [0, 3 * self._num_units],
                            [-1, self._num_units])
        z_t_logit = tf.slice(gate_logits, [0, 4 * self._num_units], [-1, 1])

        f = tf.sigmoid(f_logits)
        i = tf.sigmoid(i_logits)
        o = tf.sigmoid(o_logits)
        g = tf.tanh(g_logits)

        # This is the stochastic neuron
        z_new = binary_wrapper(
            z_t_logit,
            pass_through=False,  # TODO make this true if you do slope annealing
            stochastic_tensor=tf.constant(True),  # TODO make this false if you do slope annealing
            slope_tensor=None)  # TODO set this if you do slope annealing

        # Select COPY / UPDATE / FLUSH from the two boundary signals:
        #   COPY   when z_prev == 0 and z_bottom == 0
        #   UPDATE when z_prev == 0 and z_bottom != 0
        #   FLUSH  weighted by z_prev itself
        z_zero_mask = tf.equal(z_prev, tf.zeros_like(z_prev))
        copy_mask = tf.to_float(
            tf.logical_and(z_zero_mask,
                           tf.equal(z_bottom, tf.zeros_like(z_bottom))))
        update_mask = tf.to_float(
            tf.logical_and(z_zero_mask, tf.cast(z_bottom, tf.bool)))
        flush_mask = z_prev

        # TODO put this behind a test flag
        # tf.assert_equal(tf.reduce_sum(copy_mask + update_mask + flush_mask),
        #                 tf.reduce_sum(tf.ones_like(flush_mask)))

        c_flush = i * g
        c_update = f * c_prev + c_flush
        c_new = (copy_mask * c_prev + update_mask * c_update +
                 flush_mask * c_flush)

        h_flush = o * tf.tanh(c_flush)
        h_update = o * tf.tanh(c_update)
        h_new = (copy_mask * h_prev + update_mask * h_update +
                 flush_mask * h_flush)

    state_new = HmLstmStateTuple(c_new, h_new, z_new)
    return h_new, state_new
def _get_variable(name, shape, initializer):
    """Fetch or create variable `name` with the file-level `dataType` dtype."""
    variable = variable_scope.get_variable(
        name,
        shape=shape,
        initializer=initializer,
        dtype=dataType,
    )
    return variable
def _create_attention_score_fn(name, num_units, attention_option, reuse,
                               dtype=dtypes.float32):
    """Build a function that scores keys against a query and pools values.

    Args:
        name: variable-scope name under which attention variables are created.
        num_units: hidden state dimension.
        attention_option: scoring flavour, either "luong" or "bahdanau".
            "bahdanau": additive (Bahdanau et al., ICLR'2015)
            "luong": multiplicative (Luong et al., EMNLP'2015)
        reuse: whether to reuse the variable scope.
        dtype: (default: `dtypes.float32`) data type of the created variables.

    Returns:
        attention_score_fn: callable `(query, keys, values) -> context_vector`.
    """
    with variable_scope.variable_scope(name, reuse=reuse):
        # Only the additive flavour owns trainable parameters.
        if attention_option == "bahdanau":
            query_w = variable_scope.get_variable(
                "attnW", [num_units, num_units], dtype=dtype)
            score_v = variable_scope.get_variable(
                "attnV", [num_units], dtype=dtype)

        def attention_score_fn(query, keys, values):
            """Return the attention-weighted sum of `values`.

            Args:
                query: A Tensor of shape [batch_size, num_units].
                keys: A Tensor of shape
                    [batch_size, attention_length, num_units].
                values: A Tensor of shape
                    [batch_size, attention_length, num_units].

            Returns:
                context_vector: A Tensor of shape [batch_size, num_units].

            Raises:
                ValueError: if attention_option is neither "luong" nor
                    "bahdanau".
            """
            if attention_option not in ("bahdanau", "luong"):
                raise ValueError(
                    "Unknown attention option %s!" % attention_option)

            additive = attention_option == "bahdanau"
            if additive:
                # Linearly transform the query before scoring.
                query = math_ops.matmul(query, query_w)

            # [batch_size, 1, num_units] so it broadcasts against the keys.
            query_3d = array_ops.reshape(query, [-1, 1, num_units])
            if additive:
                scores = _attn_add_fun(score_v, keys, query_3d)
            else:
                scores = _attn_mul_fun(keys, query_3d)

            # scores and weights: [batch_size, length]
            # TODO(thangluong): not normalize over padding positions.
            weights = nn_ops.softmax(scores)

            # Weighted sum over the attention_length axis.
            weights_3d = array_ops.expand_dims(weights, 2)
            context_vector = math_ops.reduce_sum(weights_3d * values, [1])
            context_vector.set_shape([None, num_units])
            return context_vector

        return attention_score_fn
def f():
    """Return variable 'v' (initialised to 1.0) multiplied by 2.0."""
    initial_value = constant_op.constant(1.0)
    v = variable_scope.get_variable('v', initializer=initial_value)
    doubled = v * constant_op.constant(2.0)
    return doubled
def encode_spectrum(encoder_inputs, intensity_inputs_forward,
                    intensity_inputs_backward, decoder_inputs_forward,
                    decoder_inputs_backward, keep_conv, keep_dense):
    """Encode the spectrum with a small CNN and hand it on to `embed_labels`.

    `encoder_inputs[0]` is reshaped to `[-1, 1, data_utils.MZ_SIZE, 1]`, run
    through two ReLU conv layers, one max-pool with dropout and one ReLU
    dense layer with dropout. The resulting 512-column tensor is passed,
    together with all other arguments unchanged, to `embed_labels`.

    Args:
        encoder_inputs: sequence whose first element is the spectrum tensor.
        intensity_inputs_forward: forwarded untouched to `embed_labels`.
        intensity_inputs_backward: forwarded untouched to `embed_labels`.
        decoder_inputs_forward: forwarded untouched to `embed_labels`.
        decoder_inputs_backward: forwarded untouched to `embed_labels`.
        keep_conv: dropout keep probability after the pooling layer.
        keep_dense: dropout keep probability after the dense layer.

    Returns:
        Whatever `embed_labels` returns.
    """
    #~ print("encode_spectrum()")
    unit_strides = [1, 1, 1, 1]
    scaled_init = tf.uniform_unit_scaling_initializer
    const_init = tf.constant_initializer

    with variable_scope.variable_scope("embedding_rnn_seq2seq"):
        # The spectrum becomes a 1 x MZ_SIZE single-channel "image".
        spectrum_image = tf.reshape(encoder_inputs[0],
                                    [-1, 1, data_utils.MZ_SIZE, 1])

        # ---- parameters (all created before any layer op) ----
        # conv1: 1x4 kernel, 1 -> 4 channels
        conv1_kernel = variable_scope.get_variable(
            name="conv1_W", shape=[1, 4, 1, 4], initializer=scaled_init(1.43))
        conv1_bias = variable_scope.get_variable(
            name="conv1_B", shape=[4], initializer=const_init(0.1))
        # conv2: 1x4 kernel, 4 -> 4 channels
        conv2_kernel = variable_scope.get_variable(
            name="conv2_W", shape=[1, 4, 4, 4], initializer=scaled_init(1.43))
        conv2_bias = variable_scope.get_variable(
            name="conv2_B", shape=[4], initializer=const_init(0.1))
        # dense1: flattened pool output (stride 4 along m/z, 4 channels)
        dense1_input_size = 1 * (data_utils.MZ_SIZE // (4)) * 4
        dense1_output_size = 512
        dense1_kernel = variable_scope.get_variable(
            name="dense1_W",
            shape=[dense1_input_size, dense1_output_size],
            initializer=scaled_init(1.43))
        dense1_bias = variable_scope.get_variable(
            name="dense1_B",
            shape=[dense1_output_size],
            initializer=const_init(0.1))

        # ---- layers ----
        conv1_out = tf.nn.relu(
            tf.nn.conv2d(spectrum_image, conv1_kernel,
                         strides=unit_strides, padding='SAME') + conv1_bias)
        conv2_out = tf.nn.relu(
            tf.nn.conv2d(conv1_out, conv2_kernel,
                         strides=unit_strides, padding='SAME') + conv2_bias)
        pooled = tf.nn.max_pool(conv2_out,
                                ksize=[1, 1, 6, 1],
                                strides=[1, 1, 4, 1],
                                padding='SAME')
        pooled_drop = tf.nn.dropout(pooled, keep_conv)

        flattened = tf.reshape(pooled_drop, [-1, dense1_input_size])
        dense_out = tf.nn.relu(
            tf.matmul(flattened, dense1_kernel) + dense1_bias)
        # SPECTRUM as Input 0
        encoded_spectrum = tf.nn.dropout(dense_out, keep_dense)

        # NOTE(review): this call is assumed to run inside the
        # "embedding_rnn_seq2seq" scope - confirm against the variable names
        # stored in existing checkpoints.
        return embed_labels(encoded_spectrum,
                            intensity_inputs_forward,
                            intensity_inputs_backward,
                            decoder_inputs_forward,
                            decoder_inputs_backward,
                            keep_conv,
                            keep_dense)