def testWeightSpecificSparsity(self):
  param_list = [
      "begin_pruning_step=1", "pruning_frequency=1", "end_pruning_step=100",
      "target_sparsity=0.5",
      "weight_sparsity_map=[layer1:0.6,layer2/weights:0.75,.*kernel:0.6]",
      "threshold_decay=0.0"
  ]
  test_spec = ",".join(param_list)
  pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

  with tf.variable_scope("layer1"):
    w1 = tf.Variable(tf.linspace(1.0, 100.0, 100), name="weights")
    _ = pruning.apply_mask(w1)
  with tf.variable_scope("layer2"):
    w2 = tf.Variable(tf.linspace(1.0, 100.0, 100), name="weights")
    _ = pruning.apply_mask(w2)
  with tf.variable_scope("layer3"):
    w3 = tf.Variable(tf.linspace(1.0, 100.0, 100), name="kernel")
    _ = pruning.apply_mask(w3)

  p = pruning.Pruning(pruning_hparams)
  mask_update_op = p.conditional_mask_update_op()
  increment_global_step = tf.assign_add(self.global_step, 1)

  with self.cached_session() as session:
    tf.global_variables_initializer().run()
    for _ in range(110):
      session.run(mask_update_op)
      session.run(increment_global_step)

    self.assertAllClose(
        session.run(pruning.get_weight_sparsity()), [0.6, 0.75, 0.6])
def test_step(self):
  """Tests grafting of Adam and SGD steps.

  Derivation of one step of Adam and SGD:
  Gradient value is [2, 4].

  Adam derivation:
    Lr_1 = 0.5 * (1 - 0.6)^0.5 / (1 - 0.5) = 0.63245553203 (does not matter)
    m_1 = 0.5 * G = [1, 2]
    v_1 = 0.4 * G^2 = [1.6, 6.4]
    AdamStep = Lr_1 * m_1 / (sqrt(v_1) + eps) = [0.5, 0.5]
    Normalized AdamStep = [1.0, 1.0]

  SGDStep = [0.6, 1.2]
  Norm = [0.6, 1.2]
  TotalStep = 0.9 * [0.6, 1.2]
  NewVar = [1.46, 1.92]
  """
  opt1 = tf.train.GradientDescentOptimizer(0.3)
  opt2 = tf.train.AdamOptimizer(0.5, beta1=0.5, beta2=0.6)
  opt = adagraft.AdaGraftOptimizer(0.9, opt1, opt2)

  with self.cached_session() as sess:
    var0 = tf.Variable(2.0, name="var0")
    var1 = tf.Variable(3.0, name="var1")
    loss = (var0 - 1) * (var0 - 1) + (var1 - 1) * (var1 - 1)
    o = opt.minimize(loss)
    self.evaluate(tf.global_variables_initializer())

    correct_values = [[1.058, 1.46, 1.92], [0.22387284, 1.2116001, 1.4232]]
    for i in range(2):
      sess.run(o)
      step_values = sess.run([loss, var0, var1])
      print(step_values)
      self.assertAllClose(correct_values[i], step_values)
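# NOTE: a standalone NumPy sketch (illustrative only, not part of the AdaGraft
# library) that reproduces the first-step numbers in the docstring above. With
# scalar variables, grafting keeps the SGD step's magnitude and the Adam step's
# sign, scaled by the 0.9 multiplier passed to AdaGraftOptimizer.
import numpy as np

g = np.array([2.0, 4.0])                    # d/dv of (v - 1)^2 at v = [2, 3]
sgd_step = 0.3 * g                          # [0.6, 1.2]
m, v = 0.5 * g, 0.4 * g**2                  # Adam moments after one update
lr_t = 0.5 * np.sqrt(1 - 0.6) / (1 - 0.5)   # bias-corrected Adam learning rate
adam_step = lr_t * m / np.sqrt(v)           # ~[0.5, 0.5]
step = 0.9 * np.abs(sgd_step) * np.sign(adam_step)
print(np.array([2.0, 3.0]) - step)          # -> [1.46, 1.92]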
def testConditionalMaskUpdate(self):
  param_list = [
      "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6",
      "nbins=100"
  ]
  test_spec = ",".join(param_list)
  pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
  weights = tf.Variable(tf.linspace(1.0, 100.0, 100), name="weights")
  masked_weights = pruning.apply_mask(weights)
  sparsity = tf.Variable(0.00, name="sparsity")

  # Set up pruning.
  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
  p._spec.threshold_decay = 0.0
  mask_update_op = p.conditional_mask_update_op()
  sparsity_val = tf.linspace(0.0, 0.9, 10)
  increment_global_step = tf.assign_add(self.global_step, 1)
  non_zero_count = []

  with self.cached_session() as session:
    tf.global_variables_initializer().run()
    for i in range(10):
      session.run(tf.assign(sparsity, sparsity_val[i]))
      session.run(mask_update_op)
      session.run(increment_global_step)
      non_zero_count.append(np.count_nonzero(masked_weights.eval()))

  # Weights are pruned at steps 0, 2, 4, and 6.
  expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
  self.assertAllEqual(expected_non_zero_count, non_zero_count)
def testBfloat16Reload(self):
  checkpoint_path = os.path.join(self.get_temp_dir(), "bfloat16_restore")

  # Create a resource variable of type tf.float32 and save it to disk.
  g_for_save_graph = tf.Graph()
  fl = 0.99
  with self.session(graph=g_for_save_graph) as sess:
    v0 = tf.Variable(fl, name="v0", dtype=tf.float32, use_resource=True)
    self.evaluate(tf.global_variables_initializer())
    self.assertAlmostEqual(fl, v0.eval())

    saver = tf.train.Saver({
        "v0": v0,
    }, restore_sequentially=True)
    val = saver.save(sess, checkpoint_path)
    self.assertEqual(checkpoint_path, val)

  # Restore the variable as bfloat16.
  g_for_restore_graph = tf.Graph()
  with self.session(graph=g_for_restore_graph) as sess:
    v0 = tf.Variable(0.0, name="v0", dtype=tf.bfloat16, use_resource=True)
    self.evaluate(tf.global_variables_initializer())
    self.assertAlmostEqual(0.0, v0.eval())
    saveable = bfloat16_variables.Bfloat16VariableSaveable(
        v0, tf.float32, "", "v0")
    saver = tf.train.Saver({"v0": saveable}, restore_sequentially=True)
    saver.restore(sess, checkpoint_path)
    self.assertAlmostEqual(fl, v0.eval(), places=2)
def _finish(self, update_ops, name_scope):
  with tf.control_dependencies(update_ops):
    ops1 = self.magnitude_optimizer._finish([], name_scope + "_m")  # pylint: disable=protected-access
    ops2 = self.direction_optimizer._finish([], name_scope + "_d")  # pylint: disable=protected-access

    if self.use_global_norm:  # Apply global grafting.
      with tf.control_dependencies([ops1, ops2]):
        m_global_norm = tf.Variable(0.)
        d_global_norm = tf.Variable(0.)
        for var in self._variables:
          m_step_norm = self.get_slot(var, "m_step_norm")
          d_step_norm = self.get_slot(var, "d_step_norm")
          tf.assign_add(m_global_norm, m_step_norm**2)
          tf.assign_add(d_global_norm, d_step_norm**2)

        multiplier = tf.sqrt(m_global_norm / tf.maximum(d_global_norm, 1e-30))

        step_ops = []
        for var in self._variables:
          d_step = self.get_slot(var, "scratch_copy")
          step = tf.where(
              tf.greater(d_step_norm, 0), multiplier * d_step,
              tf.zeros_like(d_step))
          step_op = tf.assign_add(var, self._learning_rate_tensor * step)
          step_ops.append(step_op)
        return tf.group(*step_ops, name=name_scope)

  return tf.group(*([ops1, ops2] + update_ops), name=name_scope)
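# NOTE: a minimal NumPy sketch (illustrative values, not the optimizer's API) of
# the global-grafting rule implemented by _finish above: every variable's
# direction step is rescaled by a single multiplier so that the combined norm of
# the direction steps matches the combined norm of the magnitude steps.
import numpy as np

m_steps = [np.array([0.6, 1.2]), np.array([0.3])]  # per-variable magnitude-optimizer steps
d_steps = [np.array([0.5, 0.5]), np.array([0.1])]  # per-variable direction-optimizer steps
m_norm_sq = sum(np.sum(s**2) for s in m_steps)
d_norm_sq = sum(np.sum(s**2) for s in d_steps)
multiplier = np.sqrt(m_norm_sq / max(d_norm_sq, 1e-30))
grafted_steps = [multiplier * d for d in d_steps]  # applied with the learning rate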
def testUpdateSingleMask(self):
  with self.cached_session() as session:
    weights = tf.Variable(tf.linspace(1.0, 100.0, 100), name="weights")
    masked_weights = pruning.apply_mask(weights)
    sparsity = tf.Variable(0.95, name="sparsity")
    p = pruning.Pruning(sparsity=sparsity)
    p._spec.threshold_decay = 0.0
    mask_update_op = p.mask_update_op()
    tf.global_variables_initializer().run()
    masked_weights_val = masked_weights.eval()
    self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
    session.run(mask_update_op)
    masked_weights_val = masked_weights.eval()
    self.assertAllEqual(np.count_nonzero(masked_weights_val), 5)
def testPerLayerBlockSparsity(self):
  param_list = [
      "block_dims_map=[layer1/weights:1x1,layer2/weights:1x2]",
      "block_pooling_function=AVG", "threshold_decay=0.0"
  ]
  test_spec = ",".join(param_list)
  pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

  with tf.variable_scope("layer1"):
    w1 = tf.constant([[-0.1, 0.1], [-0.2, 0.2]], name="weights")
    pruning.apply_mask(w1)
  with tf.variable_scope("layer2"):
    w2 = tf.constant([[0.1, 0.1, 0.3, 0.3], [0.2, 0.2, 0.4, 0.4]],
                     name="weights")
    pruning.apply_mask(w2)

  sparsity = tf.Variable(0.5, name="sparsity")
  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
  mask_update_op = p.mask_update_op()

  with self.cached_session() as session:
    tf.global_variables_initializer().run()
    session.run(mask_update_op)
    mask1_eval = session.run(pruning.get_masks()[0])
    mask2_eval = session.run(pruning.get_masks()[1])
    self.assertAllEqual(
        session.run(pruning.get_weight_sparsity()), [0.5, 0.5])
    self.assertAllEqual(mask1_eval, [[0.0, 0.0], [1., 1.]])
    self.assertAllEqual(mask2_eval, [[0, 0, 1., 1.], [0, 0, 1., 1.]])
def _blockMasking(self, hparams, weights, expected_mask):
  threshold = tf.Variable(0.0, name="threshold")
  sparsity = tf.Variable(0.5, name="sparsity")
  test_spec = ",".join(hparams)
  pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

  # Set up pruning.
  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
  with self.cached_session():
    tf.global_variables_initializer().run()
    _, new_mask = p._maybe_update_block_mask(weights, threshold)
    # Check that the mask is the same size as the weights.
    self.assertAllEqual(new_mask.get_shape(), weights.get_shape())
    mask_val = new_mask.eval()
    self.assertAllEqual(mask_val, expected_mask)
def setUp(self):
  super(PruningHParamsTest, self).setUp()
  # Add the global step variable to the graph.
  self.global_step = tf.train.get_or_create_global_step()
  # Add sparsity.
  self.sparsity = tf.Variable(0.5, name="sparsity")
  # Parse hparams.
  self.pruning_hparams = pruning.get_pruning_hparams().parse(
      self.TEST_HPARAMS)
def test_identity(self):
  # AdaGraft(1, opt, opt) should do the same thing as opt.
  opt1 = tf.train.AdamOptimizer(0.5, beta1=0.5, beta2=0.5)
  opt2 = tf.train.AdamOptimizer(0.5, beta1=0.5, beta2=0.5)
  opt3 = tf.train.AdamOptimizer(0.5, beta1=0.5, beta2=0.5)
  opt = adagraft.AdaGraftOptimizer(1.0, opt1, opt2)

  with self.cached_session() as sess:
    var0 = tf.Variable(2.0, name="var0")
    var1 = tf.Variable(3.0, name="var1")
    loss = (var0 - 1) * (var0 - 1) + (var1 - 1) * (var1 - 1)
    o = opt.minimize(loss)
    oo = opt3.minimize(loss)
    self.evaluate(tf.global_variables_initializer())

    sess.run(o)
    l1 = sess.run([loss, var0, var1])
    print(l1)

    sess.run([tf.assign(var0, 2.0), tf.assign(var1, 3.0)])
    sess.run(oo)
    l2 = sess.run([loss, var0, var1])
    print(l2)

    self.assertAllClose(l1, l2)
def testCreateMask2D(self):
  width = 10
  height = 20
  with self.cached_session():
    weights = tf.Variable(
        tf.random_normal([width, height], stddev=1), name="weights")
    masked_weights = pruning.apply_mask(weights, tf.get_variable_scope())
    tf.global_variables_initializer().run()
    weights_val = weights.eval()
    masked_weights_val = masked_weights.eval()
    self.assertAllEqual(weights_val, masked_weights_val)
def __init__(self, output_dim=768):
  # Counts the number of times the layer has been run with training=True.
  self.counter = tf.Variable(
      initial_value=0, dtype=tf.int32, name='counter', use_resource=True)
  self.output_dim = output_dim
  # "Reusable" SavedModel metadata expected by KerasLayer.
  self.variables = [self.counter]
  self.trainable_variables = []
  self.regularization_losses = []
def testKerasInterfaceLayer(self):
  layer = self.params.Instantiate()
  layer.dense = layer.AddVariable(
      tf.keras.layers.Dense(20), input_shape=(10, 20))
  layer.gru = layer.AddVariable(
      tf.keras.layers.GRU(
          256,
          return_sequences=True,
          return_state=True,
          recurrent_initializer="glorot_uniform"),
      input_shape=(10, 20, 30))
  layer.emb = layer.AddVariable(
      tf.keras.layers.Embedding(20, 30), input_shape=(10,))
  layer.emb2 = layer.AddVariable(
      tf.keras.layers.Embedding(20, 30),
      input_shape=(10,),
      keras_scope="boerenkaas")
  layer.var = layer.AddVariable(tf.Variable([[1.], [2.]]))
  layer.var2 = layer.AddVariable(tf.Variable([[1.], [2.]], name="foo"))

  self.assertSameElements(
      layer.activated_var_names,
      [
          "bias",  # From Dense.
          "embeddings",
          "boerenkaas/embeddings",
          "foo",
          "gru_cell/kernel",
          "gru_cell/recurrent_kernel",
          "gru_cell/bias",
          "kernel",  # From Dense.
          "Variable"
      ])

  # Verify that these work as intended:
  _ = layer.dense(tf.zeros([10, 20]))
  _ = layer.emb(tf.zeros([10]))
  _ = layer.gru(tf.zeros([10, 20, 30]))
def testPartitionedVariableMasking(self):
  partitioner = tf.variable_axis_size_partitioner(40)
  with self.cached_session() as session:
    with tf.variable_scope("", partitioner=partitioner):
      sparsity = tf.Variable(0.5, name="Sparsity")
      weights = tf.get_variable(
          "weights", initializer=tf.linspace(1.0, 100.0, 100))
      masked_weights = pruning.apply_mask(
          weights, scope=tf.get_variable_scope())
    p = pruning.Pruning(sparsity=sparsity)
    p._spec.threshold_decay = 0.0
    mask_update_op = p.mask_update_op()
    tf.global_variables_initializer().run()
    masked_weights_val = masked_weights.eval()
    session.run(mask_update_op)
    masked_weights_val = masked_weights.eval()
    self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
def testDenseLayerSigns(self):
  """EG-DD update."""
  with self.cached_session() as sess:
    var = tf.Variable([0.5, 1.0])
    grad = tf.placeholder(tf.float32, shape=[2])
    opt = egdd.EGDD(
        learning_rate=0.1,
        momentum=0.9,
        beta=0.1,
        gain_learning_rate=1e-2,
        scale_learning_rate=1e-3,
        use_signs=True)

    step = opt.apply_gradients([(grad, var)])
    tf.global_variables_initializer().run()

    pre_var = sess.run(var)
    pre_momentum = sess.run(opt.get_slot(var, 'momentum'))
    pre_gain = sess.run(opt.get_slot(var, 'gain'))
    pre_lr_scale = sess.run(opt.get_slot(var, 'lr_scale'))
    self.assertAllClose([0.5, 1.0], pre_var)
    self.assertAllClose([0.0, 0.0], pre_momentum)
    self.assertAllClose([1.0, 1.0], pre_gain)
    self.assertAllClose([1.0], pre_lr_scale)

    sess.run(step, feed_dict={grad: [0.1, -0.5]})
    pre_var = sess.run(var)
    pre_momentum = sess.run(opt.get_slot(var, 'momentum'))
    pre_gain = sess.run(opt.get_slot(var, 'gain'))
    pre_lr_scale = sess.run(opt.get_slot(var, 'lr_scale'))
    self.assertAllClose([0.49, 1.05], pre_var)
    self.assertAllClose([0.01, -0.05], pre_momentum)
    self.assertAllClose([1, 1], pre_gain)
    self.assertAllClose([1.0], pre_lr_scale)

    sess.run(step, feed_dict={grad: [-1.0, -1.5]})
    pre_var = sess.run(var)
    pre_momentum = sess.run(opt.get_slot(var, 'momentum'))
    pre_gain = sess.run(opt.get_slot(var, 'gain'))
    pre_lr_scale = sess.run(opt.get_slot(var, 'lr_scale'))
    self.assertAllClose([0.5801, 1.2466], pre_var, atol=1e-4)
    self.assertAllClose([-0.0900, -0.1965], pre_momentum, atol=1e-4)
    self.assertAllClose([0.9900, 1.0101], pre_gain, atol=1e-4)
    self.assertAllClose([1.0007], pre_lr_scale, atol=1e-4)
def _SaveAsync(self, sess):
  """Saves the graph asynchronously.

  All the variables are first copied, synchronously, in memory to another set
  of vars, and then the saving to disk is done in a different thread. The
  function blocks until the previous save is done.

  Args:
    sess: A session with tf.Graph under which this object is constructed.

  Returns:
    Returns the global step and file prefix.
  """
  if self._async_save_thread is not None:
    # Wait for the previous save to finish.
    self._async_save_thread.join()
  if self._async_exception is not None:
    e = self._async_exception
    self._async_exception = None
    raise e

  if self._copied_vars is None:
    # Create the first copy of the vars. Doing it here and not in the
    # constructor due to the initialization sequence of TF.
    self._copied_vars = []
    # Note: the variables below will be created in self._var_graph regardless
    # of which graph is set as default, so we need to apply the device context
    # in self._var_graph.
    copying_ops = []
    with self._var_graph.as_default():
      for v in self._vars:
        with self._var_graph.device(v.device):
          copied_v = tf.Variable(v, trainable=False)
          assert copied_v.graph is v.graph
          assert copied_v.device == v.device
          self._copied_vars.append(copied_v)
          copying_ops.append(copied_v.assign(v))
      # Group the ops to avoid running them directly, which would generate
      # expensive send/recv operations.
      self._copying_op = tf.group(*copying_ops)

  sess.run(self._copying_op)
  global_step, prefix = sess.run(
      fetches=[self._save_global_step, self._save_prefix],
      feed_dict={self._logdir_ph: self._logdir})
  tf.logging.info("Saving asynchronously to %s", tf.compat.as_text(prefix))

  def _Async(prefix):
    checkpoint_start_time = time.perf_counter()
    try:
      copied_var_map = {
          id(copied_var): var
          for copied_var, var in zip(self._copied_vars, self._vars)
      }
      save_op = self._AddShardedSaveOps(
          self._copied_vars, prefix,
          lambda copied_var: _VarKey(copied_var_map[id(copied_var)]))
      _ = sess.run(fetches=[save_op], feed_dict={})
      # Many users expect this, as tf.train.Saver does it by default.
      prefix = tf.compat.as_text(prefix)
      self._FinalizeSave(global_step, prefix)
    except Exception as e:  # pylint: disable=broad-except
      self._async_exception = e
    _async_checkpoint_op_time_seconds.get_cell().add(
        time.perf_counter() - checkpoint_start_time)

  self._async_save_thread = threading.Thread(target=_Async, args=(prefix,))
  self._async_save_thread.start()
  return global_step, tf.compat.as_text(prefix)
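# NOTE: a simplified standalone sketch (hypothetical helper, not Lingvo's API)
# of the copy-then-save pattern described in the docstring above: variables are
# snapshotted synchronously in memory, and the disk write runs on a background
# thread while training continues.
import threading

def async_save(sess, copying_op, save_op, prev_thread=None):
  if prev_thread is not None:
    prev_thread.join()  # Block until the previous save has finished.
  sess.run(copying_op)  # Synchronous in-memory snapshot of the variables.
  thread = threading.Thread(target=lambda: sess.run(save_op))
  thread.start()  # The checkpoint write happens off the main thread.
  return thread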
def _parse_record_stateful(record):
  del record
  extra = tf.Variable(0)
  example = py_utils.NestedMap(t=extra.value())
  bucketing_key = 1
  return example, bucketing_key
def testShampooWithMatrixShapedTensors(self):
  # A parameter matrix of size [4, 2] results in L_{t} and R_{t} of
  # sizes [4, 4] and [2, 2].
  size = [4, 2]
  init_var_np = np.zeros(size)
  # Initialize the gradient as a random tensor.
  grad_np = np.random.rand(size[0], size[1])

  with tf.Session():
    global_step = tf.Variable(0, dtype=tf.int64)
    var = tf.Variable(init_var_np, dtype=tf.float32)
    grad = tf.constant(grad_np, dtype=tf.float32)

    opt = distributed_shampoo.DistributedShampoo(
        learning_rate=1.0,
        momentum=0.0,
        start_preconditioning_steps=0,
        synchronous_preconditioning=True,
        global_step=global_step)

    # Run a single step of gradient update.
    update = opt.apply_gradients(zip([grad], [var]), global_step=global_step)

    # Preconditioner computation and assignments to variables.
    compute_preconditioner_op = opt.invoke_async_preconditioner_computation(
        tf.cast(global_step, tf.int32))
    assign_preconditioners_to_vars_op = (
        opt.assign_preconditioner_to_host_vars())

    self.evaluate(tf.global_variables_initializer())
    tf.tables_initializer().run()

    init_val = self.evaluate(var)
    self.assertAllCloseAccordingToType(init_var_np, init_val)

    def np_power(mat_g, alpha, matrix_epsilon=1e-6):
      """Computes mat_g^alpha for a square symmetric matrix mat_g."""
      mat_for_svd = mat_g + np.eye(mat_g.shape[0]) * matrix_epsilon
      mat_u, diag_d, mat_v = np.linalg.svd(mat_for_svd, full_matrices=True)
      diag_d = np.power(np.maximum(diag_d, matrix_epsilon), alpha)
      return np.dot(mat_u, np.dot(np.diag(diag_d), mat_v))

    def norm(val):
      return np.sqrt(np.sum(np.square(val)))

    # Run a step of preconditioner update.
    update.run()

    mat_g1 = np.dot(grad_np, grad_np.transpose())
    expected_mat_g1 = self.evaluate(opt.get_slot(var, 'mat_statistics_0'))
    self.assertAllCloseAccordingToType(mat_g1, expected_mat_g1, atol=1e-1)

    mat_g2 = np.dot(grad_np.transpose(), grad_np)
    expected_mat_g2 = self.evaluate(opt.get_slot(var, 'mat_statistics_1'))
    self.assertAllCloseAccordingToType(mat_g2, expected_mat_g2, atol=1e-1)

    compute_preconditioner_op.run()
    assign_preconditioners_to_vars_op.run()

    mat_left = np_power(mat_g1, -0.25)
    expected_mat_left = self.evaluate(opt.get_slot(var, 'mat_preconditioner_0'))
    self.assertAllCloseAccordingToType(mat_left, expected_mat_left, atol=1e-1)

    mat_right = np_power(mat_g2, -0.25)
    expected_mat_right = self.evaluate(
        opt.get_slot(var, 'mat_preconditioner_1'))
    self.assertAllCloseAccordingToType(mat_right, expected_mat_right, atol=1e-1)

    # As the preconditioners are initialized to all zero, we don't make
    # any update.
    var_step_0_val = self.evaluate(var)
    self.assertAllCloseAccordingToType(init_var_np, var_step_0_val, atol=1e-1)

    # Run another step of training.
    update.run()
    var_step_1_val = self.evaluate(var)

    # The new update has the scale of the second diagonal adagrad update.
    adagrad_update = grad_np / np.sqrt(2 * np.square(grad_np))
    preconditioned_grad_update = np.dot(np.dot(mat_left, grad_np), mat_right)

    # With normalization by the diagonal enabled.
    var_step_1_np = init_var_np - preconditioned_grad_update * norm(
        adagrad_update) / norm(preconditioned_grad_update)
    self.assertAllCloseAccordingToType(var_step_1_np, var_step_1_val, atol=1e-1)

    # Compute new preconditioners.
    compute_preconditioner_op.run()
    assign_preconditioners_to_vars_op.run()

    # Gradients are summed over time.
    mat_g1 += np.dot(grad_np, grad_np.transpose())
    mat_left = np_power(mat_g1, -0.25)
    expected_mat_left = self.evaluate(opt.get_slot(var, 'mat_preconditioner_0'))
    self.assertAllCloseAccordingToType(mat_left, expected_mat_left, atol=1e-1)

    mat_g2 += np.dot(grad_np.transpose(), grad_np)
    mat_right = np_power(mat_g2, -0.25)
    expected_mat_right = self.evaluate(
        opt.get_slot(var, 'mat_preconditioner_1'))
    self.assertAllCloseAccordingToType(mat_right, expected_mat_right, atol=1e-1)
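# NOTE: a standalone NumPy sketch (illustrative only) of the second Shampoo
# update verified in the test above: the gradient is preconditioned as
# L^{-1/4} G R^{-1/4} and then rescaled to the norm of the diagonal AdaGrad
# update, mirroring the test's var_step_1_np computation.
import numpy as np

def np_power(mat_g, alpha, matrix_epsilon=1e-6):
  """Computes mat_g^alpha for a square symmetric matrix mat_g."""
  mat_u, diag_d, mat_v = np.linalg.svd(
      mat_g + np.eye(mat_g.shape[0]) * matrix_epsilon, full_matrices=True)
  diag_d = np.power(np.maximum(diag_d, matrix_epsilon), alpha)
  return np.dot(mat_u, np.dot(np.diag(diag_d), mat_v))

grad = np.random.rand(4, 2)
mat_left = np_power(np.dot(grad, grad.T), -0.25)      # L_t^{-1/4}
mat_right = np_power(np.dot(grad.T, grad), -0.25)     # R_t^{-1/4}
precond_grad = np.dot(np.dot(mat_left, grad), mat_right)
adagrad_update = grad / np.sqrt(2 * np.square(grad))  # diagonal AdaGrad scale
step = precond_grad * np.linalg.norm(adagrad_update) / np.linalg.norm(precond_grad)
new_var = np.zeros([4, 2]) - step                     # matches var_step_1_np above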