def testConditionalMaskUpdate(self):
    """Masks follow a step-indexed sparsity schedule over ten training steps.

    A 100-element weight holding 1..100 is pruned with a schedule that returns
    a fixed sparsity per step; after every step the number of non-zero weights
    is recorded and compared with the expected sequence.
    """
    weight = K.variable(np.linspace(1.0, 100.0, 100), name="weights")
    mask = K.ones(weight.get_shape())
    threshold = K.zeros([])

    def linear_sparsity(step):
        # Schedule contract used here: (should_prune, sparsity_for_step).
        sparsity_val = tf.convert_to_tensor(
            [0.0, 0.1, 0.1, 0.3, 0.3, 0.5, 0.5, 0.5, 0.5, 0.5])
        return tf.convert_to_tensor(True), sparsity_val[step]

    # Set up pruning
    pruner = pruning_impl.Pruning(
        pruning_vars=[(weight, mask, threshold)],
        training_step_fn=self.training_step_fn,
        pruning_schedule=linear_sparsity,
        block_size=self.block_size,
        block_pooling_type=self.block_pooling_type)

    observed_counts = []
    for _ in range(10):
        if not tf.executing_eagerly():
            # Graph mode: every op has to be run through the Keras session.
            K.get_session().run(pruner.conditional_mask_update())
            K.get_session().run(pruner.weight_mask_op())
            K.get_session().run(tf.assign_add(self.global_step, 1))
        else:
            pruner.conditional_mask_update()
            pruner.weight_mask_op()
            tf.assign_add(self.global_step, 1)

        observed_counts.append(np.count_nonzero(K.get_value(weight)))

    # Weights pruned at steps 1,3,5
    expected_counts = [100, 90, 90, 70, 70, 50, 50, 50, 50, 50]
    self.assertAllEqual(expected_counts, observed_counts)
def __init__(self, epsilon=1e-2, shape=()):
    """Create running sum / sum-of-squares / count variables and derived stats.

    Args:
        epsilon: Initial value of the sum-of-squares and count accumulators.
        shape: Shape of the per-element accumulators (the count is a scalar).

    Exposes `mean` and `std` tensors (float32) computed from the float64
    accumulators, and `incfiltparams`, a callable built with `U.function`
    that adds a (sum, sumsq, count) triple to the accumulators.
    """

    def _accumulator(name, var_shape, initial_value):
        # All accumulators are non-trainable float64 variables.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(initial_value),
            name=name,
            trainable=False)

    self._sum = _accumulator("runningsum", shape, 0.0)
    self._sumsq = _accumulator("runningsumsq", shape, epsilon)
    self._count = _accumulator("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    # The variance estimate is floored at 1e-2 before taking the square root.
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, newsum),
        tf.assign_add(self._sumsq, newsumsq),
        tf.assign_add(self._count, newcount),
    ]
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [], updates=increments)
def _ref_add_batch(self):
    """Build an op that folds the current batch into the reference accumulators.

    Each reference gradient is increased by the matching gradient scaled by
    the batch's sample count (cast to the loss dtype), and the reference
    sample counter is increased by that count.

    Returns:
        A grouped op running all of the updates.
    """
    updates = [
        tf.assign_add(
            ref_grad,
            grad * tf.cast(self._num_ref_samples_batch,
                           dtype=self._loss.dtype))
        for ref_grad, grad in zip(self._grads_ref, self._grads)
    ]
    updates.append(
        tf.assign_add(self._num_ref_samples, self._num_ref_samples_batch))
    return tf.group(updates)
def testPlateauOpHook(self):
    """PlateauOpHook runs its op only after the watched metric stops rising.

    The hook is configured on the "count_1" metric with `incr_indicator` as
    the op to run; `indicator` must stay below 1 while the counter keeps
    increasing and exceed 1 once the counter has been frozen long enough.
    """
    global_step = tf.train.create_global_step()
    counter = tf.get_variable("count", initializer=0, dtype=tf.int32)
    indicator = tf.get_variable("indicator", initializer=0, dtype=tf.int32)
    tf.summary.scalar("count", counter)
    incr_global_step = tf.assign_add(global_step, 1)
    incr_counter = tf.assign_add(counter, 1)
    incr_indicator = tf.assign_add(indicator, 1)

    # Stop if the global step has not gone up by more than 1 in 20 steps.
    ckpt_dir = self.ckpt_dir("plateauop")
    stop_hook = metrics_hook.PlateauOpHook(
        ckpt_dir,
        "count_1",
        incr_indicator,
        num_plateau_steps=20,
        plateau_delta=1.,
        plateau_decrease=False,
        every_n_steps=10)

    def run_steps(session, fetches, num_steps):
        # Run `fetches` `num_steps` times in a row.
        for _ in range(num_steps):
            session.run(fetches)

    step_and_count = (incr_global_step, incr_counter)
    with self.sess(stop_hook, ckpt_dir) as sess:
        run_steps(sess, step_and_count, 20)

        # Summary files should now have 2 values in them
        self.flush()

        # Run for more steps so that the hook gets triggered and we verify
        # that we don't stop.
        run_steps(sess, step_and_count, 30)
        self.flush()

        # Run without incrementing the counter
        run_steps(sess, incr_global_step, 30)
        self.flush()

        self.assertTrue(sess.run(indicator) < 1)

        # Metrics should be written such that now the counter has gone >20
        # steps without being incremented.
        # Check that we run the incr_indicator op several times
        for _ in range(3):
            run_steps(sess, incr_global_step, 10)
            self.flush()

        self.assertTrue(sess.run(indicator) > 1)
def _dense_moving_average(self, x_tm1, a_t, name, beta=.9):
    """Update the moving-average accumulator `name` of `x_tm1` with `a_t`.

    Two accumulators are fetched via `self.get_accumulator`: the average
    itself and a scalar step counter (`<name>/tm1`). The counter is
    incremented, and the average is updated in place as
    `beta_t * b_tm1 + (1 - beta_t) * a_t`, where

    * `beta_t = beta * (1 - beta**tm1) / (1 - beta**t)` when `beta < 1`;
    * `beta_t = tm1 / t` otherwise.

    Args:
        x_tm1: Object the accumulators are attached to.
        a_t: New value to blend into the average.
        name: Accumulator name.
        beta: Decay constant.

    Returns:
        Tuple `(b_t, t)`: the updated average and the incremented counter.
    """
    average = self.get_accumulator(x_tm1, '%s' % name)
    prev_step = self.get_accumulator(x_tm1, '%s/tm1' % name, shape=[])
    step = tf.assign_add(prev_step, 1)

    if not beta < 1:
        decay = prev_step / step
    else:
        decay = tf.convert_to_tensor(beta, name='%s/decay' % name)
        decay = decay * (1 - beta**prev_step) / (1 - beta**step)

    # Two-stage in-place update: scale the old average, then add the new term.
    scaled = tf.assign(average, decay * average)
    updated = tf.assign_add(scaled, (1 - decay) * a_t)
    return updated, step
def session_run_job():
    """Run one grouped increment through a TensorBoard-debugger-wrapped session.

    Builds two int32 variables, increments both by a constant via a grouped
    op, and appends the result of running that op to `session_run_results`.
    """
    with tf.Session() as raw_sess:
        a = tf.Variable(10, dtype=tf.int32, name="a")
        b = tf.Variable(20, dtype=tf.int32, name="b")
        d = tf.constant(1, dtype=tf.int32, name="d")
        inc_a = tf.assign_add(a, d, name="inc_a")
        inc_b = tf.assign_add(b, d, name="inc_b")
        inc_ab = tf.group([inc_a, inc_b], name="inc_ab")

        # Variables are initialised on the plain session, before wrapping.
        raw_sess.run(tf.global_variables_initializer())

        debug_sess = tf_debug.TensorBoardDebugWrapperSession(
            raw_sess, self._debugger_url)
        result = debug_sess.run(inc_ab)
        session_run_results.append(result)
def computation_fn():
    """Build a tiny Mesh-TensorFlow training step and lower it to TF ops.

    Returns:
        Tuple `(lr, train_op)`: the first value returned by
        `optimization_lib.create_optimizer`, and a grouped op that runs the
        lowered update ops plus a global-step increment.
    """
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, 'my_mesh')
    mesh_shape = mtf.convert_to_shape('all:2')
    layout_rules = mtf.convert_to_layout_rules('none:all')
    mesh_devices = [''] * mesh_shape.size
    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
        mesh_shape, layout_rules, mesh_devices, device_assignment)

    hidden_dim = mtf.Dimension('hidden', 3)
    w = mtf.get_variable(
        mesh,
        'w',
        shape=[hidden_dim],
        initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
    x = mtf.constant(mesh, [0.4, 0.2, -0.5], [hidden_dim], dtype=tf.float32)
    loss = mtf.reduce_mean(mtf.square(x - w))

    lr, update_ops = optimization_lib.create_optimizer(loss, 0.2, 100, 10)

    # The lowering is kept on `self` so the enclosing scope can reach it.
    self.lowering = mtf.Lowering(graph, {mesh: mesh_impl})

    tf_update_ops = []
    for mtf_op in update_ops:
        tf_update_ops.append(self.lowering.lowered_operation(mtf_op))
    tf_update_ops.append(
        tf.assign_add(tf.train.get_or_create_global_step(), 1))

    return lr, tf.group(tf_update_ops)
def testPeriodicTargetUpdate(self, use_locking, update_period):
    """Tests that the simple success case works as expected.

    This is an integration test. The periodically and update parts are
    unit-tested in the preceding.

    Args:
        use_locking: value for `periodic_target_update`'s `use_locking`
            argument.
        update_period: how often an update should happen.
    """
    target_variables = [tf.Variable(tf.zeros([1, 2]))]
    source_variables = [tf.Variable(tf.random_normal([1, 2]))]
    increment = tf.ones([1, 2])

    # The source changes on every step, so targets can only match right
    # after a copy.
    update_source_op = tf.assign_add(source_variables[0], increment)
    updated = target_update_ops.periodic_target_update(
        target_variables,
        source_variables,
        update_period=update_period,
        use_locking=use_locking)

    total_steps = 3 * update_period
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        for step in range(total_steps):
            sess.run(update_source_op)
            sess.run(updated)
            targets, sources = sess.run(
                [target_variables, source_variables])

            if step % update_period != 0:
                self.assertNotAllClose(targets, sources)
            else:
                self.assertAllClose(targets, sources)
def get_train_ops(self, graph_ops, infeed_queue, i_tr, X_b_tr, y_b_tr):
    """Add training operations to the graph.

    Populates `graph_ops` with 'incr_global_step', 'lr' and 'epochs', plus
    'train' when the experiment config has training enabled.

    Args:
        graph_ops: Dict of named graph ops; updated in place and returned.
        infeed_queue: Queue handed to `loops_repeat` when infeeds are used.
        i_tr: Unused in this method.
        X_b_tr: Training input batch, used when infeeds are not used.
        y_b_tr: Unused in this method.

    Returns:
        The same `graph_ops` dict.
    """
    maybe_compile = self.device_config['maybe_xla_compile']

    # Need to close over scope of `self` for GPU XLA
    def tr(X):
        return self.train_ops(X)

    def train_op(loss, i, X, y):
        # Loop-body signature; only the inputs X are actually consumed.
        return tr(X)

    def tr_infeed():
        initial_loss = tf.zeros(self.loss_shape, self.experiment.dtype)
        return loops_repeat(
            self.device_config['device'],
            self.iters_per_sess_run,
            train_op,
            [initial_loss],
            infeed_queue,
            maybe_xla=maybe_compile)

    with self.graph.as_default():
        graph_ops['incr_global_step'] = tf.assign_add(
            self.global_step, self.iters_per_sess_run)

        with self.device_config['scoper']():
            if self.experiment.config.training:
                graph_ops['train'] = (
                    tr_infeed() if self.use_infeed
                    else maybe_compile(tr, [X_b_tr]))

        graph_ops['lr'] = self.get_current_learning_rate()
        graph_ops['epochs'] = self.get_epoch()
    return graph_ops
def _apply_and_zero():
    """Apply the accumulated gradients, then clear the accumulators.

    Returns:
        A grouped op covering the accumulator resets and an increment of
        `self._counter`.
    """
    accumulated_pairs = list(zip(accums, variables))
    apply_op = self._opt.apply_gradients(accumulated_pairs)

    # The accumulators may only be zeroed once the wrapped optimizer has
    # consumed them.
    with tf.control_dependencies([apply_op]):
        zero_op = []
        for accum in accums:
            zero_op.append(tf.assign(accum, tf.zeros_like(accum)))

    return tf.group(zero_op, tf.assign_add(self._counter, 1))
def testWeightSpecificSparsity(self):
    """Per-weight entries in `weight_sparsity_map` set the final sparsity.

    Three 100-element weights are masked under different variable scopes;
    after 110 mask updates their sparsities must be 0.6, 0.75 and 0.6.
    """
    test_spec = ",".join([
        "begin_pruning_step=1",
        "pruning_frequency=1",
        "end_pruning_step=100",
        "target_sparsity=0.5",
        "weight_sparsity_map=[layer1:0.6,layer2/weights:0.75,.*kernel:0.6]",
        "threshold_decay=0.0",
    ])
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    # (variable scope, variable name) for each masked weight, in creation
    # order.
    weight_specs = [
        ("layer1", "weights"),
        ("layer2", "weights"),
        ("layer3", "kernel"),
    ]
    for scope_name, var_name in weight_specs:
        with tf.variable_scope(scope_name):
            weight = tf.Variable(tf.linspace(1.0, 100.0, 100), name=var_name)
            _ = pruning.apply_mask(weight)

    p = pruning.Pruning(pruning_hparams)
    mask_update_op = p.conditional_mask_update_op()
    increment_global_step = tf.assign_add(self.global_step, 1)

    with self.cached_session() as session:
        tf.global_variables_initializer().run()
        for _ in range(110):
            session.run(mask_update_op)
            session.run(increment_global_step)

        sparsities = session.run(pruning.get_weight_sparsity())
        self.assertAllClose(sparsities, [0.6, 0.75, 0.6])
def _update_dy(self):
    """Build an op that adds each gradient to its matching `dy` accumulator.

    Returns:
        A grouped op running one `assign_add` per (dy, gradient) pair.
    """
    accumulate_ops = [
        tf.assign_add(dy, grad)
        for dy, grad in zip(self._dys, self._grads)
    ]
    return tf.group(accumulate_ops)
def __init__(self, train_time, time_limit=None):
    """Store the time variable and build the op that advances it.

    Args:
        train_time: Variable that `_increment_op` adds to.
        time_limit: Optional limit, stored as-is for use elsewhere.
    """
    super(TrainTimeHook, self).__init__()
    self._train_time = train_time
    self._time_limit = time_limit
    self._last_run_duration = None

    # The amount to add is fed at run time through this float32 placeholder.
    increment_amount = tf.placeholder(tf.float32, None)
    self._increment_amount = increment_amount
    self._increment_op = tf.assign_add(train_time, increment_amount)
def build(self, input_shape):
    """Create the `train_step` weight and register its per-call increment.

    The increment is the Keras learning phase cast to float32 and is
    registered through `add_update`.

    Args:
        input_shape: Forwarded unchanged to the parent `build`.
    """
    self.train_step = self.add_weight(
        name='train_step',
        shape=[],
        initializer='zeros',
        trainable=False)

    learning_phase = tf.keras.backend.learning_phase()
    self.add_update(
        tf.assign_add(self.train_step, tf.cast(learning_phase, tf.float32)))

    super().build(input_shape)
def learner(model, params):
    """Run a learner job.

    Loads the 'train' split, adds targets and noise according to `params`
    ('field', 'history', 'noise', 'gamma'), and trains `model` with Adam
    inside a `MonitoredTrainingSession` until
    `FLAGS.num_training_steps` is reached.
    """
    raw_ds = dataset.load_dataset(FLAGS.dataset_dir, 'train')
    with_targets = dataset.add_targets(
        raw_ds, [params['field']], add_history=params['history'])
    train_ds = dataset.split_and_preprocess(
        with_targets,
        noise_field=params['field'],
        noise_scale=params['noise'],
        noise_gamma=params['gamma'])
    inputs = tf.data.make_one_shot_iterator(train_ds).get_next()

    loss_op = model.loss(inputs)
    global_step = tf.train.create_global_step()
    decayed_lr = tf.train.exponential_decay(
        learning_rate=1e-4,
        global_step=global_step,
        decay_steps=int(5e6),
        decay_rate=0.1)
    lr = decayed_lr + 1e-6
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    minimize_op = optimizer.minimize(loss_op, global_step=global_step)

    # Don't train for the first few steps, just accumulate normalization stats
    # NOTE(review): minimize_op is created outside the tf.cond branches; in
    # TF1 graph mode such ops can run regardless of the predicate. Confirm
    # that the warm-up phase really skips the optimizer update.
    train_op = tf.cond(
        tf.less(global_step, 1000),
        lambda: tf.group(tf.assign_add(global_step, 1)),
        lambda: tf.group(minimize_op))

    stop_hook = tf.train.StopAtStepHook(last_step=FLAGS.num_training_steps)
    with tf.train.MonitoredTrainingSession(
            hooks=[stop_hook],
            checkpoint_dir=FLAGS.checkpoint_dir,
            save_checkpoint_secs=600) as sess:
        while not sess.should_stop():
            _, step, loss = sess.run([train_op, global_step, loss_op])
            if step % 1000 == 0:
                logging.info('Step %d: Loss %g', step, loss)
        logging.info('Training complete.')
def _update_weights(self):
    """Build an op that moves every weight by `lr * y`.

    The learning rate is read from `self._conf['lr']`; weights and `ys` are
    paired positionally.

    Returns:
        A grouped op running one `assign_add` per weight.
    """
    step_ops = [
        tf.assign_add(weight, self._conf['lr'] * y)
        for weight, y in zip(self._weights, self._ys)
    ]
    return tf.group(step_ops)
def update_fisher_diag(self, n_task):
    """Recompute the Fisher diagonal and snapshot it for task `n_task - 1`.

    Steps, all executed through `self.objs['sess']`:
      1. Reset the Fisher diagonals and zero the accumulators
         (`fisher_diagcs`).
      2. Run `fisher_sum_up_ops` over every full minibatch from `self.it`.
      3. Divide the accumulators by the number of examples processed.
      4. Add the accumulators into `fisher_diags`.
      5. Copy `fisher_diags` into fresh variables stored at
         `self.saved_fishers[n_task - 1]`.

    Args:
        n_task: 1-based task number; the snapshot is stored at `n_task - 1`.
    """
    # Reset is mandatory
    print('Mandatory fisher diagonal reset')
    self.reset_fisher_diag()
    print("Reset fishers computed")

    sess = self.objs['sess']
    accumulators = self.objs['fisher_diagcs']

    sess.run([tf.assign(acc, tf.zeros_like(acc)) for acc in accumulators])

    n_minibatches = self.it.n // self.fisher_batch_size
    n_examples = n_minibatches * self.fisher_batch_size
    self.it.i = 0  # rewind the iterator before the accumulation pass

    # Accumulator totals before/after the pass are evaluated but only useful
    # when debugging.
    orig = sess.run(utils.sum_up(self.objs['fisher_diagcs']))
    for _ in range(n_minibatches):
        nX, nY = next(self.it)
        train_data = {self.phs['fisher_X']: nX, self.phs['fisher_Y']: nY}
        sess.run(self.objs['fisher_sum_up_ops'], feed_dict=train_data)
    newv = sess.run(utils.sum_up(self.objs['fisher_diagcs']))
    print('Ran fisher_sum_up_ops (examples: %d)' % n_examples)

    # Turn the sums into per-example averages.
    sess.run([
        tf.assign(acc, tf.divide(acc, n_examples))
        for acc in self.objs['fisher_diagcs']
    ])

    fisher_diags = self.objs['fisher_diags']
    orig = sess.run(utils.sum_up(fisher_diags))
    origs = ["%.2f" % orig]
    sess.run([
        tf.assign_add(diag, acc)
        for acc, diag in zip(self.objs['fisher_diagcs'], fisher_diags)
    ])
    newv = sess.run(utils.sum_up(self.objs['fisher_diags']))
    newvs = ["%.2f" % newv]
    print("changed %s => %s" % (" , ".join(origs), " , ".join(newvs)))

    task_slot = n_task - 1  # say task 0
    self.saved_fishers[task_slot] = []
    save_ops = []
    for diag in self.objs['fisher_diags']:
        snapshot = tf.Variable(tf.zeros_like(diag))
        self.saved_fishers[task_slot].append(snapshot)
        # Assigning doubles as initialisation for the new variable.
        save_ops.append(tf.assign(snapshot, diag))
    sess.run(save_ops)
    print("Saved fishers for task %d" % task_slot)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulate gradients and apply them every `self._grad_steps` steps.

    Each gradient is divided by `self._grad_steps` and added to the
    variable's 'grad_accum' slot. On steps where
    `global_step % grad_steps == grad_steps - 1` the accumulated values are
    handed to the wrapped optimizer and the accumulators are zeroed;
    otherwise only the accumulation runs. `global_step` is incremented
    afterwards in both cases.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs. One-shot
            iterables such as `zip(...)` are accepted.
        global_step: Step variable that drives the accumulation schedule.
            Required even though the signature allows None for
            compatibility with the optimizer interface.
        name: Unused; kept for interface compatibility.

    Returns:
        A grouped op that performs the accumulate/apply step and then
        increments `global_step`.

    Raises:
        ValueError: If `global_step` is None.
    """
    if global_step is None:
        # The schedule is computed from global_step, so fail early with a
        # clear message instead of an opaque error from tf.mod(None, ...).
        raise ValueError(
            'global_step must be provided: it determines when accumulated '
            'gradients are applied.')

    # The pairs are traversed twice below; materialise them so that an
    # iterator argument is not exhausted by the first traversal.
    grads_and_vars = list(grads_and_vars)

    with tf.init_scope():
        self._create_slots([v for (_, v) in grads_and_vars])

    accums = []
    variables = []
    for g, v in grads_and_vars:
        accum = self.get_slot(v, 'grad_accum')
        variables.append(v)
        if isinstance(g, tf.IndexedSlices):
            scaled_grad = tf.IndexedSlices(
                g.values / self._grad_steps,
                g.indices,
                dense_shape=g.dense_shape)
            accums.append(accum.assign_add(scaled_grad))  # pytype: disable=attribute-error
        else:
            accums.append(accum.assign_add(g / self._grad_steps))  # pytype: disable=attribute-error

    def _apply_and_zero():
        # Apply the accumulated gradients, then reset the accumulators.
        apply_op = self._opt.apply_gradients(list(zip(accums, variables)))
        with tf.control_dependencies([apply_op]):
            zero_op = [
                tf.assign(accum, tf.zeros_like(accum)) for accum in accums
            ]
        return tf.group(zero_op, tf.assign_add(self._counter, 1))

    def _accum():
        # Accumulate only; the wrapped optimizer is not invoked.
        return tf.group(accums)

    accum_step = tf.cond(
        tf.equal(tf.mod(global_step, self._grad_steps),
                 self._grad_steps - 1),
        _apply_and_zero,
        _accum)

    # The step counter moves only after the accumulate/apply branch has run.
    with tf.control_dependencies([accum_step]):
        global_step = tf.assign_add(global_step, 1)
    return tf.group(global_step)
def test_prune_every_n_steps(self):
    """The pruning listener is invoked at step 0 and again at `every_steps`.

    A further `every_steps - 1` steps after the second invocation must not
    trigger a third one.
    """
    every_steps = 10
    pruning_obj = MockPruningObject()

    with tf.Graph().as_default():
        listener = pruning_hook.ModelPruningListener(pruning_obj)
        hook = pruning_hook.ModelPruningHook(
            every_steps=every_steps, listeners=[listener])
        global_step = tf.train.get_or_create_global_step()
        train_op = tf.constant(0)
        global_step_increment_op = tf.assign_add(global_step, 1)

        with tf.train.MonitoredSession(
                tf.train.ChiefSessionCreator(), hooks=[hook]) as mon_sess:

            def train_one_step():
                # One "training" step followed by a global-step bump.
                mon_sess.run(train_op)
                mon_sess.run(global_step_increment_op)

            mon_sess.run(tf.global_variables_initializer())

            train_one_step()
            # ModelPruningHook runs once after session creation, at step 0.
            self.assertEqual(len(pruning_obj.logged_steps), 1)
            self.assertEqual(pruning_obj.logged_steps[0], 0)

            for _ in range(every_steps - 1):
                train_one_step()
            self.assertEqual(len(pruning_obj.logged_steps), 2)
            self.assertSameElements(
                pruning_obj.logged_steps, [0, every_steps])

            for _ in range(every_steps - 1):
                train_one_step()
            self.assertEqual(len(pruning_obj.logged_steps), 2)
            self.assertSameElements(
                pruning_obj.logged_steps, [0, every_steps])
def testConditionalMaskUpdate(self):
    """The mask is refreshed only on scheduled pruning steps.

    The sparsity variable is set by hand before every step (0.0 to 0.9 in
    ten steps); the count of non-zero masked weights after each step must
    match the expected staircase.
    """
    test_spec = ",".join([
        "pruning_frequency=2",
        "begin_pruning_step=1",
        "end_pruning_step=6",
        "nbins=100",
    ])
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    weights = tf.Variable(tf.linspace(1.0, 100.0, 100), name="weights")
    masked_weights = pruning.apply_mask(weights)
    sparsity = tf.Variable(0.00, name="sparsity")

    # Set up pruning
    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
    p._spec.threshold_decay = 0.0
    mask_update_op = p.conditional_mask_update_op()
    sparsity_val = tf.linspace(0.0, 0.9, 10)
    increment_global_step = tf.assign_add(self.global_step, 1)

    observed_counts = []
    with self.cached_session() as session:
        tf.global_variables_initializer().run()
        for step in range(10):
            session.run(tf.assign(sparsity, sparsity_val[step]))
            session.run(mask_update_op)
            session.run(increment_global_step)
            observed_counts.append(np.count_nonzero(masked_weights.eval()))

    # Weights pruned at steps 0,2,4,and,6
    expected_counts = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
    self.assertAllEqual(expected_counts, observed_counts)
def _create_var(name: str, value_expr: TfExpression) -> TfExpression:
    """Internal helper for creating autosummary accumulators.

    Builds a 3-element accumulator variable `[sum(1), sum(x), sum(x**2)]`
    for `value_expr`, registers it under `name` in the module-level `_vars`
    dict, and returns the op that folds the current value into it.
    Non-finite values contribute zeros.
    """
    assert not _finalized
    name_id = name.replace("/", "_")
    value = tf.cast(value_expr, _dtype)

    # Element count: a constant when the shape is static, otherwise dynamic.
    if not value.shape.is_fully_defined():
        size = None
        size_expr = tf.reduce_prod(tf.cast(tf.shape(value), _dtype))
    else:
        size = np.prod(value.shape.as_list())
        size_expr = tf.constant(size, dtype=_dtype)

    if size == 1:
        # Single element: no reductions needed, just drop to a scalar.
        if value.shape.ndims != 0:
            value = tf.reshape(value, [])
        moments = [size_expr, value, tf.square(value)]
    else:
        moments = [
            size_expr,
            tf.reduce_sum(value),
            tf.reduce_sum(tf.square(value)),
        ]
    delta = tf.cond(
        tf.is_finite(moments[1]),
        lambda: tf.stack(moments),
        lambda: tf.zeros(3, dtype=_dtype))

    with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.control_dependencies(None):
        var = tf.Variable(tf.zeros(3, dtype=_dtype), trainable=False)  # [sum(1), sum(x), sum(x**2)]

    # An uninitialised accumulator is assigned directly instead of added to.
    update_op = tf.cond(
        tf.is_variable_initialized(var),
        lambda: tf.assign_add(var, delta),
        lambda: tf.assign(var, delta))

    _vars.setdefault(name, []).append(var)
    return update_op
def testWeightSparsityTiebreaker(self):
    """All-equal weights still end up at exactly the target sparsity.

    A 100-element weight of ones is pruned towards a target sparsity of 0.5;
    after 110 mask updates the reported sparsity must be 0.5.
    """
    test_spec = ",".join([
        "begin_pruning_step=1",
        "pruning_frequency=1",
        "end_pruning_step=100",
        "target_sparsity=0.5",
        "threshold_decay=0.0",
    ])
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    with tf.variable_scope("layer1"):
        tied_weights = tf.Variable(
            np.ones([100], dtype=np.float32), name="weights")
        _ = pruning.apply_mask(tied_weights)

    p = pruning.Pruning(pruning_hparams)
    mask_update_op = p.conditional_mask_update_op()
    increment_global_step = tf.assign_add(self.global_step, 1)

    with self.cached_session() as session:
        tf.global_variables_initializer().run()
        for _ in range(110):
            session.run(mask_update_op)
            session.run(increment_global_step)

        sparsities = session.run(pruning.get_weight_sparsity())
        self.assertAllClose(sparsities, [0.5])
def __init__(self, update_batchnorm_params=True):
    """Set up the step counter, its increment op and the learning rate.

    Args:
        update_batchnorm_params: Stored on the instance for use elsewhere.

    Sets `steps_per_epoch`, `global_step_inc` and `lr`, all derived from
    command-line FLAGS.
    """
    self.update_batchnorm_params = update_batchnorm_params

    # An explicit number of supervised examples overrides the split size.
    num_samples = datasets.get_count(FLAGS.train_split)
    if FLAGS.num_supervised_examples:
        num_samples = FLAGS.num_supervised_examples
    steps_per_epoch = num_samples // FLAGS.batch_size
    self.steps_per_epoch = steps_per_epoch

    global_step = tf.train.get_or_create_global_step()
    self.global_step_inc = tf.assign_add(global_step, 1)

    # lr_scale_batch_size defines a canonical batch size that is coupled with
    # the initial learning rate. If the actual batch size differs from the
    # canonical one, the learning rate is scaled linearly. This is very
    # convenient, as it allows varying the batch size without recomputing the
    # learning rate.
    if FLAGS.lr_scale_batch_size:
        lr_factor = FLAGS.batch_size / float(FLAGS.lr_scale_batch_size)
    else:
        lr_factor = 1.0

    # We actually also accept fractional epochs.
    schedule_in_steps = utils.get_schedule_from_config(
        FLAGS.schedule, steps_per_epoch)
    # First entry is the warm-up length; the last entry is not a decay step.
    warmup = schedule_in_steps[0]
    decays = schedule_in_steps[1:-1]

    self.lr = get_lr(
        global_step,
        base_lr=FLAGS.lr * lr_factor,
        decay_steps=decays,
        lr_decay_factor=FLAGS.lr_decay_factor,
        warmup_steps=warmup)
def _reset_non_empty(self, indices):
    """Reset the batch of environments.

    The ops are chained with control dependencies in this order: optional
    video-writer reset, episode counter increment and history-buffer reset,
    optional frame dump and observation assignment, model-reset flag, and
    finally the gather of the new observations.

    Args:
        indices: The batch indices of the environments to reset.

    Returns:
        Batch tensor of the new observations.
    """

    def _reset_writer():
        return tf.py_func(self._video_reset_writer, [], [])

    def _dump_frames():
        return tf.py_func(
            self._video_dump_frames,
            [self.history_buffer.get_all_elements()],
            [])

    reset_video_op = tf.cond(self._video_condition, _reset_writer, tf.no_op)

    with tf.control_dependencies([reset_video_op]):
        inc_op = tf.assign_add(self._episode_counter, 1)
        buffer_reset_op = self.history_buffer.reset(indices)

        with tf.control_dependencies([buffer_reset_op, inc_op]):
            initial_frame_dump_op = tf.cond(
                self._video_condition, _dump_frames, tf.no_op)
            # The most recent frame of the history becomes the observation.
            latest_frames = self.history_buffer.get_all_elements()[:, -1, ...]
            observ_assign_op = self._observ.assign(latest_frames)

            with tf.control_dependencies(
                    [observ_assign_op, initial_frame_dump_op]):
                reset_model_op = tf.assign(
                    self._reset_model, tf.constant(1.0))

                with tf.control_dependencies([reset_model_op]):
                    return tf.gather(self._observ.read_value(), indices)
def get_train_op(loss, initial_learning_rate, momentum, lr_decay_factor,
                 decay_steps, warmup_steps, use_tpu=False):
    """Builds an SGD update operation.

    The learning rate is piecewise constant over `decay_steps`, multiplied
    by `lr_decay_factor` at each boundary, and then passed through
    `apply_warmup_lr`.

    Args:
        loss: Tensor to minimize.
        initial_learning_rate: Learning rate before any decay.
        momentum: Momentum of the `MomentumOptimizer`.
        lr_decay_factor: Multiplier applied at every decay boundary.
        decay_steps: Step boundaries of the piecewise schedule.
        warmup_steps: Forwarded to `apply_warmup_lr`.
        use_tpu: If true, wraps the optimizer in `CrossShardOptimizer`.

    Returns:
        A grouped op running the optimizer step, the ops in the UPDATE_OPS
        collection and a global-step increment.
    """
    global_step = tf.train.get_or_create_global_step()

    # One learning-rate value per interval: len(decay_steps) + 1 in total.
    num_intervals = len(decay_steps) + 1
    lr_values = [
        initial_learning_rate * (lr_decay_factor**i)
        for i in range(num_intervals)
    ]
    lr = tf.train.piecewise_constant(global_step, decay_steps, lr_values)
    lr = apply_warmup_lr(global_step, lr, initial_learning_rate, warmup_steps)

    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=momentum)
    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    # minimize() is called without global_step, so the step is incremented
    # by an explicit op below.
    train_op = optimizer.minimize(loss)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    global_step_inc_op = tf.assign_add(global_step, 1)
    return tf.group([train_op, update_ops, global_step_inc_op])
def testStop(self):
    """The dummy hook collects metrics and eventually requests a stop.

    After enough steps the session must refuse further `run` calls with a
    RuntimeError mentioning "after should_stop requested".
    """
    global_step = tf.train.create_global_step()
    tf.summary.scalar("global_step", global_step)
    incr_global_step = tf.assign_add(global_step, 1)

    ckpt_dir = self.ckpt_dir("stop")
    dummy = DummyHook(ckpt_dir, every_n_steps=10)

    with self.sess(dummy, ckpt_dir) as sess:

        def advance(num_steps):
            # Increment the global step `num_steps` times.
            for _ in range(num_steps):
                sess.run(incr_global_step)

        advance(20)

        # Summary files should now have 2 global step values in them
        self.flush()

        # Run for 10 more so that the hook gets triggered again
        advance(10)

        # Check that the metrics have actually been collected.
        self.assertTrue("" in dummy.test_metrics)
        metrics = dummy.test_metrics[""]
        self.assertTrue("global_step_1" in metrics)
        steps, vals = metrics["global_step_1"]
        self.assertTrue(len(steps) == len(vals))
        self.assertTrue(len(steps) >= 2)

        # Run for 10 more so that the hook triggers stoppage
        advance(10)

        with self.assertRaisesRegexp(RuntimeError,
                                     "after should_stop requested"):
            sess.run(incr_global_step)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Applying gradients and tune hyperparams with YellowFin.

    Args:
        grads_and_vars: List of (gradient, variable) pairs as returned by
            compute_gradients(). Pairs whose gradient is None are dropped.
        global_step: Optional Variable to increment by one after the
            variables have been updated.
        name: Optional name for the returned operation. Default to the
            name passed to the Optimizer constructor.

    Returns:
        (A group of operations)
            Variable Update with Momentum ops,
            YellowFin ops(Curvature, Variance, Distance) ops,
            SingleStep and lr_mu tuning ops,
            Step increment ops.
    """
    usable_pairs = [(g, t) for g, t in grads_and_vars if g is not None]
    self._grad, self._vars = zip(*usable_pairs)

    # Var update with Momentum.
    with tf.variable_scope("apply_updates"):
        # Gradient clipping is applied only when a threshold is configured;
        # the momentum update itself is the same either way.
        if self._clip_thresh_var is not None:
            self._grad, _ = tf.clip_by_global_norm(
                self._grad, self._clip_thresh_var)
        apply_grad_op = self._momentum_optimizer.apply_gradients(
            zip(self._grad, self._vars),
            global_step=global_step,
            name=name)

    # Begin lr and mu tuning.
    with tf.variable_scope("prepare_yellowFin_variables"):
        # the dependencies ideally only need to be after clip is done,
        # i.e. depends on self._grads. However, the control_dependencies
        # does not support indexed slice for sparse gradients.
        # The alternative dependencies here might be slightly slower due
        # to less parallelization.
        with tf.control_dependencies([apply_grad_op]):
            prepare_variables_op = self._prepare_variables()

    with tf.variable_scope("yellowfin"):
        with tf.control_dependencies([prepare_variables_op]):
            yellowfin_op = self._yellowfin()

    # Update YellowFin step variable.
    with tf.control_dependencies([yellowfin_op]):
        self._increment_step_op = tf.assign_add(self._step, 1).op

    return tf.group(
        apply_grad_op,
        prepare_variables_op,
        yellowfin_op,
        self._increment_step_op)
def _update_weights(self):
    """Build an op that moves every weight by `step_size * dw`.

    The step size is read from the `self._step_size_placeh` placeholder;
    weights and `dws` are paired positionally.

    Returns:
        A grouped op running one `assign_add` per weight.
    """
    step_ops = [
        tf.assign_add(weight, self._step_size_placeh * delta)
        for weight, delta in zip(self._weights, self._dws)
    ]
    return tf.group(step_ops)
def _build(self, x, weights=None):
    """Update weighted running statistics with `x` and return them.

    Three non-trainable variables are maintained, reduced over `self._axis`:
    the total weight, the running mean and the running sum of squared
    deviations ("M2"). They are updated in that order, each update waiting
    for the previous one through control dependencies.

    Args:
        x: Input tensor.
        weights: Optional weights; defaults to ones and is broadcast to the
            static shape of `x` when the shapes differ.

    Returns:
        Tuple `(mean, m2 / (total - ddof), total)` evaluated after all
        updates.
    """
    if weights is None:
        weights = tf.ones_like(x)
    x_shape = x.get_shape().as_list()
    if weights.get_shape().as_list() != x_shape:
        weights = tf.broadcast_to(weights, x.get_shape().as_list())

    sum_weights = tf.reduce_sum(weights, axis=self._axis)
    stat_shape = sum_weights.get_shape().as_list()

    def _stat(var_name, dtype):
        # Zero-initialised, non-trainable accumulator of the reduced shape.
        return tf.get_variable(
            var_name,
            shape=stat_shape,
            dtype=dtype,
            initializer=tf.zeros_initializer(),
            trainable=False,
        )

    total = _stat("total", weights.dtype)
    mean = _stat("mean", x.dtype)
    m2 = _stat("M2", x.dtype)

    total_update = tf.assign_add(total, sum_weights)

    with tf.control_dependencies([total_update]):
        # Deviation from the mean as it was before this update.
        delta = (x - mean) * weights
        mean_increment = tf.reduce_sum(delta, axis=self._axis) / total
        mean_update = tf.assign_add(mean, mean_increment)

    with tf.control_dependencies([mean_update]):
        # Deviation from the mean after this update.
        delta2 = x - mean
        m2_increment = tf.reduce_sum(delta * delta2, axis=self._axis)
        m2_update = tf.assign_add(m2, m2_increment)

    with tf.control_dependencies([m2_update]):
        return tf.identity(mean), m2 / (total - self._ddof), tf.identity(total)
def _build_model(self, weights=None):
    """
    Builds TensorFlow model.

    Creates the placeholders, the hidden/visible activation ops, the random
    variables used for sampling, and the `update_W` / `update_b` /
    `update_c` ops computed from `self.contrastive_divergence_iter` Gibbs
    steps.
    :param weights: forwarded to `_initialize_weights`.
    :return:
    """
    # initialize weights and biases
    self._initialize_weights(weights)
    activation = self._activation_function_class

    # TensorFlow operations
    self.visible_units_placeholder = tf.placeholder(
        tf.float32, shape=[None, self.n_visible_units])
    self.compute_hidden_units_op = activation(
        tf.transpose(
            tf.matmul(self.W, tf.transpose(self.visible_units_placeholder)))
        + self.c)
    self.hidden_units_placeholder = tf.placeholder(
        tf.float32, shape=[None, self.n_hidden_units])
    self.compute_visible_units_op = activation(
        tf.matmul(self.hidden_units_placeholder, self.W) + self.b)
    self.random_uniform_values = tf.Variable(
        tf.random_uniform([self.batch_size, self.n_hidden_units]))
    # A hidden unit is "on" when its activation exceeds the uniform sample.
    sample_hidden_units_op = tf.to_float(
        self.random_uniform_values < self.compute_hidden_units_op)
    self.random_variables = [self.random_uniform_values]

    # Positive gradient
    # Outer product. N is the batch size length.
    # From http://stackoverflow.com/questions/35213787/tensorflow-batch-outer-product
    positive_gradient_op = tf.matmul(
        tf.expand_dims(sample_hidden_units_op, 2),  # [N, U, 1]
        tf.expand_dims(self.visible_units_placeholder, 1))  # [N, 1, V]

    # Negative gradient
    # Gibbs sampling
    hidden_sample = sample_hidden_units_op
    for _ in range(self.contrastive_divergence_iter):
        compute_visible_units_op = activation(
            tf.matmul(hidden_sample, self.W) + self.b)
        hidden_activation = activation(
            tf.transpose(
                tf.matmul(self.W, tf.transpose(compute_visible_units_op)))
            + self.c)
        # Each Gibbs step gets its own random variable, tracked on the
        # instance alongside the first one.
        step_uniform_values = tf.Variable(
            tf.random_uniform([self.batch_size, self.n_hidden_units]))
        hidden_sample = tf.to_float(step_uniform_values < hidden_activation)
        self.random_variables.append(step_uniform_values)

    negative_gradient_op = tf.matmul(
        tf.expand_dims(hidden_sample, 2),  # [N, U, 1]
        tf.expand_dims(compute_visible_units_op, 1))  # [N, 1, V]

    compute_delta_W = tf.reduce_mean(
        positive_gradient_op - negative_gradient_op, 0)
    compute_delta_b = tf.reduce_mean(
        self.visible_units_placeholder - compute_visible_units_op, 0)
    compute_delta_c = tf.reduce_mean(
        sample_hidden_units_op - hidden_sample, 0)

    self.update_W = tf.assign_add(
        self.W, self.learning_rate * compute_delta_W)
    self.update_b = tf.assign_add(
        self.b, self.learning_rate * compute_delta_b)
    self.update_c = tf.assign_add(
        self.c, self.learning_rate * compute_delta_c)