def streaming_tp_fp_arrays(num_gbboxes, tp, fp, metrics_collections=None, updates_collections=None, name=None): """Streaming computation of True and False Positive arrays. """ with variable_scope.variable_scope(name, 'streaming_tp_fp', [num_gbboxes, tp, fp]): num_gbboxes = tf.cast(num_gbboxes, tf.int32) tp = tf.cast(tp, tf.bool) fp = tf.cast(fp, tf.bool) # Reshape TP and FP tensors and clean away 0 class values. tp = tf.reshape(tp, [-1]) fp = tf.reshape(fp, [-1]) # Local variables accumlating information over batches. v_num_objects = _create_local('v_num_gbboxes', shape=[], dtype=tf.int32) v_tp = _create_local('v_tp', shape=[0, ], dtype=tf.bool) v_fp = _create_local('v_fp', shape=[0, ], dtype=tf.bool) # Update operations. num_objects_op = state_ops.assign_add(v_num_objects, tf.reduce_sum(num_gbboxes)) tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp], axis=0), validate_shape=False) fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp], axis=0), validate_shape=False) # Value and update ops. val = (v_num_objects, v_tp, v_fp) with ops.control_dependencies([num_objects_op, tp_op, fp_op]): update_op = (num_objects_op, tp_op, fp_op) return val, update_op
def _Update_global_variables(): global_norm = [] # a = a / t for g in grad_vars: global_norm.append(state_ops.assign(g, g / self._period)) # apply with ops.control_dependencies(global_norm): apply_global_op = self._opt.apply_gradients( zip(grad_vars, global_center_vars)) # pull with ops.control_dependencies([apply_global_op]): update_ops = [] if global_step: with ops.colocate_with(global_step): update_ops.append(state_ops.assign_add(global_step, 1)) for lvar in local_vars: g_val = self._global_map[lvar].read_value() update_ops.append(state_ops.assign(lvar, g_val)) for grad_var in grad_vars: update_ops.append( state_ops.assign(grad_var, array_ops.zeros_like(grad_var))) variable_update = control_flow_ops.group(*(update_ops)) return variable_update
def _apply_sparse(self, grad, var): beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_scaled_g_values = grad.values * (1 - beta1_t) m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking) m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values, use_locking=self._use_locking) # u_t = max(beta_2 * u_{t-1}, L1(g_t)) # theta_t = theta_{t-1} - alpha/(1-beta_1).m_t/u_t v = self.get_slot(var, "v") g_abs_values = tensorflow.abs(g_t) v_t = state_ops.assign(v, v * beta_2, use_locking = self._use_locking) v_t = state_ops.assign_max(v_t, grad.indices, g_abs_values, use_locking=self._use_locking) var_update = state_ops.assign_sub(var, lr*m_t/(v_t*(1 - beta_1)), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t])
def testMultiplyInverseAgainstExplicit(self): with ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) input_dim, output_dim = 3, 2 inputs = array_ops.zeros([32, input_dim]) outputs = array_ops.zeros([32, output_dim]) params = array_ops.zeros([input_dim, output_dim]) block = fb.FullyConnectedKFACBasicFB( lc.LayerCollection(), inputs, outputs, has_bias=False) grads = outputs**2 damping = 0. # This test is only valid without damping. block.instantiate_factors((grads,), damping) sess.run(state_ops.assign(block._input_factor._cov, _make_psd(3))) sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2))) sess.run(block._input_factor.make_inverse_update_ops()) sess.run(block._output_factor.make_inverse_update_ops()) v_flat = np.arange(6, dtype=np.float32) vector = utils.column_to_tensors(params, array_ops.constant(v_flat)) output = block.multiply_inverse(vector) output_flat = sess.run(utils.tensors_to_column(output)).ravel() full = sess.run(block.full_fisher_block()) explicit = np.dot(np.linalg.inv(full + damping * np.eye(6)), v_flat) self.assertAllClose(output_flat, explicit)
def _f(): # Note that there is a race condition here, so we do a best effort # updates here. We reset update_in_steps first so that other workers # don't duplicate the updates. Also we update cluster_center_vars # before resetting total_counts to avoid large updates to # cluster_centers_updated based on partially updated # cluster_center_vars. with ops.control_dependencies([ state_ops.assign(update_in_steps, self._mini_batch_steps_per_iteration - 1) ]): with ops.colocate_with( cluster_centers_updated, ignore_existing=True): if self._distance_metric == COSINE_DISTANCE: cluster_centers = nn_impl.l2_normalize( cluster_centers_updated, dim=1) else: cluster_centers = cluster_centers_updated with ops.colocate_with(cluster_centers_var, ignore_existing=True): with ops.control_dependencies( [state_ops.assign(cluster_centers_var, cluster_centers)]): with ops.colocate_with(None, ignore_existing=True): with ops.control_dependencies([ state_ops.assign(total_counts, array_ops.zeros_like(total_counts)) ]): return array_ops.identity(update_in_steps)
def _testDefaultGraphInThread(self, constructed_event, continue_event, i): with session.Session() as s: self.assertEqual(ops.get_default_graph(), s.graph) a = constant_op.constant(1.0, shape=[1, 2]) b = constant_op.constant(2.0, shape=[2, 3]) c = math_ops.matmul(a, b) v = variables.Variable(c, name='var_%d' % i) # Block here until all threads have constructed their graph. constructed_event.set() continue_event.wait() assign_c_to_v = state_ops.assign(v, c) v.initializer.run() assign_c_to_v.eval() v_val = v.eval() self.assertAllEqual([[4.0, 4.0, 4.0]], v_val) d = constant_op.constant(3.0, shape=[2, 3]) e = math_ops.matmul(a, d) assign_e_to_v = state_ops.assign(v, e) e_val = e.eval() self.assertAllEqual([[6.0, 6.0, 6.0]], e_val) v_val = v.eval() self.assertAllEqual([[4.0, 4.0, 4.0]], v_val) s.run(assign_e_to_v) v_val = v.eval() self.assertAllEqual([[6.0, 6.0, 6.0]], v_val) self.assertEqual(ops.get_default_graph(), s.graph)
def testMultiplyInverseAgainstExplicit(self): with ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) params = array_ops.zeros((2, 2, 2, 2)) inputs = array_ops.zeros((2, 2, 2, 2)) outputs = array_ops.zeros((2, 2, 2, 2)) block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1), 'SAME') block.register_additional_minibatch(inputs, outputs) grads = outputs**2 damping = 0. # This test is only valid without damping. block.instantiate_factors(([grads],), damping) sess.run(state_ops.assign(block._input_factor._cov, _make_psd(8))) sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2))) sess.run(block._input_factor.make_inverse_update_ops()) sess.run(block._output_factor.make_inverse_update_ops()) v_flat = np.arange(16, dtype=np.float32) vector = utils.column_to_tensors(params, array_ops.constant(v_flat)) output = block.multiply_inverse(vector) output_flat = sess.run(utils.tensors_to_column(output)).ravel() full = sess.run(block.full_fisher_block()) explicit = np.dot(np.linalg.inv(full + damping * np.eye(16)), v_flat) self.assertAllClose(output_flat, explicit)
def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [state_ops.assign_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr = lr * ( # pylint: disable=g-no-augmented-assignment 1. / (1. + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) # momentum shapes = [K.int_shape(p) for p in params] moments = [K.zeros(shape) for shape in shapes] self.weights = [self.iterations] + moments for p, g, m in zip(params, grads, moments): v = self.momentum * m - lr * g # velocity self.updates.append(state_ops.assign(m, v)) if self.nesterov: new_p = p + self.momentum * v - lr * g else: new_p = p + v # Apply constraints. if getattr(p, 'constraint', None) is not None: new_p = p.constraint(new_p) self.updates.append(state_ops.assign(p, new_p)) return self.updates
def test_fn(a): state_ops.assign(a, a + 1) b = a + 1 state_ops.assign(a, a + 1) c = b + 1 d = c + 1 return d
def get_placements(self, *args, **kwargs): num_children = self.hparams.num_children with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)): actions_cache = variable_scope.get_local_variable( "actions_cache", initializer=init_ops.zeros_initializer, dtype=dtypes.int32, shape=[num_children, self.num_groups], trainable=False) x = array_ops.tile(self.seq2seq_input_layer, [num_children, 1, 1]) last_c, last_h, attn_mem = self.encode(x) actions, log_probs = {}, {} actions["sample"], log_probs["sample"] = ( self.decode( x, last_c, last_h, attn_mem, mode="sample")) actions["target"], log_probs["target"] = ( self.decode( x, last_c, last_h, attn_mem, mode="target", y=actions_cache)) actions["greedy"], log_probs["greedy"] = ( self.decode( x, last_c, last_h, attn_mem, mode="greedy")) actions["sample"] = control_flow_ops.cond( self.global_step < self.hparams.stop_sampling, lambda: state_ops.assign(actions_cache, actions["sample"]), lambda: state_ops.assign(actions_cache, actions["target"])) self.actions_cache = actions_cache return actions, log_probs
def get_updates(self, loss, params): grads = self.get_gradients(loss, params) shapes = [K.int_shape(p) for p in params] accumulators = [K.zeros(shape) for shape in shapes] delta_accumulators = [K.zeros(shape) for shape in shapes] self.weights = accumulators + delta_accumulators self.updates = [state_ops.assign_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr = lr * ( # pylint: disable=g-no-augmented-assignment 1. / (1. + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): # update accumulator new_a = self.rho * a + (1. - self.rho) * math_ops.square(g) self.updates.append(state_ops.assign(a, new_a)) # use the new accumulator and the *old* delta_accumulator update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon) new_p = p - lr * update # Apply constraints. if getattr(p, 'constraint', None) is not None: new_p = p.constraint(new_p) self.updates.append(state_ops.assign(p, new_p)) # update delta_accumulator new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update) self.updates.append(state_ops.assign(d_a, new_d_a)) return self.updates
def testIsVariableInitialized(self): for use_gpu in [True, False]: with self.test_session(use_gpu=use_gpu): v0 = state_ops.variable_op([1, 2], dtypes.float32) self.assertEqual(False, variables.is_variable_initialized(v0).eval()) state_ops.assign(v0, [[2.0, 3.0]]).eval() self.assertEqual(True, variables.is_variable_initialized(v0).eval())
def test_stop_based_on_num_step(self): h = basic_session_run_hooks.StopAtStepHook(num_steps=10) with ops.Graph().as_default(): global_step = variables.get_or_create_global_step() no_op = control_flow_ops.no_op() h.begin() with session_lib.Session() as sess: mon_sess = monitored_session._HookedSession(sess, [h]) sess.run(state_ops.assign(global_step, 5)) h.after_create_session(sess, None) mon_sess.run(no_op) self.assertFalse(mon_sess.should_stop()) sess.run(state_ops.assign(global_step, 13)) mon_sess.run(no_op) self.assertFalse(mon_sess.should_stop()) sess.run(state_ops.assign(global_step, 14)) mon_sess.run(no_op) self.assertFalse(mon_sess.should_stop()) sess.run(state_ops.assign(global_step, 15)) mon_sess.run(no_op) self.assertTrue(mon_sess.should_stop()) sess.run(state_ops.assign(global_step, 16)) mon_sess._should_stop = False mon_sess.run(no_op) self.assertTrue(mon_sess.should_stop())
def get_updates(self, loss, params): grads = self.get_gradients(loss, params) shapes = [K.int_shape(p) for p in params] accumulators = [K.zeros(shape) for shape in shapes] self.weights = accumulators self.updates = [state_ops.assign_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr = lr * ( # pylint: disable=g-no-augmented-assignment 1. / (1. + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) for p, g, a in zip(params, grads, accumulators): new_a = a + math_ops.square(g) # update accumulator self.updates.append(state_ops.assign(a, new_a)) new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon) # Apply constraints. if getattr(p, 'constraint', None) is not None: new_p = p.constraint(new_p) self.updates.append(state_ops.assign(p, new_p)) return self.updates
def _apply_dense(self, grad, var): beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_scaled_g_values = grad * (1 - beta1_t) m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking) # amsgrad vhat = self.get_slot(var, "vhat") vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat)) v_sqrt = math_ops.sqrt(vhat_t) var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
def _resource_apply_sparse(self, grad, var, indices): var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) local_step = math_ops.cast(self.iterations + 1, var_dtype) beta_1_power = math_ops.pow(beta_1_t, local_step) beta_2_power = math_ops.pow(beta_2_t, local_step) epsilon_t = self._get_hyper('epsilon', var_dtype) lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, 'm') m_scaled_g_values = grad * (1 - beta_1_t) m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) # m_bar = (1 - beta1) * g_t + beta1 * m_t m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, 'v') v_scaled_g_values = (grad * grad) * (1 - beta_2_t) v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) v_t_slice = array_ops.gather(v_t, indices) v_sqrt = math_ops.sqrt(v_t_slice) var_update = self._resource_scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t)) return control_flow_ops.group(*[var_update, m_bar, v_t])
def testDeferredSlotRestoration(self): checkpoint_directory = self.get_temp_dir() root = trackable_utils.Checkpoint() root.var = trackable_utils.add_variable( root, name="var", initializer=0.) optimizer = adam.AdamOptimizer(0.1) if context.executing_eagerly(): optimizer.minimize(root.var.read_value) else: train_op = optimizer.minimize(root.var) # Note that `optimizer` has not been added as a dependency of # `root`. Create a one-off grouping so that slot variables for `root.var` # get initialized too. self.evaluate(trackable_utils.gather_initializers( trackable_utils.Checkpoint(root=root, optimizer=optimizer))) self.evaluate(train_op) self.evaluate(state_ops.assign(root.var, 12.)) no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots")) root.optimizer = optimizer self.evaluate(state_ops.assign(root.var, 13.)) self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var), 14.)) slots_path = root.save(os.path.join(checkpoint_directory, "with_slots")) new_root = trackable_utils.Checkpoint() # Load the slot-containing checkpoint (deferred), then immediately overwrite # the non-slot variable (also deferred). slot_status = new_root.restore(slots_path) no_slot_status = new_root.restore(no_slots_path) with self.assertRaises(AssertionError): no_slot_status.assert_consumed() new_root.var = trackable_utils.add_variable( new_root, name="var", shape=[]) no_slot_status.assert_consumed() no_slot_status.run_restore_ops() self.assertEqual(12., self.evaluate(new_root.var)) new_root.optimizer = adam.AdamOptimizer(0.1) slot_status.assert_existing_objects_matched() with self.assertRaisesRegexp(AssertionError, "beta1_power"): slot_status.assert_consumed() self.assertEqual(12., self.evaluate(new_root.var)) if context.executing_eagerly(): # Slot variables are only created with restoring initializers when # executing eagerly. self.assertEqual(14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) else: self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var), None) if context.executing_eagerly(): new_root.optimizer.minimize(new_root.var.read_value) else: train_op = new_root.optimizer.minimize(new_root.var) # The slot variable now exists; restore() didn't create it, but we should # now have a restore op for it. slot_status.run_restore_ops() self.assertEqual(14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) self.evaluate(train_op) slot_status.assert_consumed()
def _apply_sparse_shared(self, grad, var, indices, scatter_add): beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # m_t = beta1 * m + (1 - beta1) * g_t m = self.get_slot(var, "m") m_scaled_g_values = grad * (1 - beta1_t) m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = scatter_add(m, indices, m_scaled_g_values) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = scatter_add(v, indices, v_scaled_g_values) v_sqrt = math_ops.sqrt(v_t) var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t])
def test_fn(a): with ops.name_scope('foo'): state_ops.assign(a, a + 1) b = a + 1 # state_ops.assign(a, b + 1) c = b + 1 d = c + 1 return d
def testSaveRestore(self): network = MyNetwork() optimizer = adam.AdamOptimizer(0.001) root_checkpointable = checkpointable_utils.Checkpoint( optimizer=optimizer, network=network) input_value = constant_op.constant([[3.]]) if context.in_eager_mode(): optimizer.minimize( lambda: network(input_value)) else: train_op = optimizer.minimize(network(input_value)) # TODO(allenl): Make initialization more pleasant when graph building. root_checkpointable.save_counter # pylint: disable=pointless-statement self.evaluate(checkpointable_utils.gather_initializers( root_checkpointable)) self.evaluate(train_op) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate(state_ops.assign(network._named_dense.variables[1], [42.])) m_bias_slot = optimizer.get_slot(network._named_dense.variables[1], "m") self.evaluate(state_ops.assign(m_bias_slot, [1.5])) save_path = root_checkpointable.save(file_prefix=prefix) self.evaluate(state_ops.assign(network._named_dense.variables[1], [43.])) self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3)) optimizer_variables = self.evaluate(optimizer.variables()) self.evaluate(state_ops.assign(m_bias_slot, [-2.])) # Immediate restoration status = root_checkpointable.restore(save_path=save_path).assert_consumed() status.run_restore_ops() self.assertAllEqual([42.], self.evaluate(network._named_dense.variables[1])) self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter)) self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) if context.in_graph_mode(): return # Restore-on-create is only supported when executing eagerly on_create_network = MyNetwork() on_create_optimizer = adam.AdamOptimizer(0.001) on_create_root = checkpointable_utils.Checkpoint( optimizer=on_create_optimizer, network=on_create_network) # Deferred restoration status = on_create_root.restore(save_path=save_path) on_create_network(constant_op.constant([[3.]])) # create variables self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) self.assertAllEqual([42.], self.evaluate( on_create_network._named_dense.variables[1])) on_create_m_bias_slot = on_create_optimizer.get_slot( on_create_network._named_dense.variables[1], "m") # Optimizer slot variables are created when the original variable is # restored. self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) self.assertAllEqual(optimizer_variables[2:], self.evaluate(on_create_optimizer.variables())) on_create_optimizer._create_slots( [resource_variable_ops.ResourceVariable([1.])]) status.assert_consumed() beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators() self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power)) self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
def testDeferredSlotRestoration(self): checkpoint_directory = self.get_temp_dir() root = checkpointable.Checkpointable() root.var = checkpointable_utils.add_variable( root, name="var", initializer=0.) optimizer = CheckpointableAdam(0.1) if context.in_graph_mode(): train_op = optimizer.minimize(root.var) self.evaluate(variables.global_variables_initializer()) self.evaluate(train_op) else: optimizer.minimize(root.var.read_value) self.evaluate(state_ops.assign(root.var, 12.)) no_slots_path = checkpointable_utils.Saver(root).save( os.path.join(checkpoint_directory, "no_slots")) root.optimizer = optimizer self.evaluate(state_ops.assign(root.var, 13.)) self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var), 14.)) slots_path = checkpointable_utils.Saver(root).save( os.path.join(checkpoint_directory, "with_slots")) new_root = checkpointable.Checkpointable() # Load the slot-containing checkpoint (deferred), then immediately overwrite # the non-slot variable (also deferred). slot_status = checkpointable_utils.Saver(new_root).restore(slots_path) no_slot_status = checkpointable_utils.Saver(new_root).restore(no_slots_path) with self.assertRaises(AssertionError): no_slot_status.assert_consumed() new_root.var = checkpointable_utils.add_variable( new_root, name="var", shape=[]) no_slot_status.assert_consumed() no_slot_status.run_restore_ops() self.assertEqual(12., self.evaluate(new_root.var)) new_root.optimizer = CheckpointableAdam(0.1) with self.assertRaisesRegexp(AssertionError, "beta1_power"): slot_status.assert_consumed() self.assertEqual(12., self.evaluate(new_root.var)) if context.in_eager_mode(): # Slot variables are only created with restoring initializers when # executing eagerly. self.assertEqual(14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) else: self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var), None) if context.in_graph_mode(): train_op = new_root.optimizer.minimize(new_root.var) # The slot variable now exists; restore() didn't create it, but we should # now have a restore op for it. slot_status.run_restore_ops() self.assertEqual(14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) self.evaluate(train_op) else: new_root.optimizer.minimize(new_root.var.read_value) slot_status.assert_consumed()
def test_resource_variable(self): """Tests that resource variable usage is allowed.""" a = variable_scope.get_variable( name='variable_a', shape=(1), use_resource=True) context = self.create_test_xla_compile_context() context.Enter() state_ops.assign(a, a + 1) context.Exit()
def _resource_apply_sparse(self, grad, var, indices): var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) rms = self.get_slot(var, "rms") rho = self._get_hyper("rho", var_dtype) momentum = self._get_hyper("momentum", var_dtype) epsilon = self._get_hyper("epsilon", var_dtype) if self._momentum: mom = self.get_slot(var, "momentum") if self.centered: mg = self.get_slot(var, "mg") return training_ops.resource_sparse_apply_centered_rms_prop( var.handle, mg.handle, rms.handle, mom.handle, lr_t, rho, momentum, epsilon, grad, indices, use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_rms_prop( var.handle, rms.handle, mom.handle, lr_t, rho, momentum, epsilon, grad, indices, use_locking=self._use_locking) else: rms_scaled_g_values = (grad * grad) * (1. - rho) rms_t = state_ops.assign(rms, rms * rho, use_locking=self._use_locking) with ops.control_dependencies([rms_t]): rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values) rms_slice = array_ops.gather(rms_t, indices) denom_slice = rms_slice if self.centered: mg = self.get_slot(var, "mg") mg_scaled_g_values = grad * (1. - rho) mg_t = state_ops.assign(mg, mg * rho, use_locking=self._use_locking) with ops.control_dependencies([mg_t]): mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values) mg_slice = array_ops.gather(mg_t, indices) denom_slice = rms_slice - math_ops.square(mg_slice) var_update = self._resource_scatter_add( var, indices, -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon)) if self.centered: return control_flow_ops.group(*[var_update, rms_t, mg_t]) return control_flow_ops.group(*[var_update, rms_t])
def testDuplicateTemporaryVariable(self): with test_util.use_gpu(): var1 = gen_state_ops.temporary_variable( [1, 2], dtypes.float32, var_name="dup") var1 = state_ops.assign(var1, [[1.0, 2.0]]) var2 = gen_state_ops.temporary_variable( [1, 2], dtypes.float32, var_name="dup") var2 = state_ops.assign(var2, [[3.0, 4.0]]) final = var1 + var2 with self.assertRaises(errors.AlreadyExistsError): self.evaluate(final)
def _cached_copy(self, var, name, pass_through=False): """Helper function to create a worker cached copy of a Variable. This assigns the var (either a single Variable or a list of Variables) to local transient cache Variable(s). Note that if var is a list of Variables, the assignment is done sequentially to minimize the memory overheads. Also note that if pass_through is set to True, this does not create new Variables but simply return the input back. Args: var: A Variable or a list of Variables to cache. name: name of cached Variable. pass_through: when set to True, this simply pass through the var back through identity operator and does not actually creates a cache. Returns: Tuple consisting of following three entries: cache: the new transient Variable or list of transient Variables corresponding one-to-one with var. cache_init: op to initialize the Variable or the list of Variables. cache_reset: op to reset the Variable or the list of Variables to some default value. """ if var is None: return None, None, None elif pass_through: cache = var cache_init = control_flow_ops.no_op() cache_reset = control_flow_ops.no_op() elif isinstance(var, variables.Variable): cache = WALSModel._transient_var(name=name) with ops.colocate_with(cache): cache_init = state_ops.assign(cache, var, validate_shape=False) cache_reset = state_ops.assign(cache, 1.0, validate_shape=False) else: assert isinstance(var, list) assert var cache = [ WALSModel._transient_var(name="%s_shard_%d" % (name, i)) for i in xrange(len(var)) ] reset_ops = [] for i, c in enumerate(cache): with ops.colocate_with(c): if i == 0: cache_init = state_ops.assign(c, var[i], validate_shape=False) else: with ops.control_dependencies([cache_init]): cache_init = state_ops.assign(c, var[i], validate_shape=False) reset_ops.append(state_ops.assign(c, 1.0, validate_shape=False)) cache_reset = control_flow_ops.group(*reset_ops) return cache, cache_init, cache_reset
def create_axis_ops(sp_input, num_items, update_fn, axis_name): """Creates book-keeping and training ops for a given axis. Args: sp_input: A SparseTensor corresponding to the row or column batch. num_items: An integer, the total number of items of this axis. update_fn: A function that takes one argument (`sp_input`), and that returns a tuple of * new_factors: A flot Tensor of the factor values after update. * update_op: a TensorFlow op which updates the factors. * loss: A float Tensor, the unregularized loss. * reg_loss: A float Tensor, the regularization loss. * sum_weights: A float Tensor, the sum of factor weights. axis_name: A string that specifies the name of the axis. Returns: A tuple consisting of: * reset_processed_items_op: A TensorFlow op, to be run before the beginning of any sweep. It marks all items as not-processed. * axis_train_op: A Tensorflow op, to be run during this axis' sweeps. """ processed_items_init = array_ops.fill(dims=[num_items], value=False) with ops.colocate_with(processed_items_init): processed_items = variable_scope.variable( processed_items_init, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="processed_" + axis_name) reset_processed_items_op = state_ops.assign( processed_items, processed_items_init, name="reset_processed_" + axis_name) _, update_op, loss, reg, sum_weights = update_fn(sp_input) input_indices = sp_input.indices[:, 0] with ops.control_dependencies([ update_op, state_ops.assign(loss_var, loss + reg), state_ops.assign(rwse_var, math_ops.sqrt(loss / sum_weights))]): with ops.colocate_with(processed_items): update_processed_items = state_ops.scatter_update( processed_items, input_indices, array_ops.ones_like(input_indices, dtype=dtypes.bool), name="update_processed_{}_indices".format(axis_name)) with ops.control_dependencies([update_processed_items]): is_sweep_done = math_ops.reduce_all(processed_items) axis_train_op = control_flow_ops.group( global_step_incr_op, state_ops.assign(is_sweep_done_var, is_sweep_done), state_ops.assign_add( completed_sweeps_var, math_ops.cast(is_sweep_done, dtypes.int32)), name="{}_sweep_train_op".format(axis_name)) return reset_processed_items_op, axis_train_op
def test_non_resource_variable_error(self): """Tests that non-resource variable usage is disallowed.""" a = variable_scope.get_variable( name='variable_a', shape=(1), use_resource=False) context = self.create_test_xla_compile_context() context.Enter() with self.assertRaisesRegexp( NotImplementedError, 'Non-resource Variables are not supported inside ' r'XLA computations \(operator name: Assign\)'): state_ops.assign(a, a + 1) context.Exit()
def _Update(variable, gradients, accum, linear, base_lr, lr_power, l1, l2): """Update "variable", "accum", "linear" based on "gradients". Some notations here: "variable" as W, "accum" as N, "linear" as Z, "gradients" as G, N(t) means "accum" at t-step. Assuming lr_power = -0.5 which means using adagrad learning rate. "accum" updates as: N = N + G^2 "linear" updates as: Z = Z + G - W * (sqrt(N(t)) - sqrt(N(t-1)))/base_lr REQUIRES: Dimensionality of variable, gradients, accum and linear must be same. Args: variable: A Variable. gradients: A Tensor of same shape as 'variable'. accum: A Variable containing the sum of the squares of gradients. linear: A Variable containing approximation info. base_lr: A constant represents base learning rate. lr_power: A constant is used to adjust learning rate. l1: A constant represents l1 regularization strength. l2: A constant represents l2 regularization strength. Returns: A group op including three Assign ops: 1. Assign for "accum" 2. Assign for "linear" 3. Assign for "variable" """ dtype = variable.dtype.base_dtype base_lr = ops.convert_to_tensor(base_lr, dtype=dtype) lr_power = ops.convert_to_tensor(lr_power, dtype=dtype) l1 = ops.convert_to_tensor(l1, dtype=dtype) l2 = ops.convert_to_tensor(l2, dtype=dtype) # Compute the new accumulator sqr_grad = math_ops.square(gradients) accum_updated = sqr_grad + accum # Compute the new linear neg_lr_power = math_ops.neg(lr_power) sigma = math_ops.pow(accum_updated, neg_lr_power) - math_ops.pow( accum, neg_lr_power) sigma /= base_lr proximal_adjust = sigma * variable linear_updated = linear + gradients - proximal_adjust # Compute the "variable" variable_updated = _Compute(accum_updated, linear_updated, base_lr, lr_power, l1, l2) with ops.control_dependencies([sigma]): accum_update_op = state_ops.assign(accum, accum_updated) linear_update_op = state_ops.assign(linear, linear_updated) variable_update_op = state_ops.assign(variable, variable_updated) group_op = control_flow_ops.group(linear_update_op, accum_update_op, variable_update_op) return group_op
def setUp(self): self._num_rows = 5 self._num_cols = 7 self._train_op = control_flow_ops.no_op() self._row_prep_done = variables.Variable(False) self._col_prep_done = variables.Variable(False) self._init_done = variables.Variable(False) self._row_prep_ops = [state_ops.assign(self._row_prep_done, True)] self._col_prep_ops = [state_ops.assign(self._col_prep_done, True)] self._init_ops = [state_ops.assign(self._init_done, True)] self._input_row_indices_ph = array_ops.placeholder(dtypes.int64) self._input_col_indices_ph = array_ops.placeholder(dtypes.int64)
def series_start_updates(): # If this is the lowest-time chunk that we have seen so far, update # series start moments to reflect that. Note that these statistics are # "best effort", as there are race conditions in the update (however, # they should eventually converge if the start of the series is # presented enough times). mean, variance = nn.moments( values[min_time_batch, :self._starting_variance_window_size], axes=[0]) return control_flow_ops.group( state_ops.assign(statistics.series_start_moments.mean, mean), state_ops.assign(statistics.series_start_moments.variance, variance))
def _resource_apply_dense(self, grad, var): var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) local_step = math_ops.cast(self.iterations + 1, var_dtype) beta_1_power = math_ops.pow(beta_1_t, local_step) beta_2_power = math_ops.pow(beta_2_t, local_step) if self._initial_total_steps > 0: total_steps = self._get_hyper('total_steps', var_dtype) warmup_steps = total_steps * self._get_hyper( 'warmup_proportion', var_dtype) min_lr = self._get_hyper('min_lr', var_dtype) decay_steps = K.maximum(total_steps - warmup_steps, 1) decay_rate = (min_lr - lr_t) / decay_steps lr_t = tf.where( local_step <= warmup_steps, lr_t * (local_step / warmup_steps), lr_t + decay_rate * K.minimum(local_step - warmup_steps, decay_steps), ) sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0 sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power) m_t = state_ops.assign(m, beta_1_t * m + (1.0 - beta_1_t) * grad, use_locking=self._use_locking) m_corr_t = m_t / (1.0 - beta_1_power) v_t = state_ops.assign(v, beta_2_t * v + (1.0 - beta_2_t) * math_ops.square(grad), use_locking=self._use_locking) if self.amsgrad: vhat = self.get_slot(var, 'vhat') vhat_t = state_ops.assign(vhat, math_ops.maximum(vhat, v_t), use_locking=self._use_locking) v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power)) else: vhat_t = None v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power)) r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) / (sma_inf - 2.0) * sma_inf / sma_t) var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t) if self._initial_weight_decay > 0.0: var_t += self._get_hyper('weight_decay', var_dtype) * var var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking) updates = [var_update, m_t, v_t] if self.amsgrad: updates.append(vhat_t) return control_flow_ops.group(*updates)
def update_var_v1(x): v = variables.Variable(3, name='v') update_op = state_ops.assign(v, x).op return update_op
def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_device, var_dtype = var.device, var.dtype.base_dtype coefficients = ((apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype)) rms = self.get_slot(var, "rms") if self._momentum: mom = self.get_slot(var, "momentum") if self.centered: mg = self.get_slot(var, "mg") return training_ops.resource_sparse_apply_centered_rms_prop( var.handle, mg.handle, rms.handle, mom.handle, coefficients["lr_t"], coefficients["rho"], coefficients["momentum"], coefficients["epsilon"], grad, indices, use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_rms_prop( var.handle, rms.handle, mom.handle, coefficients["lr_t"], coefficients["rho"], coefficients["momentum"], coefficients["epsilon"], grad, indices, use_locking=self._use_locking) else: rms_scaled_g_values = (grad * grad) * coefficients["one_minus_rho"] rms_t = state_ops.assign(rms, rms * coefficients["rho"], use_locking=self._use_locking) with ops.control_dependencies([rms_t]): rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values) rms_slice = array_ops.gather(rms_t, indices) denom_slice = rms_slice if self.centered: mg = self.get_slot(var, "mg") mg_scaled_g_values = grad * coefficients["one_minus_rho"] mg_t = state_ops.assign(mg, mg * coefficients["rho"], use_locking=self._use_locking) with ops.control_dependencies([mg_t]): mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values) mg_slice = array_ops.gather(mg_t, indices) denom_slice = rms_slice - math_ops.square(mg_slice) var_update = self._resource_scatter_add( var, indices, coefficients["neg_lr_t"] * grad / (math_ops.sqrt(denom_slice) + coefficients["epsilon"])) if self.centered: return control_flow_ops.group(*[var_update, rms_t, mg_t]) return control_flow_ops.group(*[var_update, rms_t])
def build_controller(self): """RL optimization interface. Returns: ops: A dictionary holding handles of the model used for training. """ self._global_step = training_util.get_or_create_global_step() ops = {} ops["loss"] = 0 failing_signal = self.compute_reward(self.hparams.failing_signal) ctr = {} with tf_ops.name_scope("controller_{}".format(self.ctrl_id)): with variable_scope.variable_scope("controller_{}".format( self.ctrl_id)): ctr["reward"] = {"value": [], "ph": [], "update": []} ctr["ready"] = {"value": [], "ph": [], "update": []} ctr["best_reward"] = {"value": [], "update": []} for i in range(self.hparams.num_children): reward_value = variable_scope.get_local_variable( "reward_{}".format(i), initializer=0.0, dtype=dtypes.float32, trainable=False) reward_ph = array_ops.placeholder( dtypes.float32, shape=(), name="reward_ph_{}".format(i)) reward_update = state_ops.assign(reward_value, reward_ph, use_locking=True) ctr["reward"]["value"].append(reward_value) ctr["reward"]["ph"].append(reward_ph) ctr["reward"]["update"].append(reward_update) best_reward = variable_scope.get_local_variable( "best_reward_{}".format(i), initializer=failing_signal, dtype=dtypes.float32, trainable=False) ctr["best_reward"]["value"].append(best_reward) ctr["best_reward"]["update"].append( state_ops.assign( best_reward, math_ops.minimum(best_reward, reward_update))) ready_value = variable_scope.get_local_variable( "ready_{}".format(i), initializer=True, dtype=dtypes.bool, trainable=False) ready_ph = array_ops.placeholder( dtypes.bool, shape=(), name="ready_ph_{}".format(i)) ready_update = state_ops.assign(ready_value, ready_ph, use_locking=True) ctr["ready"]["value"].append(ready_value) ctr["ready"]["ph"].append(ready_ph) ctr["ready"]["update"].append(ready_update) ctr["grouping_y_preds"], ctr[ "grouping_log_probs"] = self.get_groupings() summary.histogram( "grouping_actions", array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0], [1, array_ops.shape(self.op_embeddings)[0]])) with variable_scope.variable_scope("controller_{}".format( self.ctrl_id)): ctr["baseline"] = variable_scope.get_local_variable( "baseline", initializer=failing_signal if self.hparams.start_with_failing_signal else 0.0, dtype=dtypes.float32, trainable=False) new_baseline = self.hparams.bl_dec * ctr["baseline"] + ( 1 - self.hparams.bl_dec) * math_ops.reduce_mean( ctr["reward"]["value"]) if not self.hparams.always_update_baseline: baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal) selected_reward = array_ops.boolean_mask( ctr["reward"]["value"], baseline_mask) selected_baseline = control_flow_ops.cond( math_ops.reduce_any(baseline_mask), lambda: math_ops.reduce_mean(selected_reward), lambda: constant_op.constant(0, dtype=dtypes.float32)) ctr["pos_reward"] = selected_baseline pos_ = math_ops.less( constant_op.constant(0, dtype=dtypes.float32), selected_baseline) selected_baseline = self.hparams.bl_dec * ctr["baseline"] + ( 1 - self.hparams.bl_dec) * selected_baseline selected_baseline = control_flow_ops.cond( pos_, lambda: selected_baseline, lambda: ctr["baseline"]) new_baseline = control_flow_ops.cond( math_ops.less(self.global_step, self.hparams.stop_updating_after_steps), lambda: new_baseline, lambda: selected_baseline) ctr["baseline_update"] = state_ops.assign(ctr["baseline"], new_baseline, use_locking=True) ctr["y_preds"], ctr["log_probs"] = self.get_placements() summary.histogram("actions", ctr["y_preds"]["sample"]) mask = math_ops.less(ctr["reward"]["value"], failing_signal) ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"] ctr["loss"] *= (ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"]) selected_loss = array_ops.boolean_mask(ctr["loss"], mask) selected_loss = control_flow_ops.cond( math_ops.reduce_any(mask), lambda: math_ops.reduce_mean(-selected_loss), lambda: constant_op.constant(0, dtype=dtypes.float32)) ctr["loss"] = control_flow_ops.cond( math_ops.less(self.global_step, self.hparams.stop_updating_after_steps), lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss) ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"]) summary.scalar("loss", ctr["loss"]) summary.scalar("avg_reward", ctr["reward_s"]) summary.scalar("best_reward_so_far", best_reward) summary.scalar( "advantage", math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"])) with variable_scope.variable_scope("optimizer", reuse=variable_scope.AUTO_REUSE): (ctr["train_op"], ctr["lr"], ctr["grad_norm"], ctr["grad_norms"]) = self._get_train_ops( ctr["loss"], tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES), self.global_step, grad_bound=self.hparams.grad_bound, lr_init=self.hparams.lr, lr_dec=self.hparams.lr_dec, start_decay_step=self.hparams.start_decay_step, decay_steps=self.hparams.decay_steps, optimizer_type=self.hparams.optimizer_type) summary.scalar("gradnorm", ctr["grad_norm"]) summary.scalar("lr", ctr["lr"]) ctr["summary"] = summary.merge_all() ops["controller"] = ctr self.ops = ops return ops
def _assign_new_value(self, variable, value): with K.name_scope('AssignNewValue') as scope: with ops.colocate_with(variable): return state_ops.assign(variable, value, name=scope)
def LastValueQuantize(inputs, per_channel=False, init_min=-6.0, init_max=6.0, vars_collection=None, name_prefix='LastValueQuant', reuse=None, is_training=True, num_bits=8, narrow_range=False, symmetric=False): """Adds a layer that collects quantization ranges as last input ranges. LastValueQuantize creates variables called 'min' and 'max', representing the interval used for quantization and clamping. Args: inputs: a tensor containing values to be quantized. per_channel: (Optional) a boolean specifying whether to use different quantization ranges per output channel. init_min: a float scalar, the initial value for variable min. init_max: a float scalar, the initial value for variable max. vars_collection: (Optional) collection where to store variables for quantization interval ends. name_prefix: name_prefix for created nodes. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. is_training: Whether the op is applied to a training or eval graph. num_bits: Number of bits to use for quantization, must be between 2 and 8. narrow_range: Whether to use the narrow quantization range [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1]. symmetric: If true, use symmetric quantization limits instead of training the minimum and maximum of each quantization range separately. Returns: a tensor containing quantized values. """ with variable_scope.variable_scope( None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope: scope.set_partitioner(None) input_shape = inputs.get_shape() input_dim = len(input_shape) if per_channel: # Only support quantizing 1-, 2- and 4-dimensional tensors. assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in ' ' scope: %s' % (input_shape, name_prefix)) min_max_shape = [input_shape[-1]] else: min_max_shape = [] vars_collections = [vars_collection] if vars_collection else [] min_var = _ModelVariable( 'min', shape=min_max_shape, initializer=init_ops.constant_initializer(init_min), collections=vars_collections, trainable=False) max_var = _ModelVariable( 'max', shape=min_max_shape, initializer=init_ops.constant_initializer(init_max), collections=vars_collections, trainable=False) if not is_training: return _FakeQuantWithMinMaxVars( inputs, min_var, max_var, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range) if per_channel: if input_dim == 2: reduce_dims = [0] elif input_dim == 4: reduce_dims = [0, 1, 2] if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: batch_min = math_ops.reduce_min(inputs, name='BatchMin') if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: batch_max = math_ops.reduce_max(inputs, name='BatchMax') if symmetric: if narrow_range: min_max_ratio = -1 else: # In two's complement notation, the negative range is slightly larger # than the positive range. min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits) # TFLite requires that 0.0 if always in the [min; max] range. Because # batch_min <= batch_max, it follows that range_min <= 0 <= range_max. range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio) range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio) else: # TFLite requires that 0.0 if always in the [min; max] range. range_min = math_ops.minimum(batch_min, 0.0) range_max = math_ops.maximum(batch_max, 0.0) assign_min = state_ops.assign(min_var, range_min, name='AssignMinLast') assign_max = state_ops.assign(max_var, range_max, name='AssignMaxLast') return _FakeQuantWithMinMaxVars( inputs, assign_min, assign_max, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range)
def _warm_start_var_with_vocab(var, current_vocab_path, current_vocab_size, prev_ckpt, prev_vocab_path, previous_vocab_size=-1, current_oov_buckets=0, prev_tensor_name=None, initializer=None): """Warm-starts given variable from `prev_tensor_name` tensor in `prev_ckpt`. Use this method when the `var` is backed by vocabulary. This method stitches the given `var` such that values corresponding to individual features in the vocabulary remain consistent irrespective of changing order of the features between old and new vocabularies. Args: var: Current graph's variable that needs to be warm-started (initialized). Can be either of the following: (i) `Variable` (ii) `ResourceVariable` (iii) list of `Variable`: The list must contain slices of the same larger variable. (iv) `PartitionedVariable` current_vocab_path: Path to the vocab file used for the given `var`. current_vocab_size: An `int` specifying the number of entries in the current vocab. prev_ckpt: A string specifying the directory with checkpoint file(s) or path to checkpoint. The given checkpoint must have tensor with name `prev_tensor_name` (if not None) or tensor with name same as given `var`. prev_vocab_path: Path to the vocab file used for the tensor in `prev_ckpt`. previous_vocab_size: If provided, will constrain previous vocab to the first `previous_vocab_size` entries. -1 means use the entire previous vocab. current_oov_buckets: An `int` specifying the number of out-of-vocabulary buckets used for given `var`. prev_tensor_name: Name of the tensor to lookup in provided `prev_ckpt`. If None, we lookup tensor with same name as given `var`. initializer: Variable initializer to be used for missing entries. If None, missing entries will be zero-initialized. Raises: ValueError: If required args are not provided. """ if not (current_vocab_path and current_vocab_size and prev_ckpt and prev_vocab_path): raise ValueError("Invalid args: Must provide all of [current_vocab_path, " "current_vocab_size, prev_ckpt, prev_vocab_path}.") if _is_variable(var): var = [var] elif isinstance(var, list) and all(_is_variable(v) for v in var): var = var elif isinstance(var, variables_lib.PartitionedVariable): var = var._get_variable_list() else: raise TypeError( "var MUST be one of the following: a Variable, list of Variable or " "PartitionedVariable, but is {}".format(type(var))) if not prev_tensor_name: # Assume tensor name remains the same. prev_tensor_name = _infer_var_name(var) for v in var: v_shape = v.get_shape().as_list() slice_info = v._get_save_slice_info() partition_info = None if slice_info: partition_info = variable_scope._PartitionInfo( full_shape=slice_info.full_shape, var_offset=slice_info.var_offset) # TODO(eddz): Support WarmStartSettings where class vocabularies need # remapping too. init = checkpoint_ops._load_and_remap_matrix_initializer( ckpt_path=checkpoint_utils._get_checkpoint_filename(prev_ckpt), old_tensor_name=prev_tensor_name, new_row_vocab_size=current_vocab_size, new_col_vocab_size=v_shape[1], old_row_vocab_size=previous_vocab_size, old_row_vocab_file=prev_vocab_path, new_row_vocab_file=current_vocab_path, old_col_vocab_file=None, new_col_vocab_file=None, num_row_oov_buckets=current_oov_buckets, num_col_oov_buckets=0, initializer=initializer) new_init_val = ops.convert_to_tensor( init(shape=v_shape, partition_info=partition_info)) v._initializer_op = state_ops.assign(v, new_init_val)
def streaming_tp_fp_arrays(self, num_gbboxes, tp, fp, scores, remove_zero_scores=True, metrics_collections=None, updates_collections=None, name=None): # Input dictionaries: dict outputs as streaming metrics. if isinstance(scores, dict) or isinstance(fp, dict): d_values = {} d_update_ops = {} for c in num_gbboxes.keys(): scope = 'streaming_tp_fp_%s' % c v, up = self.streaming_tp_fp_arrays(num_gbboxes[c], tp[c], fp[c], scores[c], remove_zero_scores, metrics_collections, updates_collections, name=scope) d_values[c] = v d_update_ops[c] = up return d_values, d_update_ops # Input Tensors... with variable_scope.variable_scope(name, 'streaming_tp_fp', [num_gbboxes, tp, fp, scores]): num_gbboxes = math_ops.to_int64(num_gbboxes) scores = math_ops.to_float(scores) stype = tf.bool tp = tf.cast(tp, stype) fp = tf.cast(fp, stype) # Reshape TP and FP tensors and clean away 0 class values. scores = tf.reshape(scores, [-1]) tp = tf.reshape(tp, [-1]) fp = tf.reshape(fp, [-1]) # Remove TP and FP both false. mask = tf.logical_or(tp, fp) if remove_zero_scores: rm_threshold = 1e-4 mask = tf.logical_and(mask, tf.greater(scores, rm_threshold)) scores = tf.boolean_mask(scores, mask) tp = tf.boolean_mask(tp, mask) fp = tf.boolean_mask(fp, mask) # Local variables accumlating information over batches. v_nobjects = self._create_local('v_num_gbboxes', shape=[], dtype=tf.int64) v_ndetections = self._create_local('v_num_detections', shape=[], dtype=tf.int32) v_scores = self._create_local('v_scores', shape=[ 0, ]) v_tp = self._create_local('v_tp', shape=[ 0, ], dtype=stype) v_fp = self._create_local('v_fp', shape=[ 0, ], dtype=stype) # Update operations. nobjects_op = state_ops.assign_add(v_nobjects, tf.reduce_sum(num_gbboxes)) ndetections_op = state_ops.assign_add( v_ndetections, tf.size(scores, out_type=tf.int32)) scores_op = state_ops.assign(v_scores, tf.concat([v_scores, scores], axis=0), validate_shape=False) tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp], axis=0), validate_shape=False) fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp], axis=0), validate_shape=False) # Value and update ops. val = (v_nobjects, v_ndetections, v_tp, v_fp, v_scores) with ops.control_dependencies( [nobjects_op, ndetections_op, scores_op, tp_op, fp_op]): update_op = (nobjects_op, ndetections_op, tp_op, fp_op, scores_op) if metrics_collections: ops.add_to_collections(metrics_collections, val) if updates_collections: ops.add_to_collections(updates_collections, update_op) return val, update_op
def call(self, inputs, mask=None, training=None, initial_state=None): # The input should be dense, padded with zeros. If a ragged input is fed # into the layer, it is padded and the row lengths are used for masking. inputs, row_lengths = K.convert_inputs_if_ragged(inputs) is_ragged_input = (row_lengths is not None) self._validate_args_if_ragged(is_ragged_input, mask) # LSTM does not support constants. Ignore it during process. inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None) if isinstance(mask, list): mask = mask[0] input_shape = K.int_shape(inputs) timesteps = input_shape[0] if self.time_major else input_shape[1] if not self._could_use_gpu_kernel: # Fall back to use the normal LSTM. kwargs = {'training': training} self._maybe_reset_cell_dropout_mask(self.cell) def step(inputs, states): return self.cell(inputs, states, **kwargs) last_output, outputs, states = K.rnn( step, inputs, initial_state, constants=None, go_backwards=self.go_backwards, mask=mask, unroll=self.unroll, input_length=row_lengths if row_lengths is not None else timesteps, time_major=self.time_major, zero_output_for_mask=self.zero_output_for_mask) runtime = _runtime(_RUNTIME_UNKNOWN) else: # Use the new defun approach for backend implementation swap. # Note that different implementations need to have same function # signature, eg, the tensor parameters need to have same shape and dtypes. # Since the CuDNN has an extra set of bias, those bias will be passed to # both normal and CuDNN implementations. self.reset_dropout_mask() dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) if dropout_mask is not None: inputs = inputs * dropout_mask[0] gpu_lstm_kwargs = { 'inputs': inputs, 'init_h': initial_state[0], 'init_c': initial_state[1], 'kernel': self.cell.kernel, 'recurrent_kernel': self.cell.recurrent_kernel, 'bias': self.cell.bias, 'mask': mask, 'time_major': self.time_major, 'go_backwards': self.go_backwards, 'sequence_lengths': row_lengths } normal_lstm_kwargs = gpu_lstm_kwargs.copy() normal_lstm_kwargs.update({ 'activation': self.activation, 'recurrent_activation': self.recurrent_activation, 'zero_output_for_mask': self.zero_output_for_mask, }) if context.executing_eagerly(): device_type = _get_context_device_type() can_use_gpu = ( # Either user specified GPU or unspecified but GPU is available. (device_type == _GPU_DEVICE_NAME or (device_type is None and context.num_gpus() > 0)) and (mask is None or is_sequence_right_padded(mask, self.time_major))) # Under eager context, check the device placement and prefer the # GPU implementation when GPU is available. if can_use_gpu: last_output, outputs, new_h, new_c, runtime = gpu_lstm( **gpu_lstm_kwargs) else: last_output, outputs, new_h, new_c, runtime = standard_lstm( **normal_lstm_kwargs) else: (last_output, outputs, new_h, new_c, runtime) = lstm_with_backend_selection(**normal_lstm_kwargs) states = [new_h, new_c] if self.stateful: updates = [] for i in range(len(states)): updates.append(state_ops.assign(self.states[i], states[i])) self.add_update(updates) if self.return_sequences: output = K.maybe_convert_to_ragged(is_ragged_input, outputs, row_lengths) else: output = last_output if self.return_state: return [output] + list(states) elif self.return_runtime: return output, runtime else: return output
def _apply_sparse_shared(self, grad, var, indices, scatter_add): step, beta1_power, beta2_power = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) if self._initial_total_steps > 0: total_steps = math_ops.cast(self._total_steps_t, var.dtype.base_dtype) warmup_proportion = math_ops.cast(self._warmup_proportion_t, var.dtype.base_dtype) min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype) warmup_steps = total_steps * warmup_proportion decay_steps = math_ops.maximum(total_steps - warmup_steps, 1) decay_rate = (min_lr - lr_t) / decay_steps lr_t = tf.where( step <= warmup_steps, lr_t * (step / warmup_steps), lr_t + decay_rate * math_ops.minimum(step - warmup_steps, decay_steps), ) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) sma_inf = 2.0 / (1.0 - beta2_t) - 1.0 sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power) m = self.get_slot(var, "m") m_scaled_g_values = grad * (1 - beta1_t) m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = scatter_add(m, indices, m_scaled_g_values) m_corr_t = m_t / (1.0 - beta1_power) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = scatter_add(v, indices, v_scaled_g_values) if self._amsgrad: vhat = self.get_slot(var, 'vhat') vhat_t = state_ops.assign(vhat, math_ops.maximum(vhat, v_t), use_locking=self._use_locking) v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power)) else: v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) / (sma_inf - 2.0) * sma_inf / sma_t) var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t) if self._initial_weight_decay > 0.0: var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var var_t = lr_t * var_t var_update = state_ops.scatter_sub( var, indices, array_ops.gather(var_t, indices), use_locking=self._use_locking) updates = [var_update, m_t, v_t] if self._amsgrad: updates.append(vhat_t) return control_flow_ops.group(*updates)
def _testSaveRestoreOutput(self, rnn_mode, direction, dtype): with ops.Graph().as_default(): num_layers = 2 num_units = 7 input_size = 7 seq_length = 10 batch_size = 5 dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 model = _CreateModel(rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype) params_size_t = model.params_size() params = variables.VariableV1(array_ops.ones([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) _CreateParamsSavable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) np.random.seed(1234) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) input_data = constant_op.constant(np.random.randn( seq_length, batch_size, input_size), dtype=dtype) input_h = constant_op.constant(np.random.randn( num_layers * dir_count, batch_size, num_units), dtype=dtype) if has_input_c: input_c = constant_op.constant(np.random.randn( num_layers * dir_count, batch_size, num_units), dtype=dtype) outputs = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: outputs = model(input_data=input_data, input_h=input_h, params=params, is_training=False) total_sum = sum(map(math_ops.reduce_sum, outputs)) # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run(total_sum) val = saver.save(sess, save_path) self.assertEqual(save_path, val) # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: reset_params = state_ops.assign(params, array_ops.zeros( [params_size_t], dtype=dtype), validate_shape=False) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = sess.run(total_sum) self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5)
def _wals_factorization_model_function(features, labels, mode, params): """Model function for the WALSFactorization estimator. Args: features: Dictionary of features. See WALSMatrixFactorization. labels: Must be None. mode: A model_fn.ModeKeys object. params: Dictionary of parameters containing arguments passed to the WALSMatrixFactorization constructor. Returns: A ModelFnOps object. Raises: ValueError: If `mode` is not recognized. """ assert labels is None use_factors_weights_cache = (params["use_factors_weights_cache_for_training"] and mode == model_fn.ModeKeys.TRAIN) use_gramian_cache = (params["use_gramian_cache_for_training"] and mode == model_fn.ModeKeys.TRAIN) max_sweeps = params["max_sweeps"] model = factorization_ops.WALSModel( params["num_rows"], params["num_cols"], params["embedding_dimension"], unobserved_weight=params["unobserved_weight"], regularization=params["regularization_coeff"], row_init=params["row_init"], col_init=params["col_init"], num_row_shards=params["num_row_shards"], num_col_shards=params["num_col_shards"], row_weights=params["row_weights"], col_weights=params["col_weights"], use_factors_weights_cache=use_factors_weights_cache, use_gramian_cache=use_gramian_cache) # Get input rows and cols. We either update rows or columns depending on # the value of row_sweep, which is maintained using a session hook. input_rows = features[WALSMatrixFactorization.INPUT_ROWS] input_cols = features[WALSMatrixFactorization.INPUT_COLS] # TRAIN mode: if mode == model_fn.ModeKeys.TRAIN: # Training consists of the following ops (controlled using a SweepHook). # Before a row sweep: # row_update_prep_gramian_op # initialize_row_update_op # During a row sweep: # update_row_factors_op # Before a col sweep: # col_update_prep_gramian_op # initialize_col_update_op # During a col sweep: # update_col_factors_op is_row_sweep_var = variable_scope.variable( True, trainable=False, name="is_row_sweep", collections=[ops.GraphKeys.GLOBAL_VARIABLES]) is_sweep_done_var = variable_scope.variable( False, trainable=False, name="is_sweep_done", collections=[ops.GraphKeys.GLOBAL_VARIABLES]) completed_sweeps_var = variable_scope.variable( 0, trainable=False, name=WALSMatrixFactorization.COMPLETED_SWEEPS, collections=[ops.GraphKeys.GLOBAL_VARIABLES]) loss_var = variable_scope.variable( 0., trainable=False, name=WALSMatrixFactorization.LOSS, collections=[ops.GraphKeys.GLOBAL_VARIABLES]) # The root weighted squared error = # \\(\sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )\\) rwse_var = variable_scope.variable( 0., trainable=False, name=WALSMatrixFactorization.RWSE, collections=[ops.GraphKeys.GLOBAL_VARIABLES]) summary.scalar("loss", loss_var) summary.scalar("root_weighted_squared_error", rwse_var) summary.scalar("completed_sweeps", completed_sweeps_var) def create_axis_ops(sp_input, num_items, update_fn, axis_name): """Creates book-keeping and training ops for a given axis. Args: sp_input: A SparseTensor corresponding to the row or column batch. num_items: An integer, the total number of items of this axis. update_fn: A function that takes one argument (`sp_input`), and that returns a tuple of * new_factors: A float Tensor of the factor values after update. * update_op: a TensorFlow op which updates the factors. * loss: A float Tensor, the unregularized loss. * reg_loss: A float Tensor, the regularization loss. * sum_weights: A float Tensor, the sum of factor weights. axis_name: A string that specifies the name of the axis. Returns: A tuple consisting of: * reset_processed_items_op: A TensorFlow op, to be run before the beginning of any sweep. It marks all items as not-processed. * axis_train_op: A Tensorflow op, to be run during this axis' sweeps. """ processed_items_init = array_ops.fill(dims=[num_items], value=False) with ops.colocate_with(processed_items_init): processed_items = variable_scope.variable( processed_items_init, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="processed_" + axis_name) _, update_op, loss, reg, sum_weights = update_fn(sp_input) input_indices = sp_input.indices[:, 0] with ops.control_dependencies([ update_op, state_ops.assign(loss_var, loss + reg), state_ops.assign(rwse_var, math_ops.sqrt(loss / sum_weights))]): with ops.colocate_with(processed_items): update_processed_items = state_ops.scatter_update( processed_items, input_indices, array_ops.ones_like(input_indices, dtype=dtypes.bool), name="update_processed_{}_indices".format(axis_name)) with ops.control_dependencies([update_processed_items]): is_sweep_done = math_ops.reduce_all(processed_items) axis_train_op = control_flow_ops.group( state_ops.assign(is_sweep_done_var, is_sweep_done), state_ops.assign_add( completed_sweeps_var, math_ops.cast(is_sweep_done, dtypes.int32)), name="{}_sweep_train_op".format(axis_name)) return processed_items.initializer, axis_train_op reset_processed_rows_op, row_train_op = create_axis_ops( input_rows, params["num_rows"], lambda x: model.update_row_factors(sp_input=x, transpose_input=False), "rows") reset_processed_cols_op, col_train_op = create_axis_ops( input_cols, params["num_cols"], lambda x: model.update_col_factors(sp_input=x, transpose_input=True), "cols") switch_op = control_flow_ops.group( state_ops.assign( is_row_sweep_var, math_ops.logical_not(is_row_sweep_var)), reset_processed_rows_op, reset_processed_cols_op, name="sweep_switch_op") row_prep_ops = [ model.row_update_prep_gramian_op, model.initialize_row_update_op] col_prep_ops = [ model.col_update_prep_gramian_op, model.initialize_col_update_op] init_op = model.worker_init sweep_hook = _SweepHook( is_row_sweep_var, is_sweep_done_var, init_op, row_prep_ops, col_prep_ops, row_train_op, col_train_op, switch_op) global_step_hook = _IncrementGlobalStepHook() training_hooks = [sweep_hook, global_step_hook] if max_sweeps is not None: training_hooks.append(_StopAtSweepHook(max_sweeps)) return model_fn.ModelFnOps( mode=model_fn.ModeKeys.TRAIN, predictions={}, loss=loss_var, eval_metric_ops={}, train_op=control_flow_ops.no_op(), training_hooks=training_hooks) # INFER mode elif mode == model_fn.ModeKeys.INFER: projection_weights = features.get( WALSMatrixFactorization.PROJECTION_WEIGHTS) def get_row_projection(): return model.project_row_factors( sp_input=input_rows, projection_weights=projection_weights, transpose_input=False) def get_col_projection(): return model.project_col_factors( sp_input=input_cols, projection_weights=projection_weights, transpose_input=True) predictions = { WALSMatrixFactorization.PROJECTION_RESULT: control_flow_ops.cond( features[WALSMatrixFactorization.PROJECT_ROW], get_row_projection, get_col_projection) } return model_fn.ModelFnOps( mode=model_fn.ModeKeys.INFER, predictions=predictions, loss=None, eval_metric_ops={}, train_op=control_flow_ops.no_op(), training_hooks=[]) # EVAL mode elif mode == model_fn.ModeKeys.EVAL: def get_row_loss(): _, _, loss, reg, _ = model.update_row_factors( sp_input=input_rows, transpose_input=False) return loss + reg def get_col_loss(): _, _, loss, reg, _ = model.update_col_factors( sp_input=input_cols, transpose_input=True) return loss + reg loss = control_flow_ops.cond( features[WALSMatrixFactorization.PROJECT_ROW], get_row_loss, get_col_loss) return model_fn.ModelFnOps( mode=model_fn.ModeKeys.EVAL, predictions={}, loss=loss, eval_metric_ops={}, train_op=control_flow_ops.no_op(), training_hooks=[]) else: raise ValueError("mode=%s is not recognized." % str(mode))
def _init_from_args(self, initial_value=None, trainable=True, collections=None, validate_shape=True, caching_device=None, name=None, dtype=None, expected_shape=None): """Creates a new variable from arguments. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, the default, also adds the variable to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as the default list of variables to use by the `Optimizer` classes. collections: List of graph collections keys. The new variable is added to these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. validate_shape: If `False`, allows the variable to be initialized with a value of unknown shape. If `True`, the default, the shape of `initial_value` must be known. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). expected_shape: Deprecated. Ignored. Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. """ _ = expected_shape if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if collections is None: collections = [ops.GraphKeys.GLOBAL_VARIABLES] if not isinstance(collections, (list, tuple, set)): raise ValueError( "collections argument to Variable constructor must be a list, tuple, " "or set. Got %s of type %s" % (collections, type(collections))) if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] with ops.control_dependencies(None): with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value]) as name: if init_from_fn: # Use attr_scope and device(None) to simulate the behavior of # colocate_with when the variable we want to colocate with doesn't # yet exist. true_name = ops._name_from_scope_name(name) attr = attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue( s=[compat.as_bytes("loc:@%s" % true_name)])) # pylint: disable=protected-access with ops.get_default_graph()._attr_scope({"_class": attr}): with ops.name_scope("Initializer"), ops.device(None): self._initial_value = ops.convert_to_tensor( initial_value(), name="initial_value", dtype=dtype) shape = (self._initial_value.get_shape() if validate_shape else tensor_shape.unknown_shape()) self._variable = state_ops.variable_op_v2( shape, self._initial_value.dtype.base_dtype, name=name) # Or get the initial value from a Tensor or Python object. else: self._initial_value = ops.convert_to_tensor( initial_value, name="initial_value", dtype=dtype) shape = (self._initial_value.get_shape() if validate_shape else tensor_shape.unknown_shape()) # In this case, the variable op can't be created until after the # initial_value has been converted to a Tensor with a known type. self._variable = state_ops.variable_op_v2( shape, self._initial_value.dtype.base_dtype, name=name) # Manually overrides the variable's shape with the initial value's. if validate_shape: initial_value_shape = self._initial_value.get_shape() if not initial_value_shape.is_fully_defined(): raise ValueError("initial_value must have a shape specified: %s" % self._initial_value) # Assigns initial value. self._initializer_op = state_ops.assign( self._variable, self._initial_value, validate_shape=validate_shape).op # TODO(vrv): Change this class to not take caching_device, but # to take the op to colocate the snapshot with, so we can use # colocation rather than devices. if caching_device is not None: with ops.device(caching_device): self._snapshot = array_ops.identity(self._variable, name="read") else: with ops.colocate_with(self._variable.op): self._snapshot = array_ops.identity(self._variable, name="read") ops.add_to_collections(collections, self) self._caching_device = caching_device self._save_slice_info = None
def _resource_apply_dense(self, grad, var): var_dtype = var.dtype.base_dtype lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype)) beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') local_step = math_ops.cast(self.iterations + 1, var_dtype) next_step = math_ops.cast(self.iterations + 2, var_dtype) decay_base = math_ops.cast(0.96, var_dtype) # Learning rate multipliers if self.lr_multipliers is not None: lr_t = _apply_lr_multiplier(self, lr_t, var) # Due to the recommendations in [2], i.e. warming momentum schedule momentum_cache_t = beta_1_t * ( 1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * local_step))) momentum_cache_t_1 = beta_1_t * ( 1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * next_step))) m_schedule_new = math_ops.cast(self._m_cache_read, var_dtype) * momentum_cache_t if var_dtype is self._m_cache.dtype: m_schedule_new = array_ops.identity( state_ops.assign(self._m_cache, m_schedule_new, use_locking=self._use_locking)) m_schedule_next = m_schedule_new * momentum_cache_t_1 # the following equations given in [1] g_prime = grad / (1. - m_schedule_new) m_t = beta_1_t * m + (1. - beta_1_t) * grad m_t_prime = m_t / (1. - m_schedule_next) v_t = beta_2_t * v + (1. - beta_2_t) * math_ops.square(grad) v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step)) m_t_bar = (1. - momentum_cache_t) * g_prime + (momentum_cache_t * m_t_prime) m_t = state_ops.assign(m, m_t, use_locking=self._use_locking) v_t = state_ops.assign(v, v_t, use_locking=self._use_locking) var_t = math_ops.sub( var, self.eta_t * lr_t * m_t_bar / (math_ops.sqrt(v_t_prime + epsilon_t))) # Weight decays if var.name in self.weight_decays.keys(): var_t = _apply_weight_decays(self, var, var_t) var_update = state_ops.assign(var, var_t, use_locking=self._use_locking) # Cosine annealing (iteration_done, t_cur_update, eta_t_update) = _update_t_cur_eta_t_v2(self, lr_t, var) if iteration_done and not self._init_notified: self._init_notified = True updates = [var_update, m_t, v_t] if iteration_done: updates += [t_cur_update] if self.use_cosine_annealing and iteration_done: updates += [eta_t_update] return control_flow_ops.group(*updates)
def _projection_op(self, state, name=None): with ops.colocate_with(state): return state_ops.assign( state, _project_stochastic_matrix_wrt_euclidean_norm(state), name=name)
def testDeferredSlotRestoration(self): checkpoint_directory = self.get_temp_dir() root = trackable_utils.Checkpoint() root.var = trackable_utils.add_variable(root, name="var", initializer=0.) optimizer = adam.AdamOptimizer(0.1) if context.executing_eagerly(): optimizer.minimize(root.var.read_value) else: train_op = optimizer.minimize(root.var) # Note that `optimizer` has not been added as a dependency of # `root`. Create a one-off grouping so that slot variables for `root.var` # get initialized too. self.evaluate( trackable_utils.gather_initializers( trackable_utils.Checkpoint(root=root, optimizer=optimizer))) self.evaluate(train_op) self.evaluate(state_ops.assign(root.var, 12.)) no_slots_path = root.save( os.path.join(checkpoint_directory, "no_slots")) root.optimizer = optimizer self.evaluate(state_ops.assign(root.var, 13.)) self.evaluate( state_ops.assign(optimizer.get_slot(name="m", var=root.var), 14.)) slots_path = root.save(os.path.join(checkpoint_directory, "with_slots")) new_root = trackable_utils.Checkpoint() # Load the slot-containing checkpoint (deferred), then immediately overwrite # the non-slot variable (also deferred). slot_status = new_root.restore(slots_path) no_slot_status = new_root.restore(no_slots_path) with self.assertRaises(AssertionError): no_slot_status.assert_consumed() new_root.var = trackable_utils.add_variable(new_root, name="var", shape=[]) no_slot_status.assert_consumed() no_slot_status.run_restore_ops() self.assertEqual(12., self.evaluate(new_root.var)) new_root.optimizer = adam.AdamOptimizer(0.1) slot_status.assert_existing_objects_matched() with self.assertRaisesRegexp(AssertionError, "beta1_power"): slot_status.assert_consumed() self.assertEqual(12., self.evaluate(new_root.var)) if context.executing_eagerly(): # Slot variables are only created with restoring initializers when # executing eagerly. self.assertEqual( 14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) else: self.assertIs( new_root.optimizer.get_slot(name="m", var=new_root.var), None) if context.executing_eagerly(): new_root.optimizer.minimize(new_root.var.read_value) else: train_op = new_root.optimizer.minimize(new_root.var) # The slot variable now exists; restore() didn't create it, but we should # now have a restore op for it. slot_status.run_restore_ops() self.assertEqual( 14., self.evaluate( new_root.optimizer.get_slot(name="m", var=new_root.var))) self.evaluate(train_op) slot_status.assert_consumed()
def doBasicsOneExportPath(self, export_path, clear_devices=False, global_step=GLOBAL_STEP, sharded=True, export_count=1): # Build a graph with 2 parameter nodes on different devices. ops.reset_default_graph() with session.Session(target="", config=config_pb2.ConfigProto( device_count={"CPU": 2})) as sess: # v2 is an unsaved variable derived from v0 and v1. It is used to # exercise the ability to run an init op when restoring a graph. with sess.graph.device("/cpu:0"): v0 = variables.Variable(10, name="v0") with sess.graph.device("/cpu:1"): v1 = variables.Variable(20, name="v1") v2 = variables.Variable(1, name="v2", trainable=False, collections=[]) assign_v2 = state_ops.assign(v2, math_ops.add(v0, v1)) init_op = control_flow_ops.group(assign_v2, name="init_op") ops.add_to_collection("v", v0) ops.add_to_collection("v", v1) ops.add_to_collection("v", v2) named_tensor_bindings = { "logical_input_A": v0, "logical_input_B": v1 } signatures = { "foo": exporter.regression_signature(input_tensor=v0, output_tensor=v1), "generic": exporter.generic_signature(named_tensor_bindings) } asset_filepath_orig = os.path.join(test.get_temp_dir(), "hello42.txt") asset_file = constant_op.constant(asset_filepath_orig, name="filename42") ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, asset_file) with gfile.FastGFile(asset_filepath_orig, "w") as f: f.write("your data here") assets_collection = ops.get_collection( ops.GraphKeys.ASSET_FILEPATHS) ignored_asset = os.path.join(test.get_temp_dir(), "ignored.txt") with gfile.FastGFile(ignored_asset, "w") as f: f.write("additional data here") variables.global_variables_initializer().run() # Run an export. save = saver.Saver({ "v0": v0, "v1": v1 }, restore_sequentially=True, sharded=sharded, write_version=saver_pb2.SaverDef.V1) export = exporter.Exporter(save) compare_def = ops.get_default_graph().as_graph_def() export.init( compare_def, init_op=init_op, clear_devices=clear_devices, default_graph_signature=exporter.classification_signature( input_tensor=v0), named_graph_signatures=signatures, assets_collection=assets_collection) for x in range(export_count): export.export(export_path, constant_op.constant(global_step + x), sess, exports_to_keep=gc.largest_export_versions(2)) # Set global_step to the last exported version, as the rest of the test # uses it to construct model export path, loads model from it, and does # verifications. We want to make sure to always use the last exported # version, as old ones may have be garbage-collected. global_step += export_count - 1 # Restore graph. ops.reset_default_graph() with session.Session(target="", config=config_pb2.ConfigProto( device_count={"CPU": 2})) as sess: save = saver.import_meta_graph( os.path.join(export_path, constants.VERSION_FORMAT_SPECIFIER % global_step, constants.META_GRAPH_DEF_FILENAME)) self.assertIsNotNone(save) meta_graph_def = save.export_meta_graph() collection_def = meta_graph_def.collection_def # Validate custom graph_def. graph_def_any = collection_def[constants.GRAPH_KEY].any_list.value self.assertEquals(len(graph_def_any), 1) graph_def = graph_pb2.GraphDef() graph_def_any[0].Unpack(graph_def) if clear_devices: for node in compare_def.node: node.device = "" self.assertProtoEquals(compare_def, graph_def) # Validate init_op. init_ops = collection_def[constants.INIT_OP_KEY].node_list.value self.assertEquals(len(init_ops), 1) self.assertEquals(init_ops[0], "init_op") # Validate signatures. signatures_any = collection_def[ constants.SIGNATURES_KEY].any_list.value self.assertEquals(len(signatures_any), 1) signatures = manifest_pb2.Signatures() signatures_any[0].Unpack(signatures) default_signature = signatures.default_signature self.assertEqual( default_signature.classification_signature.input.tensor_name, "v0:0") bindings = signatures.named_signatures[ "generic"].generic_signature.map self.assertEquals(bindings["logical_input_A"].tensor_name, "v0:0") self.assertEquals(bindings["logical_input_B"].tensor_name, "v1:0") read_foo_signature = ( signatures.named_signatures["foo"].regression_signature) self.assertEquals(read_foo_signature.input.tensor_name, "v0:0") self.assertEquals(read_foo_signature.output.tensor_name, "v1:0") # Validate the assets. assets_any = collection_def[constants.ASSETS_KEY].any_list.value self.assertEquals(len(assets_any), 1) asset = manifest_pb2.AssetFile() assets_any[0].Unpack(asset) assets_path = os.path.join( export_path, constants.VERSION_FORMAT_SPECIFIER % global_step, constants.ASSETS_DIRECTORY, "hello42.txt") asset_contents = gfile.GFile(assets_path).read() self.assertEqual(asset_contents, b"your data here") self.assertEquals("hello42.txt", asset.filename) self.assertEquals("filename42:0", asset.tensor_binding.tensor_name) ignored_asset_path = os.path.join( export_path, constants.VERSION_FORMAT_SPECIFIER % global_step, constants.ASSETS_DIRECTORY, "ignored.txt") self.assertFalse(gfile.Exists(ignored_asset_path)) # Validate graph restoration. if sharded: save.restore( sess, os.path.join( export_path, constants.VERSION_FORMAT_SPECIFIER % global_step, constants.VARIABLES_FILENAME_PATTERN)) else: save.restore( sess, os.path.join( export_path, constants.VERSION_FORMAT_SPECIFIER % global_step, constants.VARIABLES_FILENAME)) self.assertEqual(10, ops.get_collection("v")[0].eval()) self.assertEqual(20, ops.get_collection("v")[1].eval()) ops.get_collection(constants.INIT_OP_KEY)[0].run() self.assertEqual(30, ops.get_collection("v")[2].eval())
def testDeferredSlotRestoration(self): with self.test_session(): checkpoint_directory = self.get_temp_dir() root = trackable_utils.Checkpoint() root.var = trackable_utils.add_variable(root, name="var", initializer=0.) optimizer = adam.Adam(0.1) variables = [root.var] gradients = [1.] train_op = optimizer.apply_gradients(zip(gradients, variables)) # Note that `optimizer` has not been added as a dependency of # `root`. Create a one-off grouping so that slot variables for `root.var` # get initialized too. self.evaluate( trackable_utils.gather_initializers( trackable_utils.Checkpoint(root=root, optimizer=optimizer))) self.evaluate(train_op) self.evaluate(state_ops.assign(root.var, 12.)) no_slots_path = root.save( os.path.join(checkpoint_directory, "no_slots")) root.optimizer = optimizer self.evaluate(state_ops.assign(root.var, 13.)) self.evaluate( state_ops.assign( optimizer.get_slot(slot_name="m", var=root.var), 14.)) slots_path = root.save( os.path.join(checkpoint_directory, "with_slots")) new_root = trackable_utils.Checkpoint() # Load the slot-containing checkpoint (deferred), then immediately # overwrite the non-slot variable (also deferred). slot_status = new_root.restore(slots_path) no_slot_status = new_root.restore(no_slots_path) with self.assertRaises(AssertionError): no_slot_status.assert_consumed() new_root.var = trackable_utils.add_variable(new_root, name="var", shape=[]) no_slot_status.assert_consumed() no_slot_status.run_restore_ops() self.assertEqual(12., self.evaluate(new_root.var)) new_root.optimizer = adam.Adam(0.1) slot_status.assert_existing_objects_matched() if not context.executing_eagerly(): with self.assertRaisesRegex(AssertionError, "Unresolved object"): slot_status.assert_consumed() self.assertEqual(12., self.evaluate(new_root.var)) if context.executing_eagerly(): # Slot variables are only created with restoring initializers when # executing eagerly. self.assertEqual( 14., self.evaluate( new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) else: # Slot variables are not created eagerly when graph building. with self.assertRaises(KeyError): new_root.optimizer.get_slot(slot_name="m", var=new_root.var) variables = [new_root.var] gradients = [1.] train_op = new_root.optimizer.apply_gradients( zip(gradients, variables)) # The slot variable now exists; restore() didn't create it, but we should # now have a restore op for it. slot_status.run_restore_ops() if not context.executing_eagerly(): # The train op hasn't run when graph building, so the slot variable has # its restored value. It has run in eager, so the value will # be different. self.assertEqual( 14., self.evaluate( new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) self.evaluate(train_op) slot_status.assert_consumed()
def apply_gradients(self, grads_and_vars, worker_id, global_step=None, name=None, collect_cdfs=False): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] var_list = [] self._local_step = variables.Variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=global_step.dtype.base_dtype, name="sync_rep_local_step") self.local_step_init_op = state_ops.assign(self._local_step, global_step._ref()) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.all_variables()) # The wait op waits for the current worker to dequeue a token from its respective token queue self._wait_op = self._sync_token_queues[worker_id].dequeue() # Replicas have to wait until they can get a token from the token queue # BEFORE begining to compute gradients. with ops.device(global_step.device): queue_size = self._sync_token_queues[worker_id].size() update_local_step_op = state_ops.assign(self._local_step, global_step._ref()) # Gradient accum creation with ops.name_scope(None, self._name): for grad, var in grads_and_vars: var_list.append(var) tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device)) with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=var.get_shape(), shared_name=var.name + "/grad_accum") else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=(), shared_name=var.name + "/grad_accum") self._accumulator_list.append((grad_accum, var)) """# Phase 1 gradient computation with ops.control_dependencies([update_local_step_op]): for index, (grad, var) in enumerate(grads_and_vars): with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_grad(grad, local_step=self._local_step._ref())) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step._ref()))""" # Phase 1 gradient computation with ops.control_dependencies([update_local_step_op]): for index, (grad, var) in enumerate(grads_and_vars): print_start_op = logging_ops.Print( global_step, [global_step], message="Starting to apply grads for variable %d" % index) with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = self._accumulator_list[index][0] with ops.control_dependencies([print_start_op]): with tf.device("job:worker/task:%d" % worker_id): apply_grad_op = grad_accum.apply_grad( grad, local_step=self._local_step._ref()) with ops.control_dependencies( [apply_grad_op]): finished_print_op = logging_ops.Print( global_step, [global_step], message= "Done applying grads for variable %d" % index) train_ops.append(finished_print_op) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = self._accumulator_list[index][0] with ops.control_dependencies([print_start_op]): with tf.device("job:worker/task:%d" % worker_id): apply_grad_op = grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step._ref()) with ops.control_dependencies( [apply_grad_op]): finished_print_op = logging_ops.Print( global_step, [global_step], message= "Done applying grads for variable %d" % index) train_ops.append(finished_print_op) # Phase 2 gradient applying for index, (grad, var) in enumerate(grads_and_vars): with ops.device(var.device): grad_accum = self._accumulator_list[index][0] if grad is None: aggregated_grad.append(None) elif isinstance(grad, ops.Tensor): if collect_cdfs: aggregated_grad.append( grad_accum.take_grad(self._total_num_replicas)) else: aggregated_grad.append(grad_accum.take_grad(1)) else: if collect_cdfs: aggregated_grad.append( grad_accum.take_grad(self._total_num_replicas)) else: aggregated_grad.append( grad_accum.take_indexed_slices_grad(1)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # Some debug operations self.print_sizes = logging_ops.Print(global_step, [ self._sync_token_queues[i].size() for i in range(self._total_num_replicas) ], message="queue sizes") self.print_accum_sizes = logging_ops.Print( self._local_step, [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id], message="Accum sizes") self.print_local_step = logging_ops.Print( self._local_step, [self._local_step._ref(), global_step._ref()], message="local vs global step") # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): with ops.control_dependencies([self.print_accum_sizes]): update_op = self._opt.apply_gradients( aggregated_grads_and_vars, global_step) self._update_op = update_op with ops.control_dependencies([update_op]): sync_op = [] for cur_worker_id in range(self._total_num_replicas): sync_op.append( self._sync_token_queues[cur_worker_id].enqueue( global_step)) sync_op = control_flow_ops.group(*(sync_op)) # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) with ops.device(global_step.device), ops.name_scope(""): with ops.control_dependencies(train_ops): # Worker finished applying gradients. Add token to phase1_finished_queue train_op = logging_ops.Print( self._local_step._ref(), [ x[0].num_accumulated() for x in self._accumulator_list ] + [worker_id], message="Finished worker updates", name="FinishedWorkerUpdatesPrint") for accum, var in self._accumulator_list: with ops.device(var.device): chief_init_ops.append( accum.set_global_step(global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._gradients_applied = True return train_op
def training_graph(self, input_data, input_labels, random_seed, data_spec, epoch=None, input_weights=None): """Constructs a TF graph for training a random tree. Args: input_data: A tensor or SparseTensor or placeholder for input data. input_labels: A tensor or placeholder for labels associated with input_data. random_seed: The random number generator seed to use for this tree. 0 means use the current time as the seed. data_spec: A list of tf.dtype values specifying the original types of each column. epoch: A tensor or placeholder for the epoch the training data comes from. input_weights: A float tensor or placeholder holding per-input weights, or None if all inputs are to be weighted equally. Returns: The last op in the random tree training graph. """ epoch = [0] if epoch is None else epoch if input_weights is None: input_weights = [] sparse_indices = [] sparse_values = [] sparse_shape = [] if isinstance(input_data, ops.SparseTensor): sparse_indices = input_data.indices sparse_values = input_data.values sparse_shape = input_data.shape input_data = [] # Count extremely random stats. (node_sums, node_squares, splits_indices, splits_sums, splits_squares, totals_indices, totals_sums, totals_squares, input_leaves) = (self.training_ops.count_extremely_random_stats( input_data, sparse_indices, sparse_values, sparse_shape, data_spec, input_labels, input_weights, self.variables.tree, self.variables.tree_thresholds, self.variables.node_to_accumulator_map, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, self.variables.start_epoch, epoch, num_classes=self.params.num_output_columns, regression=self.params.regression)) node_update_ops = [] node_update_ops.append( state_ops.assign_add(self.variables.node_sums, node_sums)) splits_update_ops = [] splits_update_ops.append(self.training_ops.scatter_add_ndim( self.variables.candidate_split_sums, splits_indices, splits_sums)) splits_update_ops.append(self.training_ops.scatter_add_ndim( self.variables.accumulator_sums, totals_indices, totals_sums)) if self.params.regression: node_update_ops.append(state_ops.assign_add(self.variables.node_squares, node_squares)) splits_update_ops.append(self.training_ops.scatter_add_ndim( self.variables.candidate_split_squares, splits_indices, splits_squares)) splits_update_ops.append(self.training_ops.scatter_add_ndim( self.variables.accumulator_squares, totals_indices, totals_squares)) # Sample inputs. update_indices, feature_updates, threshold_updates = ( self.training_ops.sample_inputs( input_data, sparse_indices, sparse_values, sparse_shape, input_weights, self.variables.node_to_accumulator_map, input_leaves, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, split_initializations_per_input=( self.params.split_initializations_per_input), split_sampling_random_seed=random_seed)) update_features_op = state_ops.scatter_update( self.variables.candidate_split_features, update_indices, feature_updates) update_thresholds_op = state_ops.scatter_update( self.variables.candidate_split_thresholds, update_indices, threshold_updates) # Calculate finished nodes. with ops.control_dependencies(splits_update_ops): children = array_ops.squeeze(array_ops.slice( self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1]) is_leaf = math_ops.equal(constants.LEAF_NODE, children) leaves = math_ops.to_int32(array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1])) finished, stale = self.training_ops.finished_nodes( leaves, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, self.variables.start_epoch, epoch, num_split_after_samples=self.params.split_after_samples, min_split_samples=self.params.min_split_samples) # Update leaf scores. non_fertile_leaves = array_ops.boolean_mask( leaves, math_ops.less(array_ops.gather( self.variables.node_to_accumulator_map, leaves), 0)) # TODO(gilberth): It should be possible to limit the number of non # fertile leaves we calculate scores for, especially since we can only take # at most array_ops.shape(finished)[0] of them. with ops.control_dependencies(node_update_ops): sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves) if self.params.regression: squares = array_ops.gather(self.variables.node_squares, non_fertile_leaves) non_fertile_leaf_scores = self._variance(sums, squares) else: non_fertile_leaf_scores = self._weighted_gini(sums) # Calculate best splits. with ops.control_dependencies(splits_update_ops): split_indices = self.training_ops.best_splits( finished, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, regression=self.params.regression) # Grow tree. with ops.control_dependencies([update_features_op, update_thresholds_op]): (tree_update_indices, tree_children_updates, tree_threshold_updates, new_eot) = (self.training_ops.grow_tree( self.variables.end_of_tree, self.variables.node_to_accumulator_map, finished, split_indices, self.variables.candidate_split_features, self.variables.candidate_split_thresholds)) tree_update_op = state_ops.scatter_update( self.variables.tree, tree_update_indices, tree_children_updates) thresholds_update_op = state_ops.scatter_update( self.variables.tree_thresholds, tree_update_indices, tree_threshold_updates) # TODO(thomaswc): Only update the epoch on the new leaves. new_epoch_updates = epoch * array_ops.ones_like(tree_threshold_updates, dtype=dtypes.int32) epoch_update_op = state_ops.scatter_update( self.variables.start_epoch, tree_update_indices, new_epoch_updates) # Update fertile slots. with ops.control_dependencies([tree_update_op]): (node_map_updates, accumulators_cleared, accumulators_allocated) = ( self.training_ops.update_fertile_slots( finished, non_fertile_leaves, non_fertile_leaf_scores, self.variables.end_of_tree, self.variables.accumulator_sums, self.variables.node_to_accumulator_map, stale, regression=self.params.regression)) # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has # used it to calculate new leaves. gated_new_eot, = control_flow_ops.tuple([new_eot], control_inputs=[node_map_updates]) eot_update_op = state_ops.assign(self.variables.end_of_tree, gated_new_eot) updates = [] updates.append(eot_update_op) updates.append(tree_update_op) updates.append(thresholds_update_op) updates.append(epoch_update_op) updates.append(state_ops.scatter_update( self.variables.node_to_accumulator_map, array_ops.squeeze(array_ops.slice(node_map_updates, [0, 0], [1, -1]), squeeze_dims=[0]), array_ops.squeeze(array_ops.slice(node_map_updates, [1, 0], [1, -1]), squeeze_dims=[0]))) cleared_and_allocated_accumulators = array_ops.concat( 0, [accumulators_cleared, accumulators_allocated]) # Calculate values to put into scatter update for candidate counts. # Candidate split counts are always reset back to 0 for both cleared # and allocated accumulators. This means some accumulators might be doubly # reset to 0 if the were released and not allocated, then later allocated. split_values = array_ops.tile( array_ops.expand_dims(array_ops.expand_dims( array_ops.zeros_like(cleared_and_allocated_accumulators, dtype=dtypes.float32), 1), 2), [1, self.params.num_splits_to_consider, self.params.num_output_columns]) updates.append(state_ops.scatter_update( self.variables.candidate_split_sums, cleared_and_allocated_accumulators, split_values)) if self.params.regression: updates.append(state_ops.scatter_update( self.variables.candidate_split_squares, cleared_and_allocated_accumulators, split_values)) # Calculate values to put into scatter update for total counts. total_cleared = array_ops.tile( array_ops.expand_dims( math_ops.neg(array_ops.ones_like(accumulators_cleared, dtype=dtypes.float32)), 1), [1, self.params.num_output_columns]) total_reset = array_ops.tile( array_ops.expand_dims( array_ops.zeros_like(accumulators_allocated, dtype=dtypes.float32), 1), [1, self.params.num_output_columns]) accumulator_updates = array_ops.concat(0, [total_cleared, total_reset]) updates.append(state_ops.scatter_update( self.variables.accumulator_sums, cleared_and_allocated_accumulators, accumulator_updates)) if self.params.regression: updates.append(state_ops.scatter_update( self.variables.accumulator_squares, cleared_and_allocated_accumulators, accumulator_updates)) # Calculate values to put into scatter update for candidate splits. split_features_updates = array_ops.tile( array_ops.expand_dims( math_ops.neg(array_ops.ones_like( cleared_and_allocated_accumulators)), 1), [1, self.params.num_splits_to_consider]) updates.append(state_ops.scatter_update( self.variables.candidate_split_features, cleared_and_allocated_accumulators, split_features_updates)) updates += self.finish_iteration() return control_flow_ops.group(*updates)
def LastValueQuantize(inputs, per_channel=False, init_min=-6.0, init_max=6.0, updates_collection=ops.GraphKeys.UPDATE_OPS, vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES, name_prefix='LastValueQuant', reuse=None, is_training=True, num_bits=8, narrow_range=False): """Adds a layer that collects quantization ranges as last input ranges. LastValueQuantize creates variables called 'min' and 'max', representing the interval used for quantization and clamping. Args: inputs: a tensor containing values to be quantized. per_channel: (Optional) a boolean specifying whether to use different quantization ranges per output channel. init_min: a float scalar, the initial value for variable min. init_max: a float scalar, the initial value for variable max. updates_collection: (Optional) collections to collect the update ops for computation. vars_collection: (Optional) collection where to store variables for quantization interval ends. name_prefix: name_prefix for created nodes. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. is_training: Whether the op is applied to a training or eval graph. num_bits: Number of bits to use for quantization, must be between 2 and 8. narrow_range: Whether to use the narrow quantization range [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1]. Returns: a tensor containing quantized values. """ with variable_scope.variable_scope( None, default_name=name_prefix, values=[inputs], reuse=reuse): input_shape = inputs.get_shape() input_dim = len(input_shape) if per_channel: # Only support quantizing 1-, 2- and 4-dimensional tensors. assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in ' ' scope: %s' % (input_shape, name_prefix)) min_max_shape = [input_shape[-1]] else: min_max_shape = [] min_var = model_variable( 'min', shape=min_max_shape, initializer=init_ops.constant_initializer(init_min), collections=[vars_collection], trainable=False) max_var = model_variable( 'max', shape=min_max_shape, initializer=init_ops.constant_initializer(init_max), collections=[vars_collection], trainable=False) if not is_training: return _FakeQuantWithMinMaxVars( inputs, min_var, max_var, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range) if per_channel: if input_dim == 2: reduce_dims = [0] elif input_dim == 4: reduce_dims = [0, 1, 2] if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( inputs, reduction_indices=reduce_dims, name='BatchMin') else: batch_min = inputs else: batch_min = math_ops.reduce_min(inputs, name='BatchMin') # TFLite requires that 0.0 if always in the [min; max] range. batch_min = math_ops.minimum(batch_min, 0.0) assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast') ops.add_to_collection(updates_collection, assign_min.op) if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( inputs, reduction_indices=reduce_dims, name='BatchMax') else: batch_max = inputs else: batch_max = math_ops.reduce_max(inputs, name='BatchMax') # TFLite requires that 0.0 if always in the [min; max] range. batch_max = math_ops.maximum(batch_max, 0.0) assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast') ops.add_to_collection(updates_collection, assign_max.op) return _FakeQuantWithMinMaxVars( inputs, assign_min, assign_max, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range)
def testSaveRestore(self): with self.test_session(): model = MyModel() optimizer = adam.AdamOptimizer(0.001) root_trackable = trackable_utils.Checkpoint(optimizer=optimizer, model=model) input_value = constant_op.constant([[3.]]) if context.executing_eagerly(): optimizer.minimize(lambda: model(input_value)) else: train_op = optimizer.minimize(model(input_value)) # TODO(allenl): Make initialization more pleasant when graph building. root_trackable.save_counter # pylint: disable=pointless-statement self.evaluate( trackable_utils.gather_initializers(root_trackable)) self.evaluate(train_op) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate( state_ops.assign(model._named_dense.variables[1], [42.])) m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") self.evaluate(state_ops.assign(m_bias_slot, [1.5])) save_path = root_trackable.save(file_prefix=prefix) self.evaluate( state_ops.assign(model._named_dense.variables[1], [43.])) self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) optimizer_variables = self.evaluate(optimizer.variables()) self.evaluate(state_ops.assign(m_bias_slot, [-2.])) # Immediate restoration status = root_trackable.restore( save_path=save_path).assert_consumed() status.run_restore_ops() self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) if not context.executing_eagerly(): return # Restore-on-create is only supported when executing eagerly on_create_model = MyModel() on_create_optimizer = adam.AdamOptimizer( 0.001, # Preserve beta1_power and beta2_power when applying gradients # so we can test that they've been restored correctly. beta1=1.0, beta2=1.0) on_create_root = trackable_utils.Checkpoint( optimizer=on_create_optimizer, model=on_create_model) # Deferred restoration status = on_create_root.restore(save_path=save_path) status.assert_nontrivial_match() status.assert_existing_objects_matched() with self.assertRaises(AssertionError): status.assert_consumed() on_create_model(constant_op.constant([[3.]])) # create variables self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) self.assertAllEqual([42.], self.evaluate( on_create_model._named_dense.variables[1])) on_create_m_bias_slot = on_create_optimizer.get_slot( on_create_model._named_dense.variables[1], "m") status.assert_existing_objects_matched() with self.assertRaises(AssertionError): status.assert_consumed() # Optimizer slot variables are created when the original variable is # restored. self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) self.assertAllEqual(optimizer_variables[2:], self.evaluate(on_create_optimizer.variables())) dummy_var = variables.Variable([1.]) on_create_optimizer.minimize(loss=dummy_var.read_value) status.assert_existing_objects_matched() status.assert_consumed() beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators( ) self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power)) self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
def testSaveRestore(self): with self.test_session(): model = MyModel() optimizer = adam.Adam(0.001) root_trackable = trackable_utils.Checkpoint(optimizer=optimizer, model=model) input_value = constant_op.constant([[3.]]) with backprop.GradientTape() as tape: loss = model(input_value) variables = model.trainable_variables gradients = tape.gradient(loss, variables) train_op = optimizer.apply_gradients(zip(gradients, variables)) self.assertFalse(root_trackable.save_counter.trainable) self.evaluate(trackable_utils.gather_initializers(root_trackable)) self.evaluate(train_op) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate( state_ops.assign(model._named_dense.variables[1], [42.])) m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") self.evaluate(state_ops.assign(m_bias_slot, [1.5])) save_path = root_trackable.save(file_prefix=prefix) self.evaluate( state_ops.assign(model._named_dense.variables[1], [43.])) self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) optimizer_variables = self.evaluate( sorted(optimizer.variables(), key=lambda v: v.name)) self.evaluate(state_ops.assign(m_bias_slot, [-2.])) # Immediate restoration status = root_trackable.restore( save_path=save_path).assert_consumed() status.run_restore_ops() self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) if not context.executing_eagerly(): return # Restore-on-create is only supported when executing eagerly on_create_model = MyModel() on_create_optimizer = adam.Adam(0.001) on_create_root = trackable_utils.Checkpoint( optimizer=on_create_optimizer, model=on_create_model) # Deferred restoration status = on_create_root.restore(save_path=save_path) status.assert_nontrivial_match() status.assert_existing_objects_matched() with self.assertRaises(AssertionError): status.assert_consumed() on_create_model(constant_op.constant([[3.]])) # create variables self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) self.assertAllEqual([42.], self.evaluate( on_create_model._named_dense.variables[1])) on_create_m_bias_slot = on_create_optimizer.get_slot( on_create_model._named_dense.variables[1], "m") status.assert_existing_objects_matched() if not context.executing_eagerly(): with self.assertRaises(AssertionError): status.assert_consumed() # Optimizer slot variables are created when the original variable is # restored. self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) dummy_var = resource_variable_ops.ResourceVariable([1.]) on_create_optimizer.minimize(loss=dummy_var.read_value, var_list=[dummy_var]) status.assert_existing_objects_matched() status.assert_consumed() self.assertAllEqual( optimizer_variables, # Creation order is different, so .variables() needs to be re-sorted. self.evaluate( sorted(optimizer.variables(), key=lambda v: v.name)))
def _update_statistics_from_mini_batch(self, statistics, auxiliary_variables, times, values): """Given mini-batch input, update `statistics` and `auxiliary_variables`.""" values = math_ops.cast(values, self._dtype) # The density (measured in times per observation) that we see in each part # of the mini-batch. batch_inter_observation_duration = ( math_ops.cast( math_ops.reduce_max(times, axis=1) - math_ops.reduce_min(times, axis=1), self._dtype) / math_ops.cast(array_ops.shape(times)[1] - 1, self._dtype)) # Co-locate updates with their variables to minimize race conditions when # updating statistics. with ops.colocate_with(auxiliary_variables.max_time_seen): # There is a race condition if this value is being updated from multiple # workers. However, it should eventually reach the correct value if the # last chunk is presented enough times. max_time_seen_assign = state_ops.assign( auxiliary_variables.max_time_seen, gen_math_ops.maximum(auxiliary_variables.max_time_seen, math_ops.reduce_max(times))) with ops.colocate_with(auxiliary_variables.chunk_count): chunk_count_assign = state_ops.assign_add( auxiliary_variables.chunk_count, array_ops.shape(times, out_type=dtypes.int64)[0]) with ops.colocate_with( auxiliary_variables.inter_observation_duration_sum): inter_observation_duration_assign = state_ops.assign_add( auxiliary_variables.inter_observation_duration_sum, math_ops.reduce_sum(batch_inter_observation_duration)) with ops.colocate_with(auxiliary_variables.example_count): example_count_assign = state_ops.assign_add( auxiliary_variables.example_count, array_ops.size(times, out_type=dtypes.int64)) # Note: These mean/variance updates assume that all points are equally # likely, which is not true if _chunks_ are sampled uniformly from the space # of all possible contiguous chunks, since points at the start and end of # the series are then members of fewer chunks. For series which are much # longer than the chunk size (the usual/expected case), this effect becomes # irrelevant. with ops.colocate_with(auxiliary_variables.overall_feature_sum): overall_feature_sum_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum, math_ops.reduce_sum(values, axis=[0, 1])) with ops.colocate_with( auxiliary_variables.overall_feature_sum_of_squares): overall_feature_sum_of_squares_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum_of_squares, math_ops.reduce_sum(values**2, axis=[0, 1])) per_chunk_aux_updates = control_flow_ops.group( max_time_seen_assign, chunk_count_assign, inter_observation_duration_assign, example_count_assign, overall_feature_sum_assign, overall_feature_sum_of_squares_assign) with ops.control_dependencies([per_chunk_aux_updates]): example_count_float = math_ops.cast( auxiliary_variables.example_count, self._dtype) new_feature_mean = (auxiliary_variables.overall_feature_sum / example_count_float) overall_feature_mean_update = state_ops.assign( statistics.overall_feature_moments.mean, new_feature_mean) overall_feature_var_update = state_ops.assign( statistics.overall_feature_moments.variance, # De-biased n / (n - 1) variance correction example_count_float / (example_count_float - 1.) * (auxiliary_variables.overall_feature_sum_of_squares / example_count_float - new_feature_mean**2)) # TODO(b/35675805): Remove this cast min_time_batch = math_ops.cast(math_ops.argmin(times[:, 0]), dtypes.int32) def series_start_updates(): # If this is the lowest-time chunk that we have seen so far, update # series start moments to reflect that. Note that these statistics are # "best effort", as there are race conditions in the update (however, # they should eventually converge if the start of the series is # presented enough times). mean, variance = nn.moments(values[ min_time_batch, :self._starting_variance_window_size], axes=[0]) return control_flow_ops.group( state_ops.assign(statistics.series_start_moments.mean, mean), state_ops.assign(statistics.series_start_moments.variance, variance)) with ops.colocate_with(statistics.start_time): series_start_update = control_flow_ops.cond( # Update moments whenever we even match the lowest time seen so far, # to ensure that series start statistics are eventually updated to # their correct values, despite race conditions (i.e. eventually # statistics.start_time will reflect the global lowest time, and # given that we will eventually update the series start moments to # their correct values). math_ops.less_equal(times[min_time_batch, 0], statistics.start_time), series_start_updates, control_flow_ops.no_op) with ops.control_dependencies([series_start_update]): # There is a race condition if this update is performed in parallel on # multiple workers. Since models may be sensitive to being presented # with times before the putative start time, the value of this # variable is post-processed above to guarantee that each worker is # presented with a start time which is at least as low as the lowest # time in its current mini-batch. start_time_update = state_ops.assign( statistics.start_time, gen_math_ops.minimum(statistics.start_time, math_ops.reduce_min(times))) inter_observation_duration_estimate = ( auxiliary_variables.inter_observation_duration_sum / math_ops.cast(auxiliary_variables.chunk_count, self._dtype)) # Estimate the total number of observations as: # (end time - start time + 1) * average intra-chunk time density total_observation_count_update = state_ops.assign( statistics.total_observation_count, math_ops.cast( gen_math_ops.round( math_ops.cast( max_time_seen_assign - start_time_update + 1, self._dtype) / inter_observation_duration_estimate), dtypes.int64)) per_chunk_stat_updates = control_flow_ops.group( overall_feature_mean_update, overall_feature_var_update, series_start_update, start_time_update, total_observation_count_update) return per_chunk_stat_updates
def _initialized_limit_check(): return control_flow_ops.cond( math_ops.reduce_any(non_decreasing), lambda: state_ops.assign(discarded_windows_limiter, 0), lambda: discarded_windows_limiter.count_up_to( self._discard_limit))
def _apply_dense(self, grad, var): lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) m = self.get_slot(var, "m") v = self.get_slot(var, "v") g = self.get_slot(var, "g") z = self.get_slot(var, "z") b1p = self.get_slot(var, "b1p") b2p = self.get_slot(var, "b2p") m_t = state_ops.assign(m, beta1_t * m + grad * (1 - beta1_t), use_locking=self._use_locking) if self._pred_g_op == 'none': v_t = state_ops.assign(v, v * beta2_t + tf.square(g) * (1 - beta2_t), use_locking=self._use_locking) elif self._pred_g_op == 'max': v_t = state_ops.assign(v, v * beta2_t + tf.reduce_max(tf.square(g)) * (1 - beta2_t), use_locking=self._use_locking) elif self._pred_g_op == 'mean': v_t = state_ops.assign(v, v * beta2_t + tf.reduce_mean(tf.square(g)) * (1 - beta2_t), use_locking=self._use_locking) else: assert False # v_t = tf.cond(tf.less(self._current_iter, tf.constant(self._init_step)), # lambda: state_ops.assign(v, v * beta2_t + (grad * grad) * (1 - beta2_t), use_locking=self._use_locking), # lambda: state_ops.assign(v, v * beta2_t + (g * g) * (1 - beta2_t), use_locking=self._use_locking)) # cond = (tf.sign(tf.cast(self._current_iter - tf.constant(self._init_step), tf.float32) + tf.constant(0.5)) + tf.constant(1.0)) / tf.constant(2.0) # v_a = v * beta2_t + (grad * grad) * (1 - beta2_t) # v_b = v * beta2_t + (g * g) * (1 - beta2_t) # v_t = state_ops.assign(v, v_a * (1 - cond) + v_b * cond, use_locking=self._use_locking) # cond = tf.abs(tf.sign(g)) # v_t = state_ops.assign(v, v * (1 - cond) + (v * beta2_t + (g * g) * (1 - beta2_t)) * cond, use_locking=self._use_locking) # v_t = state_ops.assign(v, v * beta2_t + (g * g) * (1 - beta2_t), use_locking=self._use_locking) # v_t = state_ops.assign(v, tf.maximum(grad * grad * beta2_fix, v * beta2_t + (g * g) * (1 - beta2_t)), use_locking=self._use_locking) with ops.control_dependencies([v_t]): z_t = state_ops.assign(z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32)) g_t = state_ops.assign(g, grad, use_locking=self._use_locking) b1p_t = state_ops.assign(b1p, b1p * beta1_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)), use_locking=self._use_locking) b2p_t = state_ops.assign(b2p, b2p * beta2_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)), use_locking=self._use_locking) b1_fix = tf.maximum(1e-8, 1.0 - b1p_t) b2_fix = tf.maximum(1e-8, 1.0 - b2p_t) step_t = z_t * (m_t / b1_fix) / (math_ops.sqrt(v_t / b2_fix) + epsilon_t) # if var.name == self.first_var.name: #'discriminator/final_linear/w:0': # idx = 0 # step_t = tf.Print(step_t, [z_t[idx]], 'z_t', summarize=1000) # step_t = tf.Print(step_t, [g[idx]], 'g', summarize=1000) # step_t = tf.Print(step_t, [grad[idx]], 'grad', summarize=1000) # step_t = tf.Print(step_t, [b2p_t[idx]], 'b2p_t', summarize=1000) # step_t = tf.Print(step_t, [b2_fix], 'beta2_fix', summarize=1000) # step_t = tf.Print(step_t, [tf.sqrt(v_t / b2_fix)[idx]], 'v_t', summarize=1000) # step_t = tf.Print(step_t, [step_t], 'step', summarize=1000) var_update = state_ops.assign_sub(var, lr_t * step_t, use_locking=self._use_locking) return control_flow_ops.group(*[var_update, g_t])
def set_model(self, model): """Sets Keras model and creates summary ops.""" self.model = model self._init_writer(model) # histogram summaries only enabled in graph mode if not context.executing_eagerly(): self._make_histogram_ops(model) self.merged = tf_summary.merge_all() # If both embedding_freq and embeddings_data are available, we will # visualize embeddings. if self.embeddings_freq and self.embeddings_data is not None: # Avoid circular dependency. from tensorflow.python.keras.engine import training_utils_v1 # pylint: disable=g-import-not-at-top self.embeddings_data = training_utils_v1.standardize_input_data( self.embeddings_data, model.input_names) # If embedding_layer_names are not provided, get all of the embedding # layers from the model. embeddings_layer_names = self.embeddings_layer_names if not embeddings_layer_names: embeddings_layer_names = [ layer.name for layer in self.model.layers if type(layer).__name__ == 'Embedding' ] self.assign_embeddings = [] embeddings_vars = {} self.batch_id = batch_id = array_ops.placeholder(dtypes.int32) self.step = step = array_ops.placeholder(dtypes.int32) for layer in self.model.layers: if layer.name in embeddings_layer_names: embedding_input = self.model.get_layer(layer.name).output embedding_size = np.prod(embedding_input.shape[1:]) embedding_input = array_ops.reshape( embedding_input, (step, int(embedding_size))) shape = (self.embeddings_data[0].shape[0], int(embedding_size)) embedding = variables.Variable(array_ops.zeros(shape), name=layer.name + '_embedding') embeddings_vars[layer.name] = embedding batch = state_ops.assign( embedding[batch_id:batch_id + step], embedding_input) self.assign_embeddings.append(batch) self.saver = saver.Saver(list(embeddings_vars.values())) # Create embeddings_metadata dictionary if isinstance(self.embeddings_metadata, str): embeddings_metadata = { layer_name: self.embeddings_metadata for layer_name in embeddings_vars.keys() } else: # If embedding_metadata is already a dictionary embeddings_metadata = self.embeddings_metadata try: from tensorboard.plugins import projector except ImportError: raise ImportError( 'Failed to import TensorBoard. Please make sure that ' 'TensorBoard integration is complete."') # TODO(psv): Add integration tests to test embedding visualization # with TensorBoard callback. We are unable to write a unit test for this # because TensorBoard dependency assumes TensorFlow package is installed. config = projector.ProjectorConfig() for layer_name, tensor in embeddings_vars.items(): embedding = config.embeddings.add() embedding.tensor_name = tensor.name if (embeddings_metadata is not None and layer_name in embeddings_metadata): embedding.metadata_path = embeddings_metadata[layer_name] projector.visualize_embeddings(self.writer, config)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_dtype = var.dtype.base_dtype lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype)) beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') local_step = math_ops.cast(self.iterations + 1, var_dtype) next_step = math_ops.cast(self.iterations + 2, var_dtype) decay_base = math_ops.cast(0.96, var_dtype) # Learning rate multipliers if self.lr_multipliers is not None: lr_t = _apply_lr_multiplier(self, lr_t, var) momentum_cache_t = beta_1_t * ( 1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * local_step))) momentum_cache_t_1 = beta_1_t * ( 1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * next_step))) m_schedule_new = math_ops.cast(self._m_cache_read, var_dtype) * momentum_cache_t if var_dtype is self._m_cache.dtype: m_schedule_new = array_ops.identity( state_ops.assign(self._m_cache, m_schedule_new, use_locking=self._use_locking)) m_schedule_next = m_schedule_new * momentum_cache_t_1 m_scaled_g_values = grad * (1. - beta_1_t) m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking) with ops.control_dependencies([m_t]): m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) m_t_slice = array_ops.gather(m_t, indices) m_t_prime = m_t_slice / (1. - m_schedule_next) g_prime = grad / (1. - m_schedule_new) m_t_bar = (1. - momentum_cache_t) * g_prime + (momentum_cache_t_1 * m_t_prime) v_scaled_g_values = (grad * grad) * (1. - beta_2_t) v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) v_t_slice = array_ops.gather(v_t, indices) v_t_prime_denominator = 1. - math_ops.pow(beta_2_t, local_step) v_t_prime = v_t_slice / v_t_prime_denominator v_prime_sqrt_plus_eps = math_ops.sqrt(v_t_prime) + epsilon_t var_t = self._resource_scatter_add( var, indices, -self.eta_t * lr_t * m_t_bar / v_prime_sqrt_plus_eps) # Weight decays if var.name in self.weight_decays.keys(): var_t = _apply_weight_decays(self, var, var_t) var_update = state_ops.assign(var, var_t, use_locking=self._use_locking) # Cosine annealing (iteration_done, t_cur_update, eta_t_update) = _update_t_cur_eta_t_v2(self, lr_t, var) if iteration_done and not self._init_notified: self._init_notified = True updates = [var_update, m_t_bar, v_t] if iteration_done: updates += [t_cur_update] if self.use_cosine_annealing and iteration_done: updates += [eta_t_update] return control_flow_ops.group(*updates)
def streaming_precision_recall_arrays(n_gbboxes, rclasses, rscores, tp_tensor, fp_tensor, remove_zero_labels=True, metrics_collections=None, updates_collections=None, name=None): """Streaming computation of precision / recall arrays. This metrics keeps tracks of boolean True positives and False positives arrays. """ with variable_scope.variable_scope( name, 'stream_precision_recall', [n_gbboxes, rclasses, tp_tensor, fp_tensor]): n_gbboxes = math_ops.to_int64(n_gbboxes) rclasses = math_ops.to_int64(rclasses) rscores = math_ops.to_float(rscores) stype = tf.int32 tp_tensor = tf.cast(tp_tensor, stype) fp_tensor = tf.cast(fp_tensor, stype) # Reshape TP and FP tensors and clean away 0 class values. rclasses = tf.reshape(rclasses, [-1]) rscores = tf.reshape(rscores, [-1]) tp_tensor = tf.reshape(tp_tensor, [-1]) fp_tensor = tf.reshape(fp_tensor, [-1]) if remove_zero_labels: mask = tf.greater(rclasses, 0) rclasses = tf.boolean_mask(rclasses, mask) rscores = tf.boolean_mask(rscores, mask) tp_tensor = tf.boolean_mask(tp_tensor, mask) fp_tensor = tf.boolean_mask(fp_tensor, mask) # Local variables accumlating information over batches. v_nobjects = _create_local('v_nobjects', shape=[], dtype=tf.int64) v_ndetections = _create_local('v_ndetections', shape=[], dtype=tf.int32) v_scores = _create_local('v_scores', shape=[ 0, ]) v_tp = _create_local('v_tp', shape=[ 0, ], dtype=stype) v_fp = _create_local('v_fp', shape=[ 0, ], dtype=stype) # Update operations. nobjects_op = state_ops.assign_add(v_nobjects, tf.reduce_sum(n_gbboxes)) ndetections_op = state_ops.assign_add( v_ndetections, tf.size(rscores, out_type=tf.int32)) scores_op = state_ops.assign(v_scores, tf.concat([v_scores, rscores], axis=0), validate_shape=False) tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp_tensor], axis=0), validate_shape=False) fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp_tensor], axis=0), validate_shape=False) # Precision and recall computations. # r = _precision_recall(nobjects_op, scores_op, tp_op, fp_op, 'value') r = _precision_recall(v_nobjects, v_ndetections, v_scores, v_tp, v_fp, 'value') with ops.control_dependencies( [nobjects_op, ndetections_op, scores_op, tp_op, fp_op]): update_op = _precision_recall(nobjects_op, ndetections_op, scores_op, tp_op, fp_op, 'update_op') # update_op = tf.Print(update_op, # [tf.reduce_sum(tf.cast(mask, tf.int64)), # tf.reduce_sum(tf.cast(mask2, tf.int64)), # tf.reduce_min(rscores), # tf.reduce_sum(n_gbboxes)], # 'Metric: ') # Some debugging stuff! # update_op = tf.Print(update_op, # [tf.shape(tp_op), # tf.reduce_sum(tf.cast(tp_op, tf.int64), axis=0)], # 'TP and FP shape: ') # update_op[0] = tf.Print(update_op, # [nobjects_op], # '# Groundtruth bboxes: ') # update_op = tf.Print(update_op, # [update_op[0][0], # update_op[0][-1], # tf.reduce_min(update_op[0]), # tf.reduce_max(update_op[0]), # tf.reduce_min(update_op[1]), # tf.reduce_max(update_op[1])], # 'Precision and recall :') if metrics_collections: ops.add_to_collections(metrics_collections, r) if updates_collections: ops.add_to_collections(updates_collections, update_op) return r, update_op
def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size, num_layers, direction): # Verify the restored opaque param, once converted to tf_canonical format, # is the same as the tf canonicals of the pre-restored param. if not context.context().num_gpus(): self.skipTest("No GPUs found") with self.session(use_gpu=True) as sess: opaque_params = [] saveables = [] num_opaque_params = 2 for i in range(num_opaque_params): opaque_params.append( self._create_opaque_param( rnn_mode, num_units, input_size, num_layers, direction, name="opaque_param_%d" % i)) saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units, input_size, num_layers, direction) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) saveables.append(saveable) weights_ops, biases_ops = [], [] for i in range(num_opaque_params): weights_op, biases_op = ( saveables[i].format_converter.opaque_to_tf_canonical( saveables[i]._variables)) weights_ops.append(weights_op) biases_ops.append(biases_op) save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) init_op = variables.global_variables_initializer() reset_ops = [] for i in range(num_opaque_params): reset_ops.append( state_ops.assign(opaque_params[i], array_ops.zeros_like(opaque_params[i]))) sess.run(init_op) self.assertEqual(save_path, saver.save(sess, save_path)) # Get the tf canonical vals before reset-restore for i in range(num_opaque_params): weights, biases = sess.run([weights_ops[i], biases_ops[i]]) # Reset the opaque param value sess.run(reset_ops[i]) # Assert reset happened. weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]]) for w in weights_z: self.assertAllClose(w, np.zeros_like(w)) for b in biases_z: self.assertAllClose(b, np.zeros_like(b)) # Restore opaque param value from checkpoint. saver.restore(sess, save_path) weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]]) self._compare_weights(weights, weights_r) self._compare_biases(biases, biases_r)