def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
  if is_sparse:
    var0 = tf.Variable([[0.0], [0.0]], dtype=dtype)
    var1 = tf.Variable([[0.0], [0.0]], dtype=dtype)
    grads0 = tf.IndexedSlices(
        tf.constant([0.1], shape=[1, 1], dtype=dtype), tf.constant([0]),
        tf.constant([2, 1]))
    grads1 = tf.IndexedSlices(
        tf.constant([0.02], shape=[1, 1], dtype=dtype), tf.constant([1]),
        tf.constant([2, 1]))
  else:
    var0 = tf.Variable([0.0, 0.0], dtype=dtype)
    var1 = tf.Variable([0.0, 0.0], dtype=dtype)
    grads0 = tf.constant([0.1, 0.2], dtype=dtype)
    grads1 = tf.constant([0.01, 0.02], dtype=dtype)

  update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
  self.evaluate(tf.compat.v1.global_variables_initializer())
  v0_val, v1_val = self.evaluate([var0, var1])
  if is_sparse:
    self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
    self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
  else:
    self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
    self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)

  # Run Ftrl for a few steps
  for _ in range(steps):
    update.run()

  v0_val, v1_val = self.evaluate([var0, var1])
  return v0_val, v1_val
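# A minimal sketch of how this helper might be driven, assuming the
# `ftrl.Ftrl` class used elsewhere in these tests (hyperparameters are
# arbitrary). Because the helper calls `update.run()`, it needs a
# graph-mode session:
#
#   with tf.Graph().as_default(), self.cached_session():
#     v0, v1 = self.applyOptimizer(
#         ftrl.Ftrl(3.0, initial_accumulator_value=0.1),
#         tf.float32, steps=10, is_sparse=True)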
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}. '
        f'This usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)

  var_key = self._var_key(variable)
  velocity = self._velocities[self._index_dict[var_key]]
  momentum = None
  if self.momentum > 0:
    momentum = self._momentums[self._index_dict[var_key]]
  average_grad = None
  if self.centered:
    average_grad = self._average_gradients[self._index_dict[var_key]]

  rho = self.rho
  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    velocity.assign(rho * velocity)
    velocity.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - rho), gradient.indices))
    if self.centered:
      # Centered RMSprop tracks a moving average of the gradient itself
      # (not its square), and the variance estimate goes into a temporary
      # denominator rather than being written back into `velocity`.
      average_grad.assign(rho * average_grad)
      average_grad.scatter_add(
          tf.IndexedSlices(gradient.values * (1 - rho), gradient.indices))
      denominator = velocity - tf.square(average_grad)
    else:
      denominator = velocity
    denominator_slice = tf.gather(denominator, gradient.indices)
    transformed_grad = tf.IndexedSlices(
        gradient.values / (tf.sqrt(denominator_slice) + self.epsilon),
        gradient.indices)
    if self.momentum > 0:
      momentum.assign(self.momentum * momentum)
      momentum.scatter_add(transformed_grad)
      variable.assign_add(-lr * momentum)
    else:
      variable.scatter_add(
          tf.IndexedSlices(-lr * transformed_grad.values,
                           transformed_grad.indices))
  else:
    # Dense gradients.
    velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient))
    if self.centered:
      average_grad.assign(rho * average_grad + (1 - rho) * gradient)
      denominator = velocity - tf.square(average_grad)
    else:
      denominator = velocity
    transformed_grad = gradient / (tf.sqrt(denominator) + self.epsilon)
    if self.momentum > 0:
      momentum.assign(self.momentum * momentum + transformed_grad)
      variable.assign_add(-lr * momentum)
    else:
      variable.assign_add(-lr * transformed_grad)
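# For reference, a minimal NumPy sketch of the dense, momentum-free
# centered-RMSprop step above; the hyperparameter defaults are assumptions
# for illustration, not an authoritative API.
import numpy as np

def rmsprop_dense_step(var, g, vel, avg, lr=0.001, rho=0.9, eps=1e-7,
                       centered=True):
  """One centered RMSprop update on dense NumPy arrays."""
  vel = rho * vel + (1 - rho) * g**2      # EWMA of g^2
  if centered:
    avg = rho * avg + (1 - rho) * g       # EWMA of g
    denom = vel - avg**2                  # variance estimate of g
  else:
    denom = vel
  var = var - lr * g / (np.sqrt(denom) + eps)
  return var, vel, avg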
def testSparseBasic(self):
  if tf.executing_eagerly():
    return
  for dtype in [tf.half, tf.float32, tf.float64]:
    with self.cached_session():
      var0 = tf.Variable([[1.1], [2.1]], dtype=dtype)
      var1 = tf.Variable([[3.], [4.]], dtype=dtype)
      grads0 = tf.IndexedSlices(
          tf.constant([0.1], shape=[1, 1], dtype=dtype), tf.constant([0]),
          tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(
          tf.constant([0.01], shape=[1, 1], dtype=dtype), tf.constant([1]),
          tf.constant([2, 1]))
      decay_rate = 0.9
      sgd_op = tfp.optimizer.StochasticGradientLangevinDynamics(
          3., preconditioner_decay_rate=decay_rate).apply_gradients(
              zip([grads0, grads1], [var0, var1]))

      self.evaluate(tf1.global_variables_initializer())
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([[1.1], [2.1]],
                                         self.evaluate(var0))
      self.assertAllCloseAccordingToType([[3.], [4.]], self.evaluate(var1))

      # Run 1 step of sgd
      self.evaluate(sgd_op)

      # Validate updated params
      grads_scaled = (0.5 * 0.1 /
                      np.sqrt(decay_rate + (1. - decay_rate) * 0.1**2 + 1e-8))
      # Note that `tfp.math.diag_jacobian(xs=var, ys=grad)` returns a zero
      # tensor
      self.assertAllCloseAccordingToType([[1.1 - 3. * grads_scaled], [2.1]],
                                         self.evaluate(var0))
      grads_scaled = (0.5 * 0.01 / np.sqrt(
          decay_rate + (1. - decay_rate) * 0.01**2 + 1e-8))
      self.assertAllCloseAccordingToType(
          [[3. - 3. * 0], [4. - 3. * grads_scaled]], self.evaluate(var1))
def testSparseBasicWithLearningRateDecay(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in [tf.half, tf.float32, tf.float64]:
      var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
      var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
      grads0 = tf.IndexedSlices(
          tf.constant([0.1], shape=[1, 1], dtype=dtype), tf.constant([0]),
          tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(
          tf.constant([0.01], shape=[1, 1], dtype=dtype), tf.constant([1]),
          tf.constant([2, 1]))
      sgd_op = gradient_descent.SGD(
          3.0, decay=0.5).apply_gradients(
              zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # Run 2 steps of sgd
      self.evaluate(sgd_op)
      # Validate updated params
      self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
                                         self.evaluate(var0))
      self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                         self.evaluate(var1))

      self.evaluate(sgd_op)
      # Validate updated params
      self.assertAllCloseAccordingToType(
          [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
      self.assertAllCloseAccordingToType(
          [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
def testSparseRepeatedIndices(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in _DATA_TYPES:
      var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)

      repeated_index_update_var = tf.Variable(var_np, dtype=dtype)
      aggregated_update_var = tf.Variable(var_np, dtype=dtype)
      grad_repeated_index = tf.IndexedSlices(
          tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
          tf.constant([1, 1]), tf.constant([2, 1]))
      grad_aggregated = tf.IndexedSlices(
          tf.constant([0.2], shape=[1, 1], dtype=dtype),
          tf.constant([1]), tf.constant([2, 1]))
      repeated_update = adagrad.Adagrad(3.0).apply_gradients(
          [(grad_repeated_index, repeated_index_update_var)])
      aggregated_update = adagrad.Adagrad(3.0).apply_gradients(
          [(grad_aggregated, aggregated_update_var)])
      self.evaluate(tf.compat.v1.global_variables_initializer())
      self.assertAllClose(
          self.evaluate(aggregated_update_var),
          self.evaluate(repeated_index_update_var))
      for _ in range(3):
        self.evaluate(repeated_update)
        self.evaluate(aggregated_update)
        self.assertAllClose(
            self.evaluate(aggregated_update_var),
            self.evaluate(repeated_index_update_var))
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
  var_device, var_dtype = var.device, var.dtype.base_dtype
  coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                  self._fallback_apply_state(var_device, var_dtype))

  # m_t = beta1 * m + (1 - beta1) * g_t
  m = self.get_slot(var, 'm')
  m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
  m.assign(m * coefficients['beta_1_t'])
  m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))

  # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
  v = self.get_slot(var, 'v')
  v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
  v.assign(v * coefficients['beta_2_t'])
  v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))

  if not self.amsgrad:
    var.assign_sub(coefficients['lr'] * m /
                   (tf.sqrt(v) + coefficients['epsilon']))
  else:
    v_hat = self.get_slot(var, 'vhat')
    v_hat.assign(tf.maximum(v_hat, v))
    var.assign_sub(coefficients['lr'] * m /
                   (tf.sqrt(v_hat) + coefficients['epsilon']))
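# The split moment update above (decay the whole slot, then scatter in the
# scaled gradient values) decays *every* row of `m`, not only the touched
# rows. A toy eager-mode check of those two steps, with made-up values:
import tensorflow as tf

m = tf.Variable([[1.0], [1.0]])
beta_1 = 0.9
g_values, indices = tf.constant([[1.0]]), tf.constant([1])
m.assign(m * beta_1)
m.scatter_add(tf.IndexedSlices(g_values * (1 - beta_1), indices))
# m is now [[0.9], [1.0]]: row 0 only decayed, row 1 = 0.9*1.0 + 0.1*1.0.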
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}. '
        f'This usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._m[self._index_dict[var_key]]
  u = self._u[self._index_dict[var_key]]

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    indices = gradient.indices
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
    u.assign(u * self.beta_2)
    u_slice = tf.gather(u, indices)
    u_slice_incremental = (
        tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice)
    u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._m[self._index_dict[var_key]]
  u = self._u[self._index_dict[var_key]]

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    indices = gradient.indices
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
    u.assign(u * self.beta_2)
    u_slice = tf.gather(u, indices)
    u_slice_incremental = (
        tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice)
    u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
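# A minimal NumPy sketch of the dense Adamax branch above, assuming a
# 1-based step count `t`; illustration only, not the Keras API.
import numpy as np

def adamax_dense_step(var, g, m, u, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                      eps=1e-7):
  m = m + (g - m) * (1 - beta_1)          # first-moment EWMA
  u = np.maximum(beta_2 * u, np.abs(g))   # exponentially weighted inf-norm
  var = var - (lr * m) / ((1 - beta_1**t) * (u + eps))
  return var, m, u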
def testSparseRepeatedIndices(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  for dtype in [tf.half, tf.float32, tf.float64]:
    with tf.Graph().as_default(), self.cached_session():
      repeated_index_update_var = tf.Variable([[1.0], [2.0]], dtype=dtype)
      aggregated_update_var = tf.Variable([[1.0], [2.0]], dtype=dtype)
      grad_repeated_index = tf.IndexedSlices(
          tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
          tf.constant([1, 1]), tf.constant([2, 1]))
      grad_aggregated = tf.IndexedSlices(
          tf.constant([0.2], shape=[1, 1], dtype=dtype),
          tf.constant([1]), tf.constant([2, 1]))
      repeated_update = adamax.Adamax().apply_gradients(
          [(grad_repeated_index, repeated_index_update_var)])
      aggregated_update = adamax.Adamax().apply_gradients(
          [(grad_aggregated, aggregated_update_var)])
      self.evaluate(tf.compat.v1.global_variables_initializer())
      self.assertAllClose(aggregated_update_var,
                          repeated_index_update_var.eval())
      for _ in range(3):
        repeated_update.run()
        aggregated_update.run()
        self.assertAllClose(aggregated_update_var,
                            repeated_index_update_var.eval())
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  var_dtype = variable.dtype
  lr = tf.cast(self.learning_rate, var_dtype)
  local_step = tf.cast(self.iterations + 1, var_dtype)
  next_step = tf.cast(self.iterations + 2, var_dtype)
  decay = tf.cast(0.96, var_dtype)
  beta_1 = tf.cast(self.beta_1, var_dtype)
  beta_2 = tf.cast(self.beta_2, var_dtype)
  u_t = beta_1 * (1.0 - 0.5 * (tf.pow(decay, local_step)))
  u_t_1 = beta_1 * (1.0 - 0.5 * (tf.pow(decay, next_step)))

  def get_cached_u_product():
    return self._u_product

  def compute_new_u_product():
    u_product_t = self._u_product * u_t
    self._u_product.assign(u_product_t)
    self._u_product_counter += 1
    return u_product_t

  u_product_t = tf.cond(
      self._u_product_counter == (self.iterations + 2),
      true_fn=get_cached_u_product,
      false_fn=compute_new_u_product)
  u_product_t_1 = u_product_t * u_t_1
  beta_2_power = tf.pow(beta_2, local_step)

  var_key = self._var_key(variable)
  m = self._momentums[self._index_dict[var_key]]
  v = self._velocities[self._index_dict[var_key]]

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    m.assign_add(-m * (1 - beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - beta_1), gradient.indices))
    v.assign_add(-v * (1 - beta_2))
    v.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - beta_2), gradient.indices))
    m_hat = (u_t_1 * m / (1 - u_product_t_1) +
             (1 - u_t) * gradient / (1 - u_product_t))
    v_hat = v / (1 - beta_2_power)
    variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - beta_1))
    v.assign_add((tf.square(gradient) - v) * (1 - beta_2))
    m_hat = (u_t_1 * m / (1 - u_product_t_1) +
             (1 - u_t) * gradient / (1 - u_product_t))
    v_hat = v / (1 - beta_2_power)
    variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
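# A minimal NumPy sketch of the dense Nadam branch above. `t` is the
# 1-based step and `u_prod` carries the running product of u_1..u_{t-1}
# (initially 1.0), mirroring what `self._u_product` caches; illustration
# only, with assumed hyperparameter defaults.
import numpy as np

def nadam_dense_step(var, g, m, v, u_prod, t, lr=0.001, beta_1=0.9,
                     beta_2=0.999, eps=1e-7, decay=0.96):
  u_t = beta_1 * (1.0 - 0.5 * decay**t)
  u_t_1 = beta_1 * (1.0 - 0.5 * decay**(t + 1))
  u_prod = u_prod * u_t                   # now the product of u_1..u_t
  u_prod_1 = u_prod * u_t_1
  m = m + (g - m) * (1 - beta_1)
  v = v + (g**2 - v) * (1 - beta_2)
  m_hat = u_t_1 * m / (1 - u_prod_1) + (1 - u_t) * g / (1 - u_prod)
  v_hat = v / (1 - beta_2**t)
  var = var - lr * m_hat / (np.sqrt(v_hat) + eps)
  return var, m, v, u_prod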
def testResourceSparse(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  for dtype in [tf.half, tf.float32, tf.float64]:
    with tf.Graph().as_default(), self.cached_session():
      # Initialize variables for numpy implementation.
      zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)  # pylint: disable=cell-var-from-loop
      m0, v0, m1, v1 = (zero_slots(), zero_slots(), zero_slots(),
                        zero_slots())
      var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
      grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
      var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
      grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

      var0 = tf.Variable(var0_np)
      var1 = tf.Variable(var1_np)
      grads0_np_indices = np.array([0, 1], dtype=np.int32)
      grads0 = tf.IndexedSlices(
          tf.constant(grads0_np), tf.constant(grads0_np_indices),
          tf.constant([3]))
      grads1_np_indices = np.array([2, 1], dtype=np.int32)
      grads1 = tf.IndexedSlices(
          tf.constant(grads1_np), tf.constant(grads1_np_indices),
          tf.constant([3]))
      opt = adamax.Adamax()
      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # Fetch params to validate initial values
      self.assertAllClose([1.0, 2.0, 3.0], var0)
      self.assertAllClose([4.0, 5.0, 6.0], var1)

      beta1_power = get_beta_accumulators(opt, dtype)

      # Run 3 steps of Adamax
      for t in range(3):
        self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power)
        update.run()

        var0_np, m0, v0 = adamax_sparse_update_numpy(
            var0_np, grads0_np_indices, grads0_np, t, m0, v0)
        var1_np, m1, v1 = adamax_sparse_update_numpy(
            var1_np, grads1_np_indices, grads1_np, t, m1, v1)

        # Validate updated params
        self.assertAllCloseAccordingToType(var0_np, var0)
        self.assertAllCloseAccordingToType(var1_np, var1)
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)

  var_key = self._var_key(variable)
  velocity = self._velocities[self._index_dict[var_key]]
  momentum = None
  if self.momentum > 0:
    momentum = self._momentums[self._index_dict[var_key]]
  average_grad = None
  if self.centered:
    average_grad = self._average_gradients[self._index_dict[var_key]]

  rho = self.rho
  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    velocity.assign(rho * velocity)
    velocity.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - rho), gradient.indices))
    if self.centered:
      # Track the mean gradient (not its square) for the centered variant,
      # and keep the variance estimate out of the `velocity` slot.
      average_grad.assign(rho * average_grad)
      average_grad.scatter_add(
          tf.IndexedSlices(gradient.values * (1 - rho), gradient.indices))
      denominator = velocity - tf.square(average_grad)
    else:
      denominator = velocity
    denominator_slice = tf.gather(denominator, gradient.indices)
    transformed_grad = tf.IndexedSlices(
        gradient.values / (tf.sqrt(denominator_slice) + self.epsilon),
        gradient.indices)
    if self.momentum > 0:
      momentum.assign(self.momentum * momentum)
      momentum.scatter_add(transformed_grad)
      variable.assign_add(-lr * momentum)
    else:
      variable.scatter_add(
          tf.IndexedSlices(-lr * transformed_grad.values,
                           transformed_grad.indices))
  else:
    # Dense gradients.
    velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient))
    if self.centered:
      average_grad.assign(rho * average_grad + (1 - rho) * gradient)
      denominator = velocity - tf.square(average_grad)
    else:
      denominator = velocity
    transformed_grad = gradient / (tf.sqrt(denominator) + self.epsilon)
    if self.momentum > 0:
      momentum.assign(self.momentum * momentum + transformed_grad)
      variable.assign_add(-lr * momentum)
    else:
      variable.assign_add(-lr * transformed_grad)
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
  beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._momentums[self._index_dict[var_key]]
  v = self._velocities[self._index_dict[var_key]]
  alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

  # Apply step weight decay
  if (self.weight_decay != 0 and
      variable not in self._exclude_from_weight_decay):
    wd = tf.cast(self.weight_decay, variable.dtype)
    variable.assign_sub(variable * wd)

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1),
                         gradient.indices))
    v.assign_add(-v * (1 - self.beta_2))
    v.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - self.beta_2),
            gradient.indices))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}. '
        f'This usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
  beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._momentums[self._index_dict[var_key]]
  v = self._velocities[self._index_dict[var_key]]
  alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

  # Apply step weight decay. Decoupled weight decay subtracts
  # `lr * wd * variable`; the original `variable * (1 - lr * wd)` would
  # have subtracted nearly the entire variable at every step.
  if self.weight_decay != 0:
    wd = tf.cast(self.weight_decay, variable.dtype)
    variable.assign_sub(variable * wd * lr)

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1),
                         gradient.indices))
    v.assign_add(-v * (1 - self.beta_2))
    v.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - self.beta_2),
            gradient.indices))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
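# A minimal NumPy sketch of the dense Adam step above (weight decay and
# amsgrad omitted), assuming a 1-based step count `t`; illustration only.
import numpy as np

def adam_dense_step(var, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                    eps=1e-7):
  m = m + (g - m) * (1 - beta_1)
  v = v + (g**2 - v) * (1 - beta_2)
  alpha = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  return var - m * alpha / (np.sqrt(v) + eps), m, v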
def testSparseBasic(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in _DATA_TYPES:
      var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
      grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
      var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
      grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)

      var0 = tf.Variable(var0_np)
      var1 = tf.Variable(var1_np)
      grads0_np_indices = np.array([0, 2], dtype=np.int32)
      grads0 = tf.IndexedSlices(
          tf.constant(grads0_np[grads0_np_indices]),
          tf.constant(grads0_np_indices), tf.constant([3]))
      grads1_np_indices = np.array([0, 2], dtype=np.int32)
      grads1 = tf.IndexedSlices(
          tf.constant(grads1_np[grads1_np_indices]),
          tf.constant(grads1_np_indices), tf.constant([3]))
      learning_rate = 3.0
      ada_opt = adagrad.Adagrad(learning_rate)
      ada_update = ada_opt.apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # Fetch params to validate initial values
      self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
      self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))

      accum0_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
      accum1_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)

      # Run 3 steps of adagrad
      for _ in range(3):
        self.evaluate(ada_update)

        var0_np, accum0_np = sparse_adagrad_update_numpy(
            var0_np, accum0_np, grads0_np_indices,
            grads0_np[grads0_np_indices], learning_rate)
        var1_np, accum1_np = sparse_adagrad_update_numpy(
            var1_np, accum1_np, grads1_np_indices,
            grads1_np[grads1_np_indices], learning_rate)
        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def testSparseWithAmsgrad(self):
  # dtypes.half does not work on gpu + eager.
  for dtype in [tf.float32, tf.float64]:
    with self.cached_session():
      m0 = np.array([[0.0], [0.0]])
      v0 = np.array([[0.0], [0.0]])
      v0hat = np.array([[0.0], [0.0]])
      indices_np = np.array([1])
      indices = tf.constant(indices_np, dtype=tf.int32)
      var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
      repeated_index_update_var = tf.Variable(var0_np, dtype=dtype)
      aggregated_update_var = tf.Variable(var0_np, dtype=dtype)
      grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype)
      grad_repeated_index = tf.IndexedSlices(
          tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
          tf.constant([1, 1]), tf.constant([2, 1]))
      grad_aggregated = tf.IndexedSlices(grads0_np, indices,
                                         tf.constant([2, 1]))
      opt_repeated = adam.NonFusedAdam(amsgrad=True)
      opt_aggregated = adam.NonFusedAdam(amsgrad=True)
      if not tf.executing_eagerly():
        repeated_update = opt_repeated.apply_gradients(
            [(grad_repeated_index, repeated_index_update_var)])
        aggregated_update = opt_aggregated.apply_gradients(
            [(grad_aggregated, aggregated_update_var)])
      self.evaluate(tf.compat.v1.global_variables_initializer())
      self.assertAllClose(
          self.evaluate(aggregated_update_var),
          self.evaluate(repeated_index_update_var))
      for t in range(3):
        if not tf.executing_eagerly():
          self.evaluate(repeated_update)
          self.evaluate(aggregated_update)
        else:
          opt_repeated.apply_gradients(
              [(grad_repeated_index, repeated_index_update_var)])
          opt_aggregated.apply_gradients(
              [(grad_aggregated, aggregated_update_var)])
        var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad(
            var0_np, indices_np, grads0_np, t, m0, v0, v0hat)

        # Validate updated params
        self.assertAllCloseAccordingToType(
            var0_np, self.evaluate(aggregated_update_var))
        self.assertAllCloseAccordingToType(
            self.evaluate(aggregated_update_var),
            self.evaluate(repeated_index_update_var))
def update_step(self, grad, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}. '
        f'This usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)

  var_key = self._var_key(variable)
  rho = self.rho
  accumulated_grad = self._accumulated_grads[self._index_dict[var_key]]
  accumulated_delta_var = self._accumulated_delta_vars[
      self._index_dict[var_key]]

  def rms(x):
    return tf.sqrt(x + self.epsilon)

  if isinstance(grad, tf.IndexedSlices):
    # Sparse gradients.
    accumulated_grad.assign_add((rho - 1) * accumulated_grad)
    accumulated_grad.scatter_add(
        tf.IndexedSlices((1 - rho) * tf.square(grad.values), grad.indices))
    # Note: multiplying the dense `rms(...)` tensor by `grad` implicitly
    # converts the IndexedSlices to a dense tensor here.
    delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
    accumulated_delta_var.assign(rho * accumulated_delta_var +
                                 (1 - rho) * delta_var * delta_var)
  else:
    # Dense gradients.
    accumulated_grad.assign(rho * accumulated_grad + (1 - rho) * grad * grad)
    delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
    accumulated_delta_var.assign(rho * accumulated_delta_var +
                                 (1 - rho) * delta_var * delta_var)
  variable.assign_add(lr * delta_var)
def run_sparse_sample(iterations, expected, optimizer):
  var_0 = tf.Variable([1.0, 2.0])
  var_1 = tf.Variable([3.0, 4.0])

  grad_0 = tf.IndexedSlices(
      tf.constant([0.1]), tf.constant([0]), tf.constant([2]))
  grad_1 = tf.IndexedSlices(
      tf.constant([0.04]), tf.constant([1]), tf.constant([2]))

  grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1]))

  for _ in range(iterations):
    optimizer.apply_gradients(grads_and_vars)

  np.testing.assert_allclose(var_0.read_value(), expected[0], atol=2e-4)
  np.testing.assert_allclose(var_1.read_value(), expected[1], atol=2e-4)
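# A possible call site for the helper above, assuming eager execution and
# the stock Keras SGD optimizer. With lr = 0.1 and two steps, only the
# touched rows move: 1.0 - 2*0.1*0.1 = 0.98 and 4.0 - 2*0.1*0.04 = 3.992.
run_sparse_sample(
    iterations=2,
    expected=[[0.98, 2.0], [3.0, 3.992]],
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.1))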
def update_step(self, grad, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)

  var_key = self._var_key(variable)
  rho = self.rho
  accumulated_grad = self._accumulated_grads[self._index_dict[var_key]]
  accumulated_delta_var = self._accumulated_delta_vars[
      self._index_dict[var_key]]

  def rms(x):
    return tf.sqrt(x + self.epsilon)

  if isinstance(grad, tf.IndexedSlices):
    # Sparse gradients.
    accumulated_grad.assign_add((rho - 1) * accumulated_grad)
    accumulated_grad.scatter_add(
        tf.IndexedSlices((1 - rho) * tf.square(grad.values), grad.indices))
    delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
    accumulated_delta_var.assign(rho * accumulated_delta_var +
                                 (1 - rho) * delta_var * delta_var)
  else:
    # Dense gradients.
    accumulated_grad.assign(rho * accumulated_grad + (1 - rho) * grad * grad)
    delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
    accumulated_delta_var.assign(rho * accumulated_delta_var +
                                 (1 - rho) * delta_var * delta_var)
  variable.assign_add(lr * delta_var)
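# A minimal NumPy sketch of the dense Adadelta branch above; note the step
# direction is scaled by the RMS of past updates, so the effective step
# size adapts per coordinate. Illustration only, with assumed defaults.
import numpy as np

def adadelta_dense_step(var, g, acc_grad, acc_delta, lr=0.001, rho=0.95,
                        eps=1e-7):
  acc_grad = rho * acc_grad + (1 - rho) * g**2
  delta = -np.sqrt(acc_delta + eps) * g / np.sqrt(acc_grad + eps)
  acc_delta = rho * acc_delta + (1 - rho) * delta**2
  return var + lr * delta, acc_grad, acc_delta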
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)
  m = None
  var_key = self._var_key(variable)
  if self.momentum != 0:
    momentum = tf.cast(self.momentum, variable.dtype)
    m = self.momentums[self._index_dict[var_key]]

  # TODO(b/204321487): Add nesterov acceleration.
  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    add_value = tf.IndexedSlices(-gradient.values * lr, gradient.indices)
    if m is not None:
      m.assign(m * momentum)
      m.scatter_add(add_value)
      if self.nesterov:
        variable.scatter_add(add_value)
        variable.assign_add(m * momentum)
      else:
        variable.assign_add(m)
    else:
      variable.scatter_add(add_value)
  else:
    # Dense gradients.
    if m is not None:
      m.assign(-gradient * lr + m * momentum)
      if self.nesterov:
        variable.assign_add(-gradient * lr + m * momentum)
      else:
        variable.assign_add(m)
    else:
      variable.assign_add(-gradient * lr)
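# A minimal NumPy-free sketch of the dense momentum branch above (the
# `m is not None` path, with and without Nesterov); illustration only,
# with assumed hyperparameter defaults.
def sgd_momentum_dense_step(var, g, m, lr=0.01, momentum=0.9,
                            nesterov=False):
  m = momentum * m - lr * g                # velocity update
  if nesterov:
    return var + momentum * m - lr * g, m  # look-ahead step
  return var + m, m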
def testSparseStability(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  with tf.Graph().as_default():
    for dtype in [tf.half]:
      shape = [1, 6]
      var0_np = np.array(
          [[0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
            -0.0105945]],
          dtype=dtype.as_numpy_dtype)
      var0 = tf.Variable(var0_np)
      grads0_np = np.array(
          [[-5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
            -8.4877e-05, -9.48906e-05]],
          dtype=dtype.as_numpy_dtype)
      grads0 = tf.IndexedSlices(
          tf.constant(grads0_np), tf.constant([0]), tf.constant(shape))
      ada_opt = adagrad.Adagrad(1.0)
      ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
      slot0 = ada_opt.get_slot(var0, "accumulator")
      init = tf.compat.v1.global_variables_initializer()
      for _ in range(100):
        self.evaluate(init)
        self.evaluate(ada_update)
        self.assertAllCloseAccordingToType(
            np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]),
            self.evaluate(slot0))
        self.assertAllCloseAccordingToType(
            np.array([[0.00891194, -0.10712013, 0.11047515, 0.22636929,
                       -0.0144573, -0.01029443]]), self.evaluate(var0))
def testSparse(self):
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  sparse_epsilon = 1e-7
  for dtype in [tf.half, tf.float32, tf.float64]:
    with tf.Graph().as_default(), self.cached_session():
      # Initialize variables for numpy implementation.
      m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
      var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
      grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
      var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
      grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)

      var0 = tf.Variable(var0_np)
      var1 = tf.Variable(var1_np)
      grads0_np_indices = np.array([0, 2], dtype=np.int32)
      grads0 = tf.IndexedSlices(
          tf.constant(grads0_np[grads0_np_indices]),
          tf.constant(grads0_np_indices), tf.constant([3]))
      grads1_np_indices = np.array([0, 2], dtype=np.int32)
      grads1 = tf.IndexedSlices(
          tf.constant(grads1_np[grads1_np_indices]),
          tf.constant(grads1_np_indices), tf.constant([3]))
      opt = nadam.Nadam(epsilon=sparse_epsilon)
      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      # Fetch params to validate initial values
      self.assertAllClose([1.0, 1.0, 2.0], var0)
      self.assertAllClose([3.0, 3.0, 4.0], var1)

      beta1_power, beta2_power = get_beta_accumulators(opt, dtype)

      # Run 3 steps of Nadam
      for t in range(3):
        self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power)
        self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power)
        update.run()

        mcache = update_m_cache(mcache, t)
        var0_np, m0, v0 = nadam_update_numpy(
            var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon)
        var1_np, m1, v1 = nadam_update_numpy(
            var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon)

        # Validate updated params
        self.assertAllCloseAccordingToType(var0_np, var0)
        self.assertAllCloseAccordingToType(var1_np, var1)
def testIndexedSlices(self):
  dtype = tf.int64
  iss = tf.IndexedSlices(
      values=tf.ones([2, 3], dtype=dtype),
      indices=tf.constant([1, 9]),
      dense_shape=[10, 3])
  a = array_ops.array(iss, copy=False)
  expected = tf.scatter_nd([[1], [9]], tf.ones([2, 3], dtype=dtype),
                           [10, 3])
  self.assertAllEqual(expected, a)
def _multiply_gradient(gradient, scale):
  """Multiply a (possibly sparse) gradient by the given scale factor."""
  scale = tf.cast(scale, gradient.dtype)
  if isinstance(gradient, tf.IndexedSlices):
    return tf.IndexedSlices(
        gradient.values * scale,
        gradient.indices,
        dense_shape=gradient.dense_shape)
  else:
    return gradient * scale
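# A quick eager-mode check of the helper above on both gradient kinds;
# the values are arbitrary.
import tensorflow as tf

dense = tf.constant([1.0, 2.0])
sparse = tf.IndexedSlices(
    tf.constant([[1.0], [2.0]]), tf.constant([0, 3]), tf.constant([4, 1]))
assert _multiply_gradient(dense, 0.5).numpy().tolist() == [0.5, 1.0]
scaled = _multiply_gradient(sparse, 0.5)
assert isinstance(scaled, tf.IndexedSlices)
assert scaled.values.numpy().tolist() == [[0.5], [1.0]]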
def select(self, step):
  """Returns the index of the selected representation for a training step."""
  if step - self.last_selection_step >= self.sample_freq:
    self.current_selection.assign(self._select())
    self.last_selection_step.assign(step)
    # Increment the counter for the newly selected item.
    self.selection_counter.scatter_add(
        tf.IndexedSlices(1, self.current_selection))
  return self.current_selection.numpy()
def testFtrlWithL1_L2_L2ShrinkageSparse(self):
  """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
  # TODO(tanzheny, omalleyt): Fix test in eager mode.
  for dtype in [tf.half, tf.float32]:
    with tf.Graph().as_default(), self.cached_session():
      var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
      var1 = tf.Variable([[4.0], [3.0]], dtype=dtype)
      grads0 = tf.IndexedSlices(
          tf.constant([0.1], shape=[1, 1], dtype=dtype), tf.constant([0]),
          tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(
          tf.constant([0.02], shape=[1, 1], dtype=dtype), tf.constant([1]),
          tf.constant([2, 1]))

      opt = ftrl.Ftrl(
          3.0,
          initial_accumulator_value=0.1,
          l1_regularization_strength=0.001,
          l2_regularization_strength=2.0,
          l2_shrinkage_regularization_strength=0.1)
      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
      self.evaluate(tf.compat.v1.global_variables_initializer())

      v0_val, v1_val = self.evaluate([var0, var1])
      self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
      self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)

      # Run 10 steps FTRL
      for _ in range(10):
        update.run()

      v0_val, v1_val = self.evaluate([var0, var1])
      self.assertAllCloseAccordingToType([[-0.22578995], [2.0]], v0_val)
      self.assertAllCloseAccordingToType([[4.0], [-0.13229476]], v1_val)
def testGetUnscaledSparseGradients(self, opt_cls):
  opt = create_sgd(opt_cls)
  opt = create_lso(opt, dynamic=False, initial_scale=2)
  sparse_scaled_grad = tf.IndexedSlices(
      tf.convert_to_tensor([[4., 2.], [8., 5.]]),
      tf.convert_to_tensor([1, 3], dtype='int32'),
      dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
  sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0]
  self.assertIsInstance(sparse_grad, tf.IndexedSlices)
  self.assertAllEqual([[2., 1.], [4., 2.5]],
                      self.evaluate(sparse_grad.values))
def testSparseBasic(self):
  for dtype in [tf.half, tf.float32, tf.float64]:
    with self.cached_session():
      var0 = tf.Variable([[1.1], [2.1]], dtype=dtype)
      var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
      grads0 = tf.IndexedSlices(
          tf.constant([0.1], shape=[1, 1], dtype=dtype), tf.constant([0]),
          tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(
          tf.constant([0.01], shape=[1, 1], dtype=dtype), tf.constant([1]),
          tf.constant([2, 1]))

      decay_rate = 0.1
      batch_size = 2
      total_num_examples = 10
      sgd_optimizer = tfp.optimizer.VariationalSGD(
          batch_size,
          total_num_examples,
          max_learning_rate=3.0,
          burnin=0,
          preconditioner_decay_rate=decay_rate)
      if not tf.executing_eagerly():
        sgd_op = sgd_optimizer.apply_gradients(
            zip([grads0, grads1], [var0, var1]))

      self.evaluate(tf1.global_variables_initializer())
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([[1.1], [2.1]],
                                         self.evaluate(var0))
      self.assertAllCloseAccordingToType([[3.0], [4.0]],
                                         self.evaluate(var1))

      # Run 1 step of sgd
      if not tf.executing_eagerly():
        self.evaluate(sgd_op)
      else:
        sgd_optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))

      # Validate updated params
      self.assertAllCloseAccordingToType([[1.1 - 3. * 0.1], [2.1]],
                                         self.evaluate(var0))
      self.assertAllCloseAccordingToType([[3. - 3. * 0], [4. - 3. * 0.01]],
                                         self.evaluate(var1))
def _resource_apply_sparse(self, grad, var, indices):
  max_learning_rate = tf.where(
      self.iterations < tf.cast(self._burnin, tf.int64),
      self._burnin_max_learning_rate, self._max_learning_rate)

  learn_rate = tf.clip_by_value(
      self._get_coordinatewise_learning_rate(
          tf.IndexedSlices(grad, indices), var), 0.,
      tf.cast(max_learning_rate, var.dtype))
  delta = grad * learn_rate

  return self._resource_scatter_add(var, indices, -delta)
def testGetUnscaledSparseGradients(self):
  opt = gradient_descent.SGD(2.0)
  opt = loss_scale_optimizer.LossScaleOptimizer(
      opt, dynamic=False, initial_scale=2)
  sparse_scaled_grad = tf.IndexedSlices(
      tf.convert_to_tensor([[4., 2.], [8., 5.]]),
      tf.convert_to_tensor([1, 3], dtype='int32'),
      dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
  sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0]
  self.assertIsInstance(sparse_grad, tf.IndexedSlices)
  self.assertAllEqual([[2., 1.], [4., 2.5]],
                      self.evaluate(sparse_grad.values))