class CDGD(Optimizer):
    """Consensus Distributed Gradient Descent (CDGD) optimizer.

    Variables are assumed to be laid out as ``nb_agents`` interleaved
    (kernel, bias) pairs per layer: index ``2*(i + nb_agents*k)`` is agent
    ``i``'s kernel for layer ``k`` and the next index is its bias.
    NOTE(review): assumes exactly two variables per agent per layer —
    confirm against the model builder.

    Each step mixes the agents' variables through the consensus matrix
    ``pi`` and then takes a decaying-step-size gradient step.
    """

    _HAS_AGGREGATE_GRAD = True
    first = True

    def __init__(self, learning_rate=0.005, momentum=0.0, nb_agents=5,
                 params=None, nesterov=False, name="CDGD", c1=0.5,
                 delta=0.48, **kwargs):
        """Create a CDGD optimizer.

        Args:
            learning_rate: Stored base step size.  NOTE(review): `CDGrads`
                currently uses a hard-coded decaying schedule instead.
            momentum: Momentum value in [0, 1]; only toggles slot creation.
            nb_agents: Number of agents participating in consensus.
            params: Optional `Params` provider for pi/bi/random draws; a
                default `Params(nb_agents, 1)` is built when omitted.
            nesterov: Stored but not used by the update rule.
            name: Optimizer name scope.
            c1: Step-size schedule constant.
            delta: Step-size schedule exponent (unused by the active schedule).

        Raises:
            ValueError: If a numeric `momentum` is outside [0, 1].
        """
        super(CDGD, self).__init__(False, name)
        self._momentum = False
        if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
            self._momentum = True
        if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
            raise ValueError("`momentum` must be between [0, 1].")
        self.nesterov = nesterov
        self.learning_rate = learning_rate
        self.nb_agents = nb_agents
        # Identity check (`is None`) rather than `== None`.
        if params is None:
            self.params = Params(nb_agents, 1)
        else:
            self.params = params
        self.epochStart = True
        self.c1 = c1
        self.delta = delta

    def _create_slots(self, var_list):
        # One "epoch_var" slot per variable; momentum slots only when enabled.
        for var in var_list:
            self._zeros_slot(var, "epoch_var", self._name)
        if self._momentum:
            for var in var_list:
                self._zeros_slot(var, "momentum", self._name)

    def get_slot(self, var, name):
        # Overrides the base lookup: lazily creates the slot, initialized
        # to the variable's current value.
        return self._get_or_make_slot(var, var, name, self._name)

    def set_slot(self, var, name, val):
        # Returns the assign op so callers can add it to their update ops.
        m = self._get_or_make_slot(var, None, name, self._name)
        return m.assign(val)

    def CDGrads(self, converted_grads_and_vars, step):
        """Compute consensus-mixed target values for every variable.

        Args:
            converted_grads_and_vars: Iterable of (grad, var, processor)
                triples, ordered as interleaved per-agent (kernel, bias)
                pairs for each layer.
            step: Global step tensor driving the decaying step size.

        Returns:
            zip of (grad, var, processor, new_value) tuples.
        """
        grad_list, var_list, processor = zip(*converted_grads_and_vars)
        # Two variables (kernel, bias) per agent per layer.
        layers = int(len(var_list) / (2 * self.nb_agents))
        val_list = [None] * len(grad_list)
        pi = self.params.genPi()
        bi = self.params.genBi()  # randomly generated column-stochastic matrix
        # Kept even though the active schedule below ignores it: genRand()
        # may advance the Params RNG stream (used by commented-out
        # randomized-hiding schedules).
        rand_s = self.params.genRand()
        grad_list = list(grad_list)
        floor_v = -1  # lower bound for the step size; -1 disables the floor
        # Active schedule: alpha = 0.25 / (0.007*t + 1).  Alternatives tried
        # during experiments: constant lr, epsilon = c1/(t+1)^0.4, 1/(t+1),
        # 0.5/(t+1)^0.3, and a randomized "hiding" schedule using rand_s.
        alpha = tf.math.maximum(.25 / (tf.cast(step, tf.float32) * 0.007 + 1),
                                floor_v)
        for k in range(layers):
            for i in range(self.nb_agents):
                newVal = 0
                newValB = 0
                for j in range(self.nb_agents):
                    jvar = var_list[2 * (j + self.nb_agents * k)]
                    jvarb = var_list[2 * (j + self.nb_agents * k) + 1]
                    # Consensus mix: weighted sum of all agents' variables.
                    # (An alternative "alg method" folding bi*alpha*grad into
                    # each term was tried and left disabled.)
                    if i == j:
                        newVal = newVal + pi[i, j] * var_list[2 * (i + self.nb_agents * k)]
                        newValB = newValB + pi[i, j] * var_list[2 * (i + self.nb_agents * k) + 1]
                    else:
                        newVal = newVal + pi[i, j] * jvar
                        newValB = newValB + pi[i, j] * jvarb
                # Gradient step applied after the consensus mix.
                val_list[2 * (i + self.nb_agents * k)] = (
                    newVal - alpha * grad_list[2 * (i + self.nb_agents * k)])
                val_list[2 * (i + self.nb_agents * k) + 1] = (
                    newValB - alpha * grad_list[2 * (i + self.nb_agents * k) + 1])
        return zip(grad_list, var_list, processor, val_list)

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply consensus-mixed gradients to variables.

        This mirrors the stock tf.compat.v1 ``Optimizer.apply_gradients``,
        with the per-variable targets replaced by the consensus values
        produced by :meth:`CDGrads`.

        Args:
            grads_and_vars: List of (gradient, variable) pairs as returned
                by `compute_gradients()`.
            global_step: Optional `Variable` to increment by one after the
                variables have been updated.
            name: Optional name for the returned operation.  Defaults to
                the name passed to the `Optimizer` constructor.

        Returns:
            An `Operation` that applies the specified gradients.  If
            `global_step` was not None, that operation also increments
            `global_step`.

        Raises:
            TypeError: If `grads_and_vars` is malformed.
            ValueError: If none of the variables have gradients.
            RuntimeError: If you should use `_distributed_apply()` instead.
        """
        # Latch the epoch flag from the shared Params object, then clear it
        # so only the first apply of an epoch sees epochStart == True.
        self.epochStart = self.params.epochStart
        self.params.epochStart = False
        if distribute_ctx.has_strategy():
            # Handle DistributionStrategy case.
            if distribute_ctx.in_cross_replica_context():
                raise RuntimeError("Use `_distributed_apply()` instead of "
                                   "`apply_gradients()` in a cross-replica context.")
            grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
            return distribute_ctx.get_replica_context().merge_call(
                self._distributed_apply, args=(grads_and_vars, global_step, name))

        # No DistributionStrategy case.
        grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
        if not grads_and_vars:
            raise ValueError("No variables provided.")
        converted_grads_and_vars = []
        for g, v in grads_and_vars:
            if g is not None:
                try:
                    # Convert the grad to Tensor or IndexedSlices if necessary.
                    g = ops.convert_to_tensor_or_indexed_slices(g)
                except TypeError:
                    raise TypeError(
                        "Gradient must be convertible to a Tensor"
                        " or IndexedSlices, or None: %s" % g)
                if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
                    raise TypeError(
                        "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
            p = _get_processor(v)
            converted_grads_and_vars.append((g, v, p))
        converted_grads_and_vars = tuple(converted_grads_and_vars)
        var_list = [v for g, v, _ in converted_grads_and_vars if g is not None]
        if not var_list:
            raise ValueError("No gradients provided for any variable: %s." %
                             ([str(v) for _, v, _ in converted_grads_and_vars],))
        with ops.init_scope():
            self._create_slots(var_list)

        # Consensus mixing happens here.
        compGV = self.CDGrads(converted_grads_and_vars, global_step)

        update_ops = []
        with ops.name_scope(name, self._name, skip_on_eager=False) as name:
            self._prepare()
            for grad, var, processor, val in compGV:
                if grad is None:
                    continue
                # Colocate all update ops on the same device as the variable.
                if (context.executing_eagerly()
                        or resource_variable_ops.is_resource_variable(var)
                        and not var._in_graph_mode):  # pylint: disable=protected-access
                    scope_name = ""
                else:
                    scope_name = var.op.name
                with ops.name_scope("update_" + scope_name,
                                    skip_on_eager=False), ops.colocate_with(var):
                    # Epoch-slot bookkeeping is deliberately disabled via
                    # `and False`; kept for ongoing experiments.
                    if self.epochStart and False:
                        update_ops.append(self.set_slot(var, "epoch_var", val))
                    # The consensus value already folds in the step size, so
                    # the variable is assigned directly.
                    update_ops.append(var.assign(val))
            if global_step is None:
                apply_updates = self._finish(update_ops, name)
            else:
                with ops.control_dependencies([self._finish(update_ops, "update")]):
                    with ops.colocate_with(global_step):
                        if isinstance(global_step,
                                      resource_variable_ops.BaseResourceVariable):
                            # TPUs don't support the Assign op that string
                            # name-based assign_add would create.
                            apply_updates = resource_variable_ops.assign_add_variable_op(
                                global_step.handle,
                                ops.convert_to_tensor(1, dtype=global_step.dtype),
                                name=name)
                        else:
                            apply_updates = state_ops.assign_add(global_step, 1,
                                                                 name=name)
            if not context.executing_eagerly():
                if isinstance(apply_updates, ops.Tensor):
                    apply_updates = apply_updates.op
                train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
                if apply_updates not in train_op:
                    train_op.append(apply_updates)
            if self.epochStart:
                self.epochStart = False
            return apply_updates

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Unit (1.0) learning rate: the step size is already folded into the
        # values produced by CDGrads.  (Removed unused var_device/var_dtype.)
        return training_ops.resource_apply_gradient_descent(
            var.handle, 1.0, grad, use_locking=self._use_locking)

    def get_config(self):
        """Return the optimizer configuration.

        Fixed: previously called ``super(SGD, self).get_config()`` — a
        copy-paste from Keras SGD; ``SGD`` is not this class, so the call
        raised at runtime.  NOTE(review): the hypers below were never
        registered via ``_set_hyper`` (those calls are commented out in
        ``__init__``) — verify ``_serialize_hyperparameter`` resolves them.
        """
        config = super(CDGD, self).get_config()
        config.update({
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "decay": self._serialize_hyperparameter("decay"),
            "momentum": self._serialize_hyperparameter("momentum"),
            "nesterov": self.nesterov,
        })
        return config
class QCDGD(Optimizer):
    """Quantized Consensus Distributed Gradient Descent (QCDGD) optimizer.

    Like CDGD, variables are laid out as ``nb_agents`` interleaved
    (kernel, bias) pairs per layer.  Neighbor contributions are clipped and
    stochastically ternarized (quantized to {-st, 0, +st}) before being
    mixed with weight ``epsilon``.
    NOTE(review): assumes exactly two variables per agent per layer —
    confirm against the model builder.
    """

    _HAS_AGGREGATE_GRAD = True
    first = True

    def __init__(self, learning_rate=0.01, momentum=0.0, nb_agents=5,
                 params=None, nesterov=False, name="QCDGD", clip=0, ternSt=0,
                 c1=0.5, delta=0.48, **kwargs):
        """Create a QCDGD optimizer.

        Args:
            learning_rate: Stored base step size.  NOTE(review): `CDGrads`
                uses its own decaying schedule instead.
            momentum: Momentum value in [0, 1]; only toggles slot creation.
            nb_agents: Number of agents participating in consensus.
            params: Optional `Params` provider for pi/bi/random draws; a
                default `Params(nb_agents, 1)` is built when omitted.
            nesterov: Stored but not used by the update rule.
            name: Optimizer name scope.
            clip: Std-multiple for clipping neighbor terms; 0 disables.
            ternSt: Multiplier for the ternarization scale; 0 disables
                ternarization entirely.
            c1: Step-size schedule constant.
            delta: Schedule exponent (unused by the active schedule).

        Raises:
            ValueError: If a numeric `momentum` is outside [0, 1].
        """
        super(QCDGD, self).__init__(False, name)
        self._momentum = False
        if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
            self._momentum = True
        if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
            raise ValueError("`momentum` must be between [0, 1].")
        self.nesterov = nesterov
        self.learning_rate = learning_rate
        self.nb_agents = nb_agents
        # Identity check (`is None`) rather than `== None`.
        if params is None:
            self.params = Params(nb_agents, 1)
        else:
            self.params = params
        self.epochStart = True
        self.clipSTD = clip
        self.stMultiplier = ternSt
        self.c1 = c1
        self.delta = delta

    def _create_slots(self, var_list):
        # One "epoch_var" slot per variable; momentum slots only when enabled.
        for var in var_list:
            self._zeros_slot(var, "epoch_var", self._name)
        if self._momentum:
            for var in var_list:
                self._zeros_slot(var, "momentum", self._name)

    def get_slot(self, var, name):
        # Overrides the base lookup: lazily creates the slot, initialized
        # to the variable's current value.
        return self._get_or_make_slot(var, var, name, self._name)

    def set_slot(self, var, name, val):
        # Returns the assign op so callers can add it to their update ops.
        m = self._get_or_make_slot(var, None, name, self._name)
        return m.assign(val)

    def clip(self, var, c):
        """Clip `var` to +/- c standard deviations; c == 0 disables."""
        if c == 0:
            return var
        std = tf.math.reduce_std(var)
        return tf.clip_by_value(var, -c * std, c * std, name=None)

    def getST(self, var):
        """Ternarization scale candidate: max absolute entry of `var`."""
        return tf.math.reduce_max(tf.math.abs(var))

    def tern(self, var, st):
        """Stochastically ternarize `var` to {-st, 0, +st}.

        Each entry keeps its sign with probability |var|/st and is zeroed
        otherwise; disabled (identity) when `stMultiplier` is 0.
        """
        if self.stMultiplier == 0:
            return var
        bernoulli = tf.math.abs(var) / st
        dist = tfp.distributions.Bernoulli(probs=bernoulli, dtype=tf.float32)
        bt = dist.sample()
        return tf.math.multiply(tf.math.sign(var), bt) * st

    def CDGrads(self, converted_grads_and_vars, step):
        """Compute quantized consensus target values for every variable.

        First pass per layer finds the ternarization scales (max |clipped
        neighbor term|); second pass builds each agent's update from its
        own (unquantized) term plus ternarized neighbor terms.

        Args:
            converted_grads_and_vars: Iterable of (grad, var, processor)
                triples in interleaved per-agent (kernel, bias) order.
            step: Global step tensor driving the decaying schedules.

        Returns:
            zip of (grad, var, processor, new_value) tuples.
        """
        grad_list, var_list, processor = zip(*converted_grads_and_vars)
        # Two variables (kernel, bias) per agent per layer.
        layers = int(len(var_list) / (2 * self.nb_agents))
        val_list = [None] * len(grad_list)
        # (Removed unused comb_list/tern_list/reg_list allocations.)
        pi = self.params.genPi()
        bi = self.params.genBi()  # randomly generated column-stochastic matrix
        # Kept even though unused below: genRand() may advance the Params
        # RNG stream (used by commented-out randomized schedules).
        rand_s = self.params.genRand()
        grad_list = list(grad_list)
        floor_v = -1  # lower bound for step sizes; -1 disables the floor
        # Mixing weight epsilon and gradient step sizes lamb1/lamb2 share
        # one decay budget: epsilon ~ t^-e_decimal, lamb ~ t^-(1-e_decimal).
        numerator = .5
        e_decimal = 0.7
        epsilon = numerator / tf.math.pow(
            tf.cast(step + 1, tf.float32) * self.c1 + 1, e_decimal)
        lamb1 = tf.math.maximum(
            numerator / tf.math.pow(
                tf.cast(step, tf.float32) * self.c1 + 1, 1 - e_decimal),
            floor_v)
        lamb2 = tf.math.maximum(
            numerator / tf.math.pow(
                tf.cast(step, tf.float32) * self.c1 + 1, 1 - e_decimal),
            floor_v)
        for k in range(layers):
            # Pass 1: per-layer ternarization scales over all neighbor terms.
            st = tf.constant([0], dtype=tf.float32)
            stb = tf.constant([0], dtype=tf.float32)
            for i in range(self.nb_agents):
                for j in range(self.nb_agents):
                    if i != j:
                        tern_var = (pi[i, j] * var_list[2 * (j + self.nb_agents * k)]
                                    - bi[i, j] * lamb1 * grad_list[2 * (j + self.nb_agents * k)])
                        tern_varb = (pi[i, j] * var_list[2 * (j + self.nb_agents * k) + 1]
                                     - bi[i, j] * lamb2 * grad_list[2 * (j + self.nb_agents * k) + 1])
                        clip_var = self.clip(tern_var, self.clipSTD)
                        clip_varb = self.clip(tern_varb, self.clipSTD)
                        st = tf.math.maximum(st, self.getST(clip_var))
                        stb = tf.math.maximum(stb, self.getST(clip_varb))
            # Pass 2: build each agent's update.
            for i in range(self.nb_agents):
                # Own contribution keeps weight (1 - epsilon), unquantized.
                newVal = var_list[2 * (i + self.nb_agents * k)] * (1 - epsilon)
                newValB = var_list[2 * (i + self.nb_agents * k) + 1] * (1 - epsilon)
                for j in range(self.nb_agents):
                    if i == j:
                        # Own consensus + gradient term, scaled by epsilon
                        # and left unquantized (a quantized variant was
                        # tried and left disabled).
                        newVal = (newVal
                                  + epsilon * pi[i, j] * var_list[2 * (i + self.nb_agents * k)]
                                  - bi[i, j] * lamb1 * grad_list[2 * (i + self.nb_agents * k)] * epsilon)
                        newValB = (newValB
                                   + epsilon * pi[i, j] * var_list[2 * (i + self.nb_agents * k) + 1]
                                   - bi[i, j] * lamb2 * grad_list[2 * (i + self.nb_agents * k) + 1] * epsilon)
                    else:
                        # Neighbor contribution: clip, then ternarize with
                        # the layer-wide scale before mixing.
                        tern_var = (pi[i, j] * var_list[2 * (j + self.nb_agents * k)]
                                    - bi[i, j] * lamb1 * grad_list[2 * (j + self.nb_agents * k)])
                        tern_varb = (pi[i, j] * var_list[2 * (j + self.nb_agents * k) + 1]
                                     - bi[i, j] * lamb2 * grad_list[2 * (j + self.nb_agents * k) + 1])
                        clip_var = self.clip(tern_var, self.clipSTD)
                        clip_varb = self.clip(tern_varb, self.clipSTD)
                        q_v = self.tern(clip_var, st * self.stMultiplier)
                        q_vb = self.tern(clip_varb, stb * self.stMultiplier)
                        newVal = newVal + epsilon * q_v
                        newValB = newValB + epsilon * q_vb
                val_list[2 * (i + self.nb_agents * k)] = newVal
                val_list[2 * (i + self.nb_agents * k) + 1] = newValB
        return zip(grad_list, var_list, processor, val_list)

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply quantized-consensus gradients to variables.

        This mirrors the stock tf.compat.v1 ``Optimizer.apply_gradients``,
        with the per-variable targets replaced by the values produced by
        :meth:`CDGrads`.

        Args:
            grads_and_vars: List of (gradient, variable) pairs as returned
                by `compute_gradients()`.
            global_step: Optional `Variable` to increment by one after the
                variables have been updated.
            name: Optional name for the returned operation.  Defaults to
                the name passed to the `Optimizer` constructor.

        Returns:
            An `Operation` that applies the specified gradients.  If
            `global_step` was not None, that operation also increments
            `global_step`.

        Raises:
            TypeError: If `grads_and_vars` is malformed.
            ValueError: If none of the variables have gradients.
            RuntimeError: If you should use `_distributed_apply()` instead.
        """
        # Latch the epoch flag from the shared Params object, then clear it
        # so only the first apply of an epoch sees epochStart == True.
        self.epochStart = self.params.epochStart
        self.params.epochStart = False
        if distribute_ctx.has_strategy():
            # Handle DistributionStrategy case.
            if distribute_ctx.in_cross_replica_context():
                raise RuntimeError(
                    "Use `_distributed_apply()` instead of "
                    "`apply_gradients()` in a cross-replica context.")
            grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
            return distribute_ctx.get_replica_context().merge_call(
                self._distributed_apply, args=(grads_and_vars, global_step, name))

        # No DistributionStrategy case.
        grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
        if not grads_and_vars:
            raise ValueError("No variables provided.")
        converted_grads_and_vars = []
        for g, v in grads_and_vars:
            if g is not None:
                try:
                    # Convert the grad to Tensor or IndexedSlices if necessary.
                    g = ops.convert_to_tensor_or_indexed_slices(g)
                except TypeError:
                    raise TypeError("Gradient must be convertible to a Tensor"
                                    " or IndexedSlices, or None: %s" % g)
                if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
                    raise TypeError(
                        "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
            p = _get_processor(v)
            converted_grads_and_vars.append((g, v, p))
        converted_grads_and_vars = tuple(converted_grads_and_vars)
        var_list = [v for g, v, _ in converted_grads_and_vars if g is not None]
        if not var_list:
            raise ValueError("No gradients provided for any variable: %s." %
                             ([str(v) for _, v, _ in converted_grads_and_vars],))
        with ops.init_scope():
            self._create_slots(var_list)

        # Quantized consensus mixing happens here.
        compGV = self.CDGrads(converted_grads_and_vars, global_step)

        update_ops = []
        with ops.name_scope(name, self._name, skip_on_eager=False) as name:
            self._prepare()
            for grad, var, processor, val in compGV:
                if grad is None:
                    continue
                # Colocate all update ops on the same device as the variable.
                if (context.executing_eagerly()
                        or resource_variable_ops.is_resource_variable(var)
                        and not var._in_graph_mode):  # pylint: disable=protected-access
                    scope_name = ""
                else:
                    scope_name = var.op.name
                with ops.name_scope("update_" + scope_name,
                                    skip_on_eager=False), ops.colocate_with(var):
                    # Epoch-slot bookkeeping is deliberately disabled via
                    # `and False`; kept for ongoing experiments.
                    if self.epochStart and False:
                        update_ops.append(self.set_slot(var, "epoch_var", val))
                    # The consensus value already folds in the step size, so
                    # the variable is assigned directly.
                    update_ops.append(var.assign(val))
            if global_step is None:
                apply_updates = self._finish(update_ops, name)
            else:
                with ops.control_dependencies([self._finish(update_ops, "update")]):
                    with ops.colocate_with(global_step):
                        if isinstance(global_step,
                                      resource_variable_ops.BaseResourceVariable):
                            # TPUs don't support the Assign op that string
                            # name-based assign_add would create.
                            apply_updates = resource_variable_ops.assign_add_variable_op(
                                global_step.handle,
                                ops.convert_to_tensor(1, dtype=global_step.dtype),
                                name=name)
                        else:
                            apply_updates = state_ops.assign_add(global_step, 1,
                                                                 name=name)
            if not context.executing_eagerly():
                if isinstance(apply_updates, ops.Tensor):
                    apply_updates = apply_updates.op
                train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
                if apply_updates not in train_op:
                    train_op.append(apply_updates)
            if self.epochStart:
                self.epochStart = False
            return apply_updates

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Unit (1.0) learning rate: the step size is already folded into the
        # values produced by CDGrads.  (Removed unused var_device/var_dtype.)
        return training_ops.resource_apply_gradient_descent(
            var.handle, 1.0, grad, use_locking=self._use_locking)

    def get_config(self):
        """Return the optimizer configuration.

        Fixed: previously called ``super(SGD, self).get_config()`` — a
        copy-paste from Keras SGD; ``SGD`` is not this class, so the call
        raised at runtime.  NOTE(review): the hypers below were never
        registered via ``_set_hyper`` (those calls are commented out in
        ``__init__``) — verify ``_serialize_hyperparameter`` resolves them.
        """
        config = super(QCDGD, self).get_config()
        config.update({
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "decay": self._serialize_hyperparameter("decay"),
            "momentum": self._serialize_hyperparameter("momentum"),
            "nesterov": self.nesterov,
        })
        return config