def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Applies bounds on actual learning rate
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))

    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # apply weight decay
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(
            K.maximum(step_size_p_bound, lower_bound), upper_bound)

        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
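# Illustrative sketch (not part of the original code): how the clipping
# interval [lower_bound, upper_bound] computed above collapses onto final_lr
# as the step count t grows, which is what moves the bounded update from
# Adam-like toward SGD-like behaviour.  final_lr and gamma below are assumed
# placeholder values, not taken from the original optimizer's configuration.
final_lr, gamma = 0.1, 1e-3  # hypothetical settings
for t in [1.0, 1e2, 1e4, 1e6]:
    lower = final_lr * (1. - 1. / (gamma * t + 1.))
    upper = final_lr * (1. + 1. / (gamma * t))
    print('t=%.0f  bounds=[%.5f, %.5f]' % (t, lower, upper))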
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = ms + vs
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, m, v in zip(params, grads, ms, vs):
        # update accumulator
        new_v = self.rho * v + (
            1. - self.rho) * self.rho * math_ops.square(g - m)
        new_m = self.rho * m + (1. - self.rho) * g
        self.updates.append(state_ops.assign(m, new_m))
        self.updates.append(state_ops.assign(v, new_v))
        new_p = p - lr * g / (K.sqrt(new_v) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(
                    multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def __call__(self, w):
    # get center coordinates
    shape = w.get_shape().as_list()
    assert (shape[0] % 2 == 1)
    assert (shape[1] % 2 == 1)
    centerX_ = (shape[0] - 1) / 2
    centerY_ = (shape[1] - 1) / 2
    centerX = tf.cast(centerX_, dtype=tf.int64)
    centerY = tf.cast(centerY_, dtype=tf.int64)

    # get impulse tensor with the same center value as w and zeros elsewhere
    centerValue = w[centerX, centerY]
    impulse = K.zeros(shape)
    impulse = impulse[centerX, centerY].assign(centerValue)

    # get impulse tensor whose center value is -1 and zeros elsewhere
    minus_ones = self.center * tf.constant(np.ones(shape), dtype=tf.float32)
    impulse_ = K.zeros(shape)
    impulse_ = impulse_[centerX, centerY].assign(minus_ones[centerX, centerY])

    # set center value to zero
    w -= impulse
    # normalize
    w /= K.sum(w, axis=self.axis) / self.sum
    # set center value to -1
    w += impulse_
    return w
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
        # update accumulator
        new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
        self.updates.append(state_ops.assign(a, new_a))

        # use the new accumulator and the *old* delta_accumulator
        update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
        new_p = p - lr * update

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))

        # update delta_accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
        self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates
def attention_layer(X, n_h, Ty):
    """
    Creates an attention layer.

    Input:
    X - Layer input (m, Tx, x_vocab_size)
    n_h - Size of LSTM hidden layer
    Ty - Timesteps in output sequence

    Output:
    output - The output of the attention layer (m, Tx, n_h)
    """
    # Define the default state for the LSTM layer
    h = Lambda(lambda X: K.zeros(shape=(K.shape(X)[0], n_h)),
               name='h_attention_layer')(X)
    c = Lambda(lambda X: K.zeros(shape=(K.shape(X)[0], n_h)),
               name='c_attention_layer')(X)
    # Messy, but the alternative is using more Input()
    at_LSTM = LSTM(n_h, return_state=True, name='at_LSTM_attention_layer')

    output = []
    # Run attention step and RNN for each output time step
    for _ in range(Ty):
        context = one_step_of_attention(h, X)
        h, _, c = at_LSTM(context, initial_state=[h, c])
        output.append(h)

    return output
def build(self, input_shape):
    # assert len(input_shape) == 2
    input_dim = input_shape[-1]
    self.input_length = input_shape[1]
    self.W0 = self.init((input_dim, self.hidden),
                        name='{}_W3'.format(self.name))
    self.W = self.init((self.hidden, 1), name='{}_W'.format(self.name))
    self.b0 = K.zeros((self.hidden,), name='{}_b0'.format(self.name))
    self.b = K.zeros((1,), name='{}_b'.format(self.name))
    self.trainable_weights = [self.W0, self.W, self.b, self.b0]

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W)
        self.regularizers.append(self.W_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(self.b)
        self.regularizers.append(self.b_regularizer)

    self.constraints = {}
    if self.W_constraint:
        self.constraints[self.W0] = self.W_constraint
        self.constraints[self.W] = self.W_constraint

    super(Attention, self).build(input_shape)
def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    ms = [backend.zeros(shape) for shape in shapes]
    vs = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs
def reset_states(self):
    if not self.stateful:
        raise RuntimeError('Layer must be stateful.')
    input_shape = self.input_spec.shape
    output_shape = self._compute_output_shape(input_shape)
    if not input_shape[0]:
        raise ValueError('If a RNN is stateful, a complete '
                         'input_shape must be provided '
                         '(including batch size). '
                         'Got input shape: ' + str(input_shape))

    if self.return_sequences:
        out_row, out_col, out_filter = output_shape[2:]
    else:
        out_row, out_col, out_filter = output_shape[1:]

    if hasattr(self, 'states'):
        K.set_value(self.states[0],
                    np.zeros((input_shape[0], out_row, out_col, out_filter)))
        K.set_value(self.states[1],
                    np.zeros((input_shape[0], out_row, out_col, out_filter)))
    else:
        self.states = [
            K.zeros((input_shape[0], out_row, out_col, out_filter)),
            K.zeros((input_shape[0], out_row, out_col, out_filter))
        ]
def get_initial_state(self, inputs):
    initial_states = []
    first = True
    if self._stackedcells:
        for cell in self.cell.cells:
            shape = list(cell.kernel_shape)
            shape[-1] = cell.filters
            if first:
                # Make m, h, c states
                initial_state = K.zeros_like(inputs)
                initial_state = K.sum(initial_state, axis=1)
                initial_state = cell.input_conv(initial_state,
                                                K.zeros(tuple(shape)),
                                                padding=cell.padding)
                initial_states += [initial_state for _ in range(3)]
                first = False
            else:
                # if not first, make h, c states
                initial_state = K.zeros_like(initial_state)
                initial_state = cell.input_conv(initial_state,
                                                K.zeros(tuple(shape)),
                                                padding=cell.padding)
                initial_states += [initial_state for _ in range(2)]
    else:
        # Single cell
        shape = list(self.cell.kernel_shape)
        shape[-1] = self.cell.filters
        initial_state = K.zeros_like(inputs)
        # collapse the time axis so the state matches (samples, rows, cols, filters)
        initial_state = K.sum(initial_state, axis=1)
        initial_state = self.cell.input_conv(initial_state,
                                             K.zeros(tuple(shape)),
                                             padding=self.cell.padding)
        initial_states += [initial_state for _ in range(3)]
    return initial_states
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    shapes = [K.int_shape(p) for p in params]
    prev_grads = [K.zeros(shape, name='prev_grad_' + str(i))
                  for (i, shape) in enumerate(shapes)]
    ds = [K.zeros(shape, name='d_' + str(i))
          for (i, shape) in enumerate(shapes)]
    vs = [K.zeros(shape, name='v_' + str(i))
          for (i, shape) in enumerate(shapes)]
    self.weights = [self.iterations] + ds + vs + prev_grads

    for p, g, pg, v, d in zip(params, grads, prev_grads, vs, ds):
        v_t = self.momentum * v - self.lr * g
        self.updates.append(K.update(v, v_t))

        d_t = self.momentum * d + (1 - self.momentum) * (g - pg)
        self.updates.append(K.update(d, d_t))
        self.updates.append(K.update(pg, g))

        new_p = p + v_t + self.kd * d_t
        self.updates.append(K.update(p, new_p))
    return self.updates
def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [backend.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us
def _create_all_weights(self, params):
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                         K.dtype(self.decay))))

    t = math_ops.cast(self.iterations, K.floatx()) + 1

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96),
                                 t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96),
                                 (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # the following equations given in [1]
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)

        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            self.updates.append(state_ops.assign(vhat, vhat_t))
            v_t_prime = vhat_t / (1. - math_ops.pow(self.beta_2, t))
        else:
            v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))

        m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))

        p_t = p - lr * m_t_bar / (gen_math_ops.sqrt(v_t_prime) + self.epsilon)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
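# Illustrative sketch (not part of the original code): the warming momentum
# schedule used above, evaluated with plain Python so the first few values of
# momentum_cache_t and the running m_schedule product can be inspected.
# beta_1 and schedule_decay are assumed placeholder values.
beta_1, schedule_decay = 0.9, 0.004  # hypothetical settings
m_schedule = 1.0
for t in range(1, 6):
    momentum_cache_t = beta_1 * (1. - 0.5 * 0.96 ** (t * schedule_decay))
    m_schedule *= momentum_cache_t
    print(t, round(momentum_cache_t, 5), round(m_schedule, 5))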
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    beta_1_power = math_ops.pow(self.beta_1, t)
    beta_2_power = math_ops.pow(self.beta_2, t)
    rho_t = self.rho_inf - 2.0 * t * beta_2_power / (1.0 - beta_2_power)
    lr_t = tf.where(
        rho_t >= 5.0,
        K.sqrt((rho_t - 4.) * (rho_t - 2.) * self.rho_inf /
               ((self.rho_inf - 4.) * (self.rho_inf - 2.) * rho_t)) *
        lr * (K.sqrt(1. - beta_2_power) / (1. - beta_1_power)),
        self.warmup_coef * lr / (1. - beta_1_power))

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t = p - lr_t * tf.where(rho_t >= 5.0,
                                      m_t / (K.sqrt(vhat_t) + self.epsilon),
                                      m_t)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t = p - lr_t * tf.where(rho_t >= 5.0,
                                      m_t / (K.sqrt(v_t) + self.epsilon),
                                      m_t)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
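# Illustrative sketch (not part of the original code): how the rectification
# term rho_t above grows with t and crosses the 5.0 threshold that switches
# the update from the un-rectified warmup branch to the variance-rectified
# branch.  beta_2 is an assumed placeholder value, and rho_inf is taken as
# 2 / (1 - beta_2) - 1, which is assumed to match how self.rho_inf is set up.
beta_2 = 0.999  # hypothetical setting
rho_inf = 2. / (1. - beta_2) - 1.
for t in [1, 2, 3, 4, 5, 6, 10, 100]:
    beta_2_power = beta_2 ** t
    rho_t = rho_inf - 2. * t * beta_2_power / (1. - beta_2_power)
    print(t, round(rho_t, 3), rho_t >= 5.0)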
def get_updates_ADAM(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    base_lr = self._optimizer.lr
    if self._optimizer._initial_decay > 0:
        base_lr = base_lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self._optimizer.decay * math_ops.cast(
                self._optimizer.iterations, K.dtype(self._optimizer.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self._optimizer.iterations, K.floatx())

    base_lr_t = base_lr * (
        K.sqrt(1. - math_ops.pow(self._optimizer.beta_2, t)) /
        (1. - math_ops.pow(self._optimizer.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self._optimizer.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        if self._get_multiplier(p) is None:
            multiplier = 1.0
        else:
            multiplier = self._get_multiplier(p)

        m_t = (self._optimizer.beta_1 * m) + (1. - self._optimizer.beta_1) * g
        v_t = (self._optimizer.beta_2 * v) + (
            1. - self._optimizer.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t = p - base_lr_t * multiplier * m_t / (
                K.sqrt(vhat_t) + self._optimizer.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t = p - base_lr_t * multiplier * m_t / (
                K.sqrt(v_t) + self._optimizer.epsilon)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    '''Bias corrections according to the Adam paper
    '''
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        ####################################################
        # Add a lr multiplier for vars outside excluded_vars
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult
        ###################################################

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        '''Schedule multiplier eta_t = 1 for simple AdamW
        According to the AdamW paper, eta_t can be fixed, decay, or
        also be used for warm restarts (AdamWR to come).
        '''
        eta_t = 1.
        p_t = p - eta_t * (multiplied_lr_t * m_t / (K.sqrt(v_t) + self.epsilon))

        if self.weight_decay != 0:
            '''Normalized weight decay according to the AdamW paper
            '''
            w_d = self.weight_decay * K.sqrt(
                self.batch_size / (self.samples_per_epoch * self.epochs))
            p_t = p_t - eta_t * (w_d * p)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
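# Illustrative sketch (not part of the original code): the normalized weight
# decay w_d used above, evaluated once with plain Python.  The training-run
# constants below are assumed placeholder values.
weight_decay = 0.025                                      # hypothetical setting
batch_size, samples_per_epoch, epochs = 128, 50000, 100   # hypothetical run
w_d = weight_decay * (batch_size / (samples_per_epoch * epochs)) ** 0.5
print(w_d)  # effective per-step decay coefficient applied as p_t - eta_t * w_d * p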
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = math_ops.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        ####################################################
        # Add a lr multiplier for vars outside excluded_vars
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult
        ###################################################

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t = p - multiplied_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t = p - multiplied_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_initial_states(self, inputs):
    # (samples, timesteps, rows, cols, filters)
    initial_state = K.zeros_like(inputs)
    # (samples, rows, cols, filters)
    initial_state = K.sum(initial_state, axis=1)

    depthwise_shape = list(self.depthwise_kernel_shape)
    pointwise_shape = list(self.pointwise_kernel_shape)
    initial_state = self.input_conv(initial_state,
                                    K.zeros(tuple(depthwise_shape)),
                                    K.zeros(tuple(pointwise_shape)),
                                    padding=self.padding)

    initial_states = [initial_state for _ in range(2)]
    return initial_states
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                         K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())

    lr_t = gen_math_ops.sqrt(1. - math_ops.pow(self.beta_2, t)) / (
        1. - math_ops.pow(self.beta_1, t))
    lower_bound = self.lr_boost * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = self.lr_boost * (1. + 1. / (self.gamma * t))
    if self.sgdcorr:
        m_rate = 1. - self.beta_1 / (self.gamma * t + 1.)
    else:
        m_rate = 1. - self.beta_1

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + m_rate * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            lr_v = gen_math_ops.reciprocal(gen_math_ops.sqrt(vhat_t) + self.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            lr_v = gen_math_ops.reciprocal(gen_math_ops.sqrt(v_t) + self.epsilon)

        lr_bound = gen_math_ops.minimum(
            gen_math_ops.maximum(lr_t * lr_v, lower_bound), upper_bound)
        p_t = p - lr * lr_bound * m_t

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    mg2 = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m, mg in zip(params, grads, moments, mg2):
        # monitor layer
        self.dictParam.append(self.iterations)
        v = self.momentum * m - lr * g  # velocity
        # mg = K.zeros(g.shape)
        # test = g
        # p = state_ops.ops.Tensor(g, g.shape, dtype=tf.float32)
        self.dictParam.append(g)
        # mg2 = K.zeros(g.shape)
        # test2 = g  # state_ops.assign(mg2, g)
        self.updates.append(state_ops.assign(m, v))
        # self.dictParam[p.name]['before'] = []
        # self.dictParam[p.name]['after'] = []
        # self.dictParam[p.name]['before'].append(state_ops.assign(mg, test))
        # self.dictParam[p.name]['after'].append(state_ops.assign(mg2, test2))
        # monitor gradient
        # self.dictParam[p.name]['Grad'] = state_ops.assign(mg, g)

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
            # monitor new_p to p difference
            # self.dictParam[p.name]['Diff'] = state_ops.assign(mg, self.momentum * v - lr * g)
        else:
            # monitor new_p to p difference
            # self.dictParam[p.name]['Diff'] = v
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates_Padam(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    base_lr = self._optimizer.learning_rate
    if self.initial_decay > 0:
        base_lr = base_lr * (1. / (1. + self.decay * K.cast(
            self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = base_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        if self._get_multiplier(p) is None:
            multiplier = 1.0
        else:
            multiplier = self._get_multiplier(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Partial momentum adaption.
        new_p = p - (lr_t * multiplier * (m_t / (denom ** (self.partial * 2))))

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
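# Illustrative sketch (not part of the original code): the effect of the
# partial-adaptive exponent used above.  Since the denominator is raised to
# partial * 2, partial = 0 reduces the step to plain momentum, partial = 0.5
# recovers the full Adam-style denominator, and intermediate values
# interpolate between the two.  The second-moment value below is an assumed
# placeholder.
denom = (1e-4) ** 0.5 + 1e-8  # sqrt(v_t) + epsilon for a hypothetical v_t
for partial in [0.0, 0.125, 0.25, 0.5]:
    print(partial, 1. / denom ** (partial * 2))  # scaling applied to m_t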
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                         K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())

    lr_t = lr * (gen_math_ops.sqrt(1. - math_ops.pow(self.beta_2, t)) /
                 (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + self.beta_g * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t_ada = p - lr_t * m_t / (gen_math_ops.sqrt(vhat_t) + self.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t_ada = p - lr_t * m_t / (gen_math_ops.sqrt(v_t) + self.epsilon)

        p_t_sgd = p - self.lr_boost * lr * m_t

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))

        new_p = m_switch(self.switch_flag, p_t_sgd, p_t_ada)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def __init__(self, session):
    self.ref_img = img_to_array(load_img(P.doodle_target_mask_path))
    self.img_nrows, self.img_ncols = self.ref_img.shape[:2]

    if K.image_data_format() == 'channels_first':
        shape = (1, P.num_colors, self.img_nrows, self.img_ncols)
    else:
        shape = (1, self.img_nrows, self.img_ncols, P.num_colors)

    self.style_image = K.variable(
        preprocess_image(P.doodle_style_img_path, self.img_nrows, self.img_ncols))
    self.target_image = tf.placeholder(shape=shape, dtype=tf.float32)
    if P.use_content_img:
        self.content_image = K.variable(
            preprocess_image(P.doodle_content_img_path, self.img_nrows, self.img_ncols))
    else:
        self.content_image = K.zeros(shape=shape)

    self.content_feature_layers = ['block5_conv2']
    # To get better generation qualities, use more conv layers for style features
    self.style_feature_layers = ['block1_conv1', 'block2_conv1', 'block3_conv1',
                                 'block4_conv1', 'block5_conv1']

    self.session = session
    K.set_session(session)

    self.image_grad = None
    self.image_model = None
    self.mask_model = None
    self.optimizer = None
    self.merged = None
    self.write_summary = None
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
        new_a = a + math_ops.square(g)  # update accumulator
        self.updates.append(state_ops.assign(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def testConditionalMaskUpdate(self):
    weight = K.variable(np.linspace(1.0, 100.0, 100), name="weights")
    mask = K.ones(weight.get_shape())
    threshold = K.zeros([])

    def linear_sparsity(step):
        sparsity_val = ops.convert_to_tensor(
            [0.0, 0.1, 0.1, 0.3, 0.3, 0.5, 0.5, 0.5, 0.5, 0.5])
        return ops.convert_to_tensor(True), sparsity_val[step]

    # Set up pruning
    p = pruning_impl.Pruning(
        pruning_vars=[(weight, mask, threshold)],
        training_step_fn=self.training_step_fn,
        pruning_schedule=linear_sparsity,
        block_size=self.block_size,
        block_pooling_type=self.block_pooling_type)

    non_zero_count = []
    for _ in range(10):
        if context.executing_eagerly():
            p.conditional_mask_update()
            p.weight_mask_op()
            state_ops.assign_add(self.global_step, 1)
        else:
            K.get_session().run(p.conditional_mask_update())
            K.get_session().run(p.weight_mask_op())
            K.get_session().run(state_ops.assign_add(self.global_step, 1))

        non_zero_count.append(np.count_nonzero(K.get_value(weight)))

    # Weights pruned at steps 1, 3, 5
    expected_non_zero_count = [100, 90, 90, 70, 70, 50, 50, 50, 50, 50]
    self.assertAllEqual(expected_non_zero_count, non_zero_count)
def _create_all_weights(self, params):
    accumulators = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params
    ]
    self.weights = accumulators
    return accumulators
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # calculate truncated step size
    global_grad_norm = tf.global_norm(grads) ** 2  # TODO: verify this
    lr_trunc = tf.cond(0 < global_grad_norm,
                       lambda: tf.divide(loss, global_grad_norm),
                       lambda: lr)
    lr = tf.minimum(lr, lr_trunc)

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(state_ops.assign(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # calculate the truncated-adagrad stepsize
    # global_grad_norm = tf.global_norm(grads)**2  # TODO: verify this
    rotated_grad_norm = 0
    for i in range(0, len(grads)):
        rotated_grad_norm += tf.reduce_sum(
            tf.multiply(tf.truediv(grads[i], tf.sqrt(accumulators[i]) + self.epsilon),
                        grads[i]))  # TODO: verify this
    lr_trunc = tf.cond(0 < rotated_grad_norm,
                       lambda: tf.divide(loss, rotated_grad_norm),
                       lambda: lr)
    lr = tf.minimum(lr, lr_trunc)

    for p, g, a in zip(params, grads, accumulators):
        new_a = a + math_ops.square(g)  # update accumulator
        self.updates.append(state_ops.assign(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(state_ops.assign(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    shapes = [K.int_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations, self.m_schedule] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # the following equations given in [1]
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
        v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
        m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))

        p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    shapes = [K.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
        p_t = p - lr_t * m_t / (u_t + self.epsilon)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(u, u_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def get_initial_state(self, inputs):
    # (samples, timesteps, rows, cols, filters)
    initial_state = K.zeros_like(inputs)
    # (samples, rows, cols, filters)
    initial_state = K.sum(initial_state, axis=1)

    shape = list(self.cell.kernel_shape)
    shape[-1] = self.cell.filters
    initial_state = self.cell.input_conv(initial_state,
                                         K.zeros(tuple(shape)),
                                         padding=self.cell.padding)

    if hasattr(self.cell.state_size, '__len__'):
        return [initial_state for _ in self.cell.state_size]
    else:
        return [initial_state]
def reset_states(self, states=None):
    if not self.stateful:
        raise AttributeError('Layer must be stateful.')
    input_shape = self.input_spec[0].shape
    state_shape = self.compute_output_shape(input_shape)
    if self.return_state:
        state_shape = state_shape[0]
    if self.return_sequences:
        state_shape = state_shape[:1].concatenate(state_shape[2:])
    if None in state_shape:
        raise ValueError('If a RNN is stateful, it needs to know '
                         'its batch size. Specify the batch size '
                         'of your input tensors: \n'
                         '- If using a Sequential model, '
                         'specify the batch size by passing '
                         'a `batch_input_shape` '
                         'argument to your first layer.\n'
                         '- If using the functional API, specify '
                         'the time dimension by passing a '
                         '`batch_shape` argument to your Input layer.\n'
                         'The same thing goes for the number of rows and '
                         'columns.')

    # helper function
    def get_tuple_shape(nb_channels):
        result = list(state_shape)
        if self.cell.data_format == 'channels_first':
            result[1] = nb_channels
        elif self.cell.data_format == 'channels_last':
            result[3] = nb_channels
        else:
            raise KeyError
        return tuple(result)

    # initialize state if None
    if self.states[0] is None:
        if hasattr(self.cell.state_size, '__len__'):
            self.states = [K.zeros(get_tuple_shape(dim))
                           for dim in self.cell.state_size]
        else:
            self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
    elif states is None:
        if hasattr(self.cell.state_size, '__len__'):
            for state, dim in zip(self.states, self.cell.state_size):
                K.set_value(state, np.zeros(get_tuple_shape(dim)))
        else:
            K.set_value(self.states[0],
                        np.zeros(get_tuple_shape(self.cell.state_size)))
    else:
        if not isinstance(states, (list, tuple)):
            states = [states]
        if len(states) != len(self.states):
            raise ValueError('Layer ' + self.name + ' expects ' +
                             str(len(self.states)) + ' states, '
                             'but it received ' + str(len(states)) +
                             ' state values. Input received: ' + str(states))
        for index, (value, state) in enumerate(zip(states, self.states)):
            if hasattr(self.cell.state_size, '__len__'):
                dim = self.cell.state_size[index]
            else:
                dim = self.cell.state_size
            if value.shape != get_tuple_shape(dim):
                raise ValueError('State ' + str(index) +
                                 ' is incompatible with layer ' + self.name +
                                 ': expected shape=' + str(get_tuple_shape(dim)) +
                                 ', found shape=' + str(value.shape))
            # TODO(anjalisridhar): consider batch calls to `set_value`.
            K.set_value(state, value)