def predict():
    # Build the VGG16 feature extractor (no classification head).
    vgg_model = VGG16(include_top=False, weights='imagenet')
    feature_model = Model(vgg_model.input, vgg_model.layers[-1].output)

    with open(os.path.join(BASE_DIR, "captionbot", "tokenizer.pkl"), 'rb') as f:
        tokenizer = pickle.load(f)

    encoder = Encoder(256)
    decoder = LocalAttentionDecoder(512, 5001, 256)
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # Run one dummy training step so the encoder/decoder variables are
    # created before load_weights() is called.
    img_tensor = K.zeros((64, 49, 512))
    target = K.zeros((64, 39))
    batch_loss, t_loss = train_step(img_tensor, target, decoder, encoder,
                                    tokenizer, loss_object, optimizer)

    encoder.load_weights(os.path.join(BASE_DIR, "captionbot", "encoder2.hdf5"))
    decoder.load_weights(os.path.join(BASE_DIR, "captionbot", "decoder2.hdf5"))

    # Caption the first image in the upload directory.
    img_name = os.listdir(os.path.join(BASE_DIR, "media", "captionbot"))[0]
    img_path = os.path.join(BASE_DIR, "media", "captionbot", img_name)
    result = evaluate(img_path, decoder, encoder, tokenizer, feature_model)

    # Drop the trailing '<end>' token, if present, and join the words.
    if result and result[-1] == '<end>':
        result = result[:-1]
    caption = " ".join(result)
    print(caption)
    return caption
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        g2 = K.square(g)
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        # Yogi-style second moment: additive, sign-controlled update
        # instead of Adam's exponential moving average.
        v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
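# A minimal NumPy sketch (separate from the class above; names are
# illustrative) of the sign-controlled second-moment rule used in the
# loop, v_t = v - (1 - beta_2) * sign(v - g^2) * g^2: unlike Adam's
# exponential average, v changes by a bounded additive step, so a single
# gradient spike moves it only slowly.
import numpy as np

beta_2 = 0.999
v = 0.01
for g in [0.1, 0.1, 5.0, 0.1]:  # a spike in the gradient stream
    g2 = g ** 2
    v = v - (1.0 - beta_2) * np.sign(v - g2) * g2
    print(v)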
def build(self, input_shape):
    assert len(input_shape) >= 2
    self.input_dim = input_shape[-1]

    self.kernel = self.add_weight(shape=(self.input_dim, self.units),
                                  initializer=self.kernel_initializer,
                                  name='kernel',
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    self.sigma_kernel = self.add_weight(
        shape=(self.input_dim, self.units),
        initializer=initializers.Constant(value=self.sigma_init),
        name='sigma_kernel')

    if self.use_bias:
        self.bias = self.add_weight(shape=(self.units,),
                                    initializer=self.bias_initializer,
                                    name='bias',
                                    regularizer=self.bias_regularizer,
                                    constraint=self.bias_constraint)
        self.sigma_bias = self.add_weight(
            shape=(self.units,),
            initializer=initializers.Constant(value=self.sigma_init),
            name='sigma_bias')
    else:
        self.bias = None

    # Noise buffers, resampled by sample_noise().
    self.epsilon_kernel = K.zeros(shape=(self.input_dim, self.units))
    self.epsilon_bias = K.zeros(shape=(self.units,))
    self.sample_noise()
    super(NoisyDense, self).build(input_shape)
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Applies bounds on the actual learning rate.
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Apply weight decay to the gradient.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds and clip the per-parameter step into them.
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(
            K.maximum(step_size_p_bound, lower_bound), upper_bound)
        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
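# A hedged plain-Python sketch of the bound schedule computed above:
# `final_lr` and `gamma` mirror the attributes used in get_updates().
# Both bounds squeeze toward final_lr as t grows, so the clipped
# per-parameter step gradually turns the Adam-style update into an
# SGD-like one. The constants are illustrative.
final_lr, gamma = 0.1, 1e-3
for t in [1.0, 10.0, 1000.0, 100000.0]:
    lower_bound = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper_bound = final_lr * (1.0 + 1.0 / (gamma * t))
    print(t, lower_bound, upper_bound)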
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [(self.iterations, self.iterations + 1)]

    t = self.iterations + 1
    lr_t = self.lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
        1. - K.pow(self.beta_1, t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = ms + vs

    # flag == 1 only on steps where the accumulated gradient is applied.
    flag = K.equal(t % self.accum_iters, 0)
    flag = K.cast(flag, dtype='float32')

    for p, g, m, v, gg in zip(params, grads, ms, vs, gs):
        # Accumulate gradients; reset the accumulator on apply steps.
        gg_t = (1 - flag) * (gg + g)
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * (
            gg + flag * g) / self.accum_iters
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(
            (gg + flag * g) / self.accum_iters)
        p_t = p - flag * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append((m, flag * m_t + (1 - flag) * m))
        self.updates.append((v, flag * v_t + (1 - flag) * v))
        self.updates.append((gg, gg_t))

        # Apply constraints.
        new_p = p_t
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
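# Plain-Python illustration of the gating above: `flag` is 1 only when
# t % accum_iters == 0, so the Adam moments and the parameters move once
# per accumulation window while `gg` collects the in-between gradients.
# The gradient stream below is illustrative.
accum_iters = 4
gg = 0.0  # gradient accumulator, as in the loop above
for t, g in enumerate([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], start=1):
    flag = float(t % accum_iters == 0)
    applied = (gg + flag * g) / accum_iters  # mean gradient seen by Adam
    gg = (1 - flag) * (gg + g)               # reset after an apply step
    if flag:
        print(t, applied)  # prints 2.5 and 6.5, the window means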
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    adam_lr = self.adam_lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        adam_lr = adam_lr * (1. / (1. + self.decay * K.cast(
            self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    adam_lr_t = adam_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                           (1. - K.pow(self.beta_1, t)))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    # Adam moments are kept only for the first parameter (the E2EFS weights).
    self.ms = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.vs = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.weights = [self.iterations] + moments + vhats + [self.ms] + [self.vs]

    for i, (p, g, m, vhat) in enumerate(zip(params, grads, moments, vhats)):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # The feature-selection weights use the Adam step until enough
        # units have been zeroed out, then switch to the SGD step above.
        if i == 0 and self.e2efs_layer is not None:
            nnz = K.sum(K.cast(K.greater(p, 0.), K.floatx()))
            m_t = (self.beta_1 * self.ms) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * self.vs) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - adam_lr_t * m_t / (K.sqrt(vhat_t) + K.epsilon())
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - adam_lr_t * m_t / (K.sqrt(v_t) + K.epsilon())
            self.updates.append(K.update(self.ms, m_t))
            self.updates.append(K.update(self.vs, v_t))
            new_p = K.switch(K.less_equal(nnz, self.e2efs_layer.units),
                             new_p, p_t)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers, matched by substring of the var name.
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers
                          if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(
                    multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def update_1(self, params):
    """First update on CPU or GPU."""
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    return ms, vs, vhats
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    t = math_ops.cast(self.iterations, K.floatx()) + 1
    lr_t = self.lr
    if self.initial_decay > 0:
        lr_t = lr_t * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
    lr_t = lr_t * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                   (1. - K.pow(self.beta_1, t)))

    # Bounds on the effective step, shrinking toward final_lr over time.
    final_lr = self.final_lr * lr_t / self.base_lr
    lower_bound = final_lr * (1 - 1 / ((1 - self.beta_2) * (t + 1)))
    upper_bound = final_lr * (1 + 1 / ((1 - self.beta_2) * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Apply weight decay to the gradient before the moment updates.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = K.sqrt(vhat_t) + self.epsilon
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            denom = K.sqrt(v_t) + self.epsilon

        eta_hat = tf.clip_by_value(lr_t / denom, lower_bound, upper_bound)
        # eta = eta_hat / K.sqrt(t)
        p_t = p - m_t * eta_hat

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def _update_1(self, params):
    """Perform the first update.

    Run under a CPU context if running on TensorFlow with CPU mode
    enabled; otherwise run on the default device.
    """
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    return ms, vs, vhats
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Bias corrections according to the Adam paper.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # Apply the lr multiplier to all vars outside excluded_vars.
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Schedule multiplier eta_t = 1 for simple AdamW. According to
        # the AdamW paper, eta_t can be fixed, decayed, or also be used
        # for warm restarts (AdamWR to come).
        eta_t = 1.
        p_t = p - eta_t * (multiplied_lr_t * m_t /
                           (K.sqrt(v_t) + self.epsilon))

        if self.weight_decay != 0:
            # Normalized weight decay according to the AdamW paper.
            w_d = self.weight_decay * K.sqrt(
                self.batch_size / (self.samples_per_epoch * self.epochs))
            p_t = p_t - eta_t * (w_d * p)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
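# Small sketch of the normalized weight decay used above: the raw decay
# coefficient is scaled by sqrt(b / (B * T)) with b the batch size, B
# the samples per epoch, and T the number of epochs, so one setting
# transfers across training lengths. The numbers below are illustrative.
import math

weight_decay, batch_size, samples_per_epoch, epochs = 0.025, 32, 50000, 100
w_d = weight_decay * math.sqrt(batch_size / (samples_per_epoch * epochs))
print(w_d)  # per-step factor applied as p_t -= eta_t * w_d * p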
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.weights = [self.iterations]
    lr = self.learning_rate

    for i, (p, g) in enumerate(zip(params, grads)):
        g2 = K.square(g) + self.epsilon1
        shape, dtype = K.int_shape(p), K.dtype(p)
        factored_shape = self.factored_shape(shape)
        if factored_shape is None:
            # Define the second-moment accumulator.
            v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
            self.weights.append(v)
            # Define its update.
            v_t = self.beta2 * v + (1.0 - self.beta2) * g2
            self.updates.append(K.update(v, v_t))
        else:
            # Define factored (row/column) accumulators.
            shape1, axis1, shape2, axis2 = factored_shape
            vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
            vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
            self.weights.extend([vr, vc])
            # Define their updates.
            g2r = K.mean(g2, axis=axis1, keepdims=True)
            g2c = K.mean(g2, axis=axis2, keepdims=True)
            vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r
            vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c
            self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
            # Reassemble the full matrix from the factors.
            v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)

        # Core update direction.
        u = g / K.sqrt(v_t)

        # Update clipping.
        if self.clipping_threshold is not None:
            u_rms = self.rms(u)
            d = self.clipping_threshold
            u = u / K.maximum(1.0, u_rms / d)

        # First-moment (momentum) smoothing.
        if self.beta1 > 0.0:
            # Define the momentum accumulator.
            m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
            self.weights.append(m)
            # Define its update.
            m_t = self.beta1 * m + (1.0 - self.beta1) * u
            self.updates.append(K.update(m, m_t))
            u = m_t

        # Scale the update by the parameter scale.
        if self.multiply_by_parameter_scale:
            u = u * K.maximum(self.rms(p), self.epsilon2)

        # Apply the parameter update.
        self.updates.append(K.update(p, p - lr * u))
    return self.updates
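# NumPy sketch of the factored second moment reconstructed above for a
# 2-D parameter: row and column accumulators are combined as
# v = vr * vc / mean(vr), a rank-1 approximation that stores O(n + m)
# statistics instead of the full O(n * m) matrix Adam would keep.
import numpy as np

g = np.random.randn(4, 6)
g2 = g ** 2 + 1e-30
vr = g2.mean(axis=1, keepdims=True)            # row stats, shape (4, 1)
vc = g2.mean(axis=0, keepdims=True)            # column stats, shape (1, 6)
v = vr * vc / vr.mean(axis=0, keepdims=True)   # broadcast back to (4, 6)
print(v.shape)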
def rel_to_abs(self, x):
    # Convert relative-position logits [B, Nh, L, 2L-1] into
    # absolute-position logits [B, Nh, L, L] via pad-and-reshape.
    shape = K.shape(x)
    shape = [shape[i] for i in range(3)]
    B, Nh, L = shape
    col_pad = K.zeros(K.stack([B, Nh, L, 1]))
    x = K.concatenate([x, col_pad], axis=3)
    flat_x = K.reshape(x, [B, Nh, L * 2 * L])
    flat_pad = K.zeros(K.stack([B, Nh, L - 1]))
    flat_x_padded = K.concatenate([flat_x, flat_pad], axis=2)
    final_x = K.reshape(flat_x_padded, [B, Nh, L + 1, 2 * L - 1])
    final_x = final_x[:, :, :L, L - 1:]
    return final_x
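# A plain-NumPy reference for the pad-and-reshape trick above: it turns
# [B, Nh, L, 2L-1] relative-position logits into [B, Nh, L, L] absolute
# logits without an explicit gather. The dimensions below are
# illustrative; the slicing mirrors rel_to_abs().
import numpy as np

B, Nh, L = 2, 4, 5
x = np.random.randn(B, Nh, L, 2 * L - 1)
x = np.concatenate([x, np.zeros((B, Nh, L, 1))], axis=3)  # [B, Nh, L, 2L]
flat_x = x.reshape(B, Nh, L * 2 * L)
flat_x = np.concatenate([flat_x, np.zeros((B, Nh, L - 1))], axis=2)
final_x = flat_x.reshape(B, Nh, L + 1, 2 * L - 1)[:, :, :L, L - 1:]
print(final_x.shape)  # (2, 4, 5, 5)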
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = math_ops.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
                 (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Apply the lr multiplier to all vars outside excluded_vars.
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        if self.amsgrad:
            vhat_t = math_ops.maximum(vhat, v_t)
            p_t = p - multiplied_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(state_ops.assign(vhat, vhat_t))
        else:
            p_t = p - multiplied_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
def call(self, inputs):
    # Initial states (all zeros).
    initial_states = [
        K.zeros((K.shape(inputs)[0], self.units)),
        K.zeros((K.shape(inputs)[0], self.units))
    ]
    outputs = K.rnn(self.one_step, inputs, initial_states)
    self.distance = 1 - K.mean(
        outputs[1][..., self.units:self.units + self.levels], -1)
    self.distance_in = K.mean(outputs[1][..., self.units + self.levels:], -1)
    if self.return_sequences:
        return outputs[1][..., :self.units]
    else:
        return outputs[0][..., :self.units]
def get_updates(self, loss, params):
    # Mostly the same code as the Adam class, with added multiplier
    # variables. Keras code from:
    # https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/optimizers.py#L456
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1.0 / (1.0 + self.decay * K.cast(self.iterations,
                                                    K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1.0 - K.pow(self.beta_2, t)) /
                 (1.0 - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Look up a per-layer multiplier by the layer-name prefix.
        layername = p.name.split("/", 1)[0]
        mult = self.multipliers.get(layername, 1.0)

        m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - mult * lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - mult * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, "constraint", None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
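# Illustration of the multiplier lookup above: the dict keys are
# layer-name prefixes, matched against everything before the first '/'
# in the variable name. The names below are hypothetical.
multipliers = {"dense_1": 0.1}
for name in ["dense_1/kernel:0", "dense_1/bias:0", "dense_2/kernel:0"]:
    layername = name.split("/", 1)[0]
    print(name, multipliers.get(layername, 1.0))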
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (  # pylint: disable=g-no-augmented-assignment
            1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                  K.dtype(self.decay))))

    with ops.control_dependencies(
            [state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr * (K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
                 (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    ss = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    # if self.amsgrad is True:
    #     vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    # else:
    vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + ss + vhats

    for p, g, m, v, s, vhat in zip(params, grads, ms, vs, ss, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        # if self.amsgrad is True:
        #     vhat_t = math_ops.maximum(vhat, v_t)
        #     miu_t = lr_t / (K.sqrt(vhat_t) + self.epsilon)
        #     p_t = p - miu_t * m_t
        #     self.updates.append(state_ops.assign(vhat, vhat_t))
        # else:
        # Clamp the adaptive rate by its own moving average (AdaMod-style).
        miu_t = lr_t / (K.sqrt(v_t) + self.epsilon)
        s_t = self.beta_3 * s + (1 - self.beta_3) * miu_t
        miu_t_hat = math_ops.minimum(miu_t, s_t)
        p_t = p - miu_t_hat * m_t

        self.updates.append(state_ops.assign(m, m_t))
        self.updates.append(state_ops.assign(v, v_t))
        self.updates.append(state_ops.assign(s, s_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(state_ops.assign(p, new_p))
    return self.updates
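# Plain-Python sketch of the rate clamping above (AdaMod-style): the raw
# adaptive rate miu_t is bounded by its own exponential moving average
# s_t, which suppresses the unstable, oversized rates of early steps.
# beta_3 and the rate stream are illustrative values.
beta_3, s = 0.9, 0.0
for miu in [5.0, 4.0, 0.5, 0.6]:   # per-step adaptive rates
    s = beta_3 * s + (1 - beta_3) * miu
    print(min(miu, s))             # large early rates get clamped to s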
def updated_get_updates(self, loss, params):
    self.accumulate_gradient_accumulators = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    if ema_decay > 0:
        self.params_for_ema_tracking = params
        self.params_ema = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
        ]

    updates_accumulated_iterations = K.update_add(
        self.accumulated_iterations, 1)
    new_grads = orig_get_gradients(loss, params)
    if not accumulate_sum_or_mean:
        new_grads = [
            g / K.cast(self.update_params_frequency, K.dtype(g))
            for g in new_grads
        ]
    self.updated_grads = [
        K.update_add(p, g)
        for p, g in zip(self.accumulate_gradient_accumulators, new_grads)
    ]

    def update_function():
        # Apply the original optimizer updates, then reset the
        # accumulators (and refresh the EMA copies if enabled).
        with tensorflow.control_dependencies(orig_get_updates(loss, params)):
            reset_grads = [
                K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
                for p in self.accumulate_gradient_accumulators
            ]
            if ema_decay > 0:
                reset_grads += [K.update_add(self.total_iterations, 1)]
                reset_grads += [
                    K.update(e_p, (e_p * ema_decay) + (1 - ema_decay) * p)
                    for e_p, p in zip(self.params_ema, params)
                ]
        return tensorflow.group(*(reset_grads +
                                  [updates_accumulated_iterations]))

    def just_store_function():
        return tensorflow.group(*[updates_accumulated_iterations])

    # Apply parameter updates only once every update_params_frequency steps.
    update_switch = K.equal(
        updates_accumulated_iterations % self.update_params_frequency, 0)

    with tensorflow.control_dependencies(self.updated_grads):
        self.updates = [
            K.switch(update_switch, update_function, just_store_function)
        ]
    return self.updates
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    shapes = [K.shape(p) for p in params]
    alphas = [K.variable(K.ones(shape) * self.init_alpha)
              for shape in shapes]
    old_grads = [K.zeros(shape) for shape in shapes]
    prev_weight_deltas = [K.zeros(shape) for shape in shapes]
    self.weights = alphas + old_grads
    self.updates = []

    for param, grad, old_grad, prev_weight_delta, alpha in zip(
            params, grads, old_grads, prev_weight_deltas, alphas):
        # Equation 4: grow the step size while the gradient keeps its
        # sign, shrink it on a sign flip.
        new_alpha = K.switch(
            K.greater(grad * old_grad, 0),
            K.minimum(alpha * self.scale_up, self.max_alpha),
            K.switch(K.less(grad * old_grad, 0),
                     K.maximum(alpha * self.scale_down, self.min_alpha),
                     alpha))

        # Equation 5: step against the gradient sign.
        new_delta = K.switch(
            K.greater(grad, 0), -new_alpha,
            K.switch(K.less(grad, 0), new_alpha, K.zeros_like(new_alpha)))

        # Equation 7: backtrack the previous step on a sign flip.
        weight_delta = K.switch(K.less(grad * old_grad, 0),
                                -prev_weight_delta, new_delta)

        # Equation 6.
        new_param = param + weight_delta

        # Reset gradient_{t-1} to 0 if the gradient sign changed (so that
        # we do not "double punish", see the paragraph after equation 7).
        grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad),
                        grad)

        # Apply constraints.
        # if param in constraints:
        #     c = constraints[param]
        #     new_param = c(new_param)

        self.updates.append(K.update(param, new_param))
        self.updates.append(K.update(alpha, new_alpha))
        self.updates.append(K.update(old_grad, grad))
        self.updates.append(K.update(prev_weight_delta, weight_delta))
    return self.updates
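# Scalar sketch of the Rprop step-size rule above: alpha grows by
# scale_up while the gradient sign is stable and shrinks by scale_down
# on a sign flip, with the previous gradient zeroed after a flip to
# avoid double punishment. The constants are illustrative.
scale_up, scale_down = 1.2, 0.5
min_alpha, max_alpha = 1e-6, 50.0
alpha, old_g = 0.1, 0.0
for g in [1.0, 1.0, 1.0, -1.0, -1.0]:
    prod = g * old_g
    if prod > 0:
        alpha = min(alpha * scale_up, max_alpha)
    elif prod < 0:
        alpha = max(alpha * scale_down, min_alpha)
    old_g = 0.0 if prod < 0 else g
    print(round(alpha, 4))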
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Bias-corrected moments.
        m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

        p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))
        if self.weight_decay > 0.:
            wd = self.weight_decay * p
            p_dash = p_dash + wd

        # Layer-wise trust ratio: ||p|| / ||update||, with a fallback of 1.
        r1 = K.sqrt(K.sum(K.square(p)))
        r2 = K.sqrt(K.sum(K.square(p_dash)))
        r = tf.where(tf.greater(r1, 0.),
                     tf.where(tf.greater(r2, 0.), r1 / r2, 1.0), 1.0)
        # r = r1 / r2
        eta = r * lr
        p_t = p - eta * p_dash

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
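# NumPy sketch of the layer-wise trust ratio computed above (as in
# LARS/LAMB-style optimizers): r = ||p|| / ||p_dash|| rescales the step
# so each layer moves a distance proportional to its own weight norm,
# falling back to 1 when either norm is zero. Values are illustrative.
import numpy as np

p = 0.05 * np.random.randn(100)      # layer weights
p_dash = np.random.randn(100)        # Adam direction plus weight decay
r1, r2 = np.linalg.norm(p), np.linalg.norm(p_dash)
r = r1 / r2 if r1 > 0 and r2 > 0 else 1.0
print(r)  # the update above uses eta = r * lr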
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.inital_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
        1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    # f tracks a smoothed loss estimate, d the relative-change factor.
    f = K.variable(0)
    d = K.variable(1)
    self.weights = [self.iterations] + ms + vs + [f, d]

    cond = K.greater(t, K.variable(1))
    small_delta_t = K.switch(K.greater(loss, f), self.small_k + 1,
                             1. / (self.big_K + 1))
    big_delta_t = K.switch(K.greater(loss, f), self.big_K + 1,
                           1. / (self.small_k + 1))

    c_t = K.minimum(K.maximum(small_delta_t, loss / (f + self.epsilon)),
                    big_delta_t)
    f_t = c_t * f
    r_t = K.abs(f_t - f) / (K.minimum(f_t, f))
    d_t = self.beta_3 * d + (1 - self.beta_3) * r_t

    f_t = K.switch(cond, f_t, loss)
    d_t = K.switch(cond, d_t, K.variable(1.))

    self.updates.append(K.update(f, f_t))
    self.updates.append(K.update(d, d_t))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        # d_t modulates the denominator based on loss progress.
        p_t = p - lr_t * m_t / (d_t * K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Clip the new value if a clip range is registered for this
        # variable name.
        clptrkey = set_pattern_find(p.name, self.clips.keys())
        if self.clips_val and clptrkey:
            c = K.eval(self.clips[clptrkey])
            if self.verbose > 0:
                print("Clipping variable", p.name, " to ", c)
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
def call(self, x):
    # Build a Gaussian kernel whose radius is derived from sigma.
    truncate = 4.0
    kernel_size = K.round(truncate * self.sigma[0] + 0.5)
    dim = K.eval(kernel_size)
    tensor_range = K.arange(dim[0], dtype='float32')
    x_distance = K.concatenate(
        [-1 * tensor_range, K.zeros(1), tensor_range])
    x_shape = K.int_shape(x_distance)
    y_distance = K.reshape(x_distance, (1, x_shape[0]))

    result = (K.square(x_distance) +
              K.square(y_distance)) / (2.0 * K.square(self.sigma[0]))
    result = K.exp(-1 * result)
    result = result / K.sum(result)  # normalize the kernel to sum to 1

    # Shape the kernel for a depthwise convolution over 3 channels.
    result = K.expand_dims(result, axis=-1)
    result = K.repeat_elements(result, 3, axis=-1)
    result = K.expand_dims(result)

    conv_result = tf.nn.depthwise_conv2d(x, result, (1, 1, 1, 1),
                                         padding='SAME')
    # Unsharp mask: boost the difference between the image and its blur.
    return x + (self.amount[0] * (x - conv_result))
def update_function():
    # Apply the wrapped optimizer's updates, then zero the accumulators.
    with tf.control_dependencies(orig_get_updates(loss, params)):
        reset_grads = [
            K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
            for p in self.accumulated_grads
        ]
    return tf.group(*(reset_grads + [iters]))
def new_get_updates(self, loss, params):
    self.accumulated_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    iters = K.update_add(self.accumulated_iters, 1)
    new_grads = orig_get_gradients(loss, params)
    if do_mean:
        new_grads = [g / K.cast(iter_size, K.dtype(g)) for g in new_grads]
    self.updated_grads = [
        K.update_add(p, g)
        for p, g in zip(self.accumulated_grads, new_grads)
    ]

    def update_function():
        # Apply the wrapped optimizer's updates, then zero the accumulators.
        with tf.control_dependencies(orig_get_updates(loss, params)):
            reset_grads = [
                K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
                for p in self.accumulated_grads
            ]
        return tf.group(*(reset_grads + [iters]))

    def just_store_function():
        return tf.group(*[iters])

    # Run the real update only once every iter_size steps.
    update_switch = K.equal(iters % iter_size, 0)
    with tf.control_dependencies(self.updated_grads):
        self.updates = [
            K.switch(update_switch, update_function, just_store_function)
        ]
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        # Per-layer learning-rate multipliers, matched by substring.
        matched_layer = [x for x in self.lr_multipliers.keys()
                         if x in p.name]
        if matched_layer:
            new_lr = lr * self.lr_multipliers[matched_layer[0]]
        else:
            new_lr = lr

        v = self.momentum * m - new_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - new_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def context_enhancement_module(x1, x2, x3, size, name='cem_block'):
    x1 = conv1x1(x1, in_channels=x1.shape[3], out_channels=245, strides=1,
                 groups=1, use_bias=True, name='{}/c4_lat'.format(name))

    x2 = nn.Lambda(lambda img: tf.image.resize_bilinear(
        img, [20, 20], align_corners=True,
        name='{}/c5_resize'.format(name)))(x2)
    x2 = conv1x1(x2, in_channels=x2.shape[3], out_channels=245, strides=1,
                 groups=1, use_bias=True, name='{}/c5_lat'.format(name))

    # Broadcast the global feature vector to the spatial size by adding
    # a zero tensor of the target shape.
    zero = K.zeros((1, size, size, 528))
    x3 = nn.Lambda(lambda img: nn.add([img, zero]))(x3)
    x3 = conv1x1(x3, in_channels=x3.shape[3], out_channels=245, strides=1,
                 groups=1, use_bias=True, name='{}/c_glb_lat'.format(name))

    return nn.add([x1, x2, x3])
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p))
                    for p in params]
    self.weights = accumulators
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
        # Update the accumulator.
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Clip the new value if a clip range is registered for this
        # variable name.
        clptrkey = set_pattern_find(p.name, self.clips.keys())
        if self.clips_val and clptrkey:
            c = K.eval(self.clips[clptrkey])
            if self.verbose > 0:
                print("Clipping variable", p.name, " to ", c)
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
def test_DSSIM_channels_first():
    prev_data = K.image_data_format()
    K.set_image_data_format('channels_first')
    for input_dim, kernel_size in zip([32, 33], [2, 3]):
        input_shape = [3, input_dim, input_dim]
        X = np.random.random_sample(4 * input_dim * input_dim * 3)
        X = X.reshape([4] + input_shape)
        y = np.random.random_sample(4 * input_dim * input_dim * 3)
        y = y.reshape([4] + input_shape)

        model = Sequential()
        model.add(Conv2D(32, (3, 3), padding='same',
                         input_shape=input_shape, activation='relu'))
        model.add(Conv2D(3, (3, 3), padding='same',
                         input_shape=input_shape, activation='relu'))
        adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
        model.compile(loss=DSSIMObjective(kernel_size=kernel_size),
                      metrics=['mse'], optimizer=adam)
        model.fit(X, y, batch_size=2, epochs=1, shuffle='batch')

        # Test same
        x1 = K.constant(X, 'float32')
        x2 = K.constant(X, 'float32')
        dssim = DSSIMObjective(kernel_size=kernel_size)
        assert_allclose(0.0, K.eval(dssim(x1, x2)), atol=1e-4)

        # Test opposite
        x1 = K.zeros([4] + input_shape)
        x2 = K.ones([4] + input_shape)
        dssim = DSSIMObjective(kernel_size=kernel_size)
        assert_allclose(0.5, K.eval(dssim(x1, x2)), atol=1e-4)

    K.set_image_data_format(prev_data)
def new_get_updates(self, loss, params):
    self.accumulated_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    update_iter = [K.update_add(self.accumulated_iterations, 1)]
    new_grads = self.orig_get_gradients(loss, params)
    if self.do_mean:
        new_grads = [
            g / K.cast(self.iter_size, K.dtype(g)) for g in new_grads
        ]
    update_grads = [
        K.update_add(p, g)
        for p, g in zip(self.accumulated_grads, new_grads)
    ]

    def update_func():
        with tf.control_dependencies(self.orig_get_updates(loss, params)):
            reset_grads = [
                K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
                for p in self.accumulated_grads
            ]
        return tf.group(*(reset_grads + update_iter))

    def just_iter_func():
        return tf.group(*update_iter)

    # Do the original get_updates() computations only once every
    # 'iter_size' iterations.
    update_switch = K.equal(self.accumulated_iterations % self.iter_size, 0)
    with tf.control_dependencies(update_grads):
        self.updates = [
            K.switch(update_switch, update_func, just_iter_func)
        ]
    return self.updates
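# Plain-NumPy rehearsal of the accumulate/apply cadence above: gradients
# scaled by 1/iter_size are summed every step, and the real update (plus
# the accumulator reset) fires only when the step count is a multiple of
# iter_size. iter_size and the gradients are illustrative.
import numpy as np

iter_size = 4
acc = np.zeros(3)
for step in range(1, 9):
    g = np.ones(3) / iter_size      # do_mean=True scaling
    acc += g
    if step % iter_size == 0:       # update_switch fires
        print(step, acc)            # the mean gradient over the window
        acc[:] = 0.0                # reset_grads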