def get_updates(self, loss, params):
    # update condition: apply the real update only once every grad_accum_steps iterations
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    # compute gradients
    grads = self.get_gradients(loss, params)
    self.accum_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i)
        for i, p in enumerate(params)
    ]

    # temporarily patch K.update so that the parent optimizer's updates
    # only take effect when cond is True
    old_update = K.update

    def new_update(x, new_x):
        new_x = K.switch(cond, new_x, x)
        return old_update(x, new_x)

    K.update = new_update
    updates = super(new_optimizer, self).get_updates(loss, params)
    K.update = old_update

    # accumulate gradients: reset to the fresh gradient on update steps,
    # otherwise keep summing
    with tf.control_dependencies(updates):
        accum_updates = [
            K.update(ag, K.switch(cond, g, ag + g))
            for g, ag in zip(grads, self.accum_grads)
        ]

    return accum_updates
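For context, a minimal sketch of how this get_updates would typically be wired up: a class factory that subclasses an existing Keras optimizer and overrides get_updates with the method shown. The factory name extend_with_gradient_accumulation and the grad_accum_steps constructor argument are assumptions, chosen to match the attributes the method reads.

# Minimal usage sketch (assumed names: extend_with_gradient_accumulation,
# grad_accum_steps); imports below are the ones the method above relies on.
import tensorflow as tf
from keras import backend as K
from keras.optimizers import Adam


def extend_with_gradient_accumulation(BaseOptimizer):
    class new_optimizer(BaseOptimizer):
        def __init__(self, grad_accum_steps=1, **kwargs):
            super(new_optimizer, self).__init__(**kwargs)
            self.grad_accum_steps = grad_accum_steps

        # get_updates as defined above is placed here

    return new_optimizer


AccumAdam = extend_with_gradient_accumulation(Adam)
optimizer = AccumAdam(grad_accum_steps=4, lr=1e-3)
# model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)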
def call(self, inputs):
    if not hasattr(self, 'kernel'):
        # lazily tie the kernel to the transposed embedding matrix
        embedding_layer = search_layer(inputs, self.embedding_name)
        if embedding_layer is None:
            raise Exception('Embedding layer not found')
        self.kernel = K.transpose(embedding_layer.embeddings)
        self.units = K.int_shape(self.kernel)[1]
        if self.use_bias:
            self.bias = self.add_weight(
                name='bias', shape=(self.units,), initializer='zeros'
            )
    outputs = K.dot(inputs, self.kernel)
    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias)
    outputs = self.activation(outputs)
    return outputs
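A minimal sketch of the layer that could own this call method: an output projection whose weight matrix is tied to the transpose of an Embedding layer's weights. The class name EmbeddingDense, the constructor signature, and the behaviour of the search_layer helper (walking back through the graph from inputs to find the layer named embedding_name) are assumptions inferred from how the method uses them.

# Minimal sketch (class name and constructor are assumptions); the call()
# above is the interesting part and would replace the stub comment below.
from keras import activations
from keras.layers import Layer


class EmbeddingDense(Layer):
    def __init__(self, embedding_name, activation='softmax', use_bias=True, **kwargs):
        super(EmbeddingDense, self).__init__(**kwargs)
        self.embedding_name = embedding_name  # name of the Embedding layer to tie to
        self.activation = activations.get(activation)
        self.use_bias = use_bias

    # call() as defined above is placed here

    def compute_output_shape(self, input_shape):
        # self.units is set lazily in call(), once the tied kernel is known
        return input_shape[:-1] + (self.units,)


# Hypothetical usage:
# x = Embedding(vocab_size, hidden_dim, name='token_embedding')(token_ids)
# ...
# probs = EmbeddingDense('token_embedding')(hidden_states)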
def get_updates(self, loss, params):
    updates = super(new_optimizer, self).get_updates(loss, params)

    k, alpha = self.steps_per_slow_update, self.slow_step_size
    cond = K.equal(self.iterations % k, 0)

    slow_vars = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i)
        for i, p in enumerate(params)
    ]

    with tf.control_dependencies(updates):
        # every k steps, move the slow weights towards the fast weights
        slow_updates = [
            K.update(q, K.switch(cond, q + alpha * (p - q), q))
            for p, q in zip(params, slow_vars)
        ]
        with tf.control_dependencies(slow_updates):
            # then copy the slow weights back into the fast weights
            copy_updates = [
                K.update(p, K.switch(cond, q, p))
                for p, q in zip(params, slow_vars)
            ]

    return copy_updates
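As with gradient accumulation, this Lookahead-style get_updates would normally be injected through a class factory. The sketch below shows one way to do that; the name extend_with_lookahead and the two hyperparameter arguments are assumptions matching the attributes read above.

# Minimal usage sketch (assumed names: extend_with_lookahead,
# steps_per_slow_update, slow_step_size).
from keras.optimizers import Adam


def extend_with_lookahead(BaseOptimizer):
    class new_optimizer(BaseOptimizer):
        def __init__(self, steps_per_slow_update=5, slow_step_size=0.5, **kwargs):
            super(new_optimizer, self).__init__(**kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        # get_updates as defined above is placed here

    return new_optimizer


LookaheadAdam = extend_with_lookahead(Adam)
optimizer = LookaheadAdam(steps_per_slow_update=5, slow_step_size=0.5, lr=1e-3)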