def get_updates(self, loss, params):
    """Keras optimizer `get_updates` with gradient accumulation.

    Parameter updates produced by the parent optimizer only take effect
    once every `grad_accum_steps` iterations; in between, gradients are
    summed into `self.accum_grads`.
    """
    # Update criterion: true once every `grad_accum_steps` iterations.
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    # Raw gradients of the current step.
    grads = self.get_gradients(loss, params)
    # One accumulator variable per parameter, same shape and dtype.
    self.accum_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i)
        for i, p in enumerate(params)
    ]
    # Temporarily monkey-patch K.update so the parent's updates are
    # applied only on accumulation boundaries; otherwise each variable
    # keeps its previous value.
    old_update = K.update

    def new_update(x, new_x):
        new_x = K.switch(cond, new_x, x)
        return old_update(x, new_x)

    K.update = new_update
    updates = super(new_optimizer, self).get_updates(loss, params)
    K.update = old_update  # restore the real backend function
    # Accumulate gradients: on boundaries restart from g, else add g.
    with tf.control_dependencies(updates):
        accum_updates = [
            K.update(ag, K.switch(cond, g, ag + g))
            for g, ag in zip(grads, self.accum_grads)
        ]
    return accum_updates
def call(self, inputs):
    """Merge (optionally hierarchical) position embeddings into `inputs`.

    If `custom_position_ids` is True, `inputs` is a two-element list
    `[inputs, position_ids]` where the second tensor holds the position
    ids to use; otherwise positions 0..seq_len-1 are generated.
    """
    if self.custom_position_ids:
        # Fix: unpack BEFORE taking shapes. `inputs` is a 2-element
        # list here, and K.shape() on a list of differently-shaped
        # tensors fails (the original computed K.shape(inputs) first).
        inputs, position_ids = inputs
        if K.dtype(position_ids) != 'int32':
            position_ids = K.cast(position_ids, 'int32')
    else:
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        position_ids = K.arange(0, seq_len, dtype='int32')[None]

    if self.hierarchical:
        # Hierarchical decomposition: position p is represented as
        # alpha * E[p // n] + (1 - alpha) * E[p % n] (n = input_dim),
        # allowing positions beyond the table size.
        alpha = 0.4 if self.hierarchical is True else self.hierarchical
        embeddings = self.embeddings - alpha * self.embeddings[:1]
        embeddings = embeddings / (1 - alpha)
        embeddings_x = K.gather(embeddings, position_ids // self.input_dim)
        embeddings_y = K.gather(embeddings, position_ids % self.input_dim)
        pos_embeddings = alpha * embeddings_x + (1 - alpha) * embeddings_y
    else:
        if self.custom_position_ids:
            pos_embeddings = K.gather(self.embeddings, position_ids)
        else:
            pos_embeddings = self.embeddings[None, :seq_len]

    if self.merge_mode == 'add':
        return inputs + pos_embeddings
    elif self.merge_mode == 'mul':
        return inputs * pos_embeddings
    else:
        # Concatenation needs an explicit batch dimension; custom ids
        # already produced a batched gather result.
        if not self.custom_position_ids:
            pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
        return K.concatenate([inputs, pos_embeddings])
def call(self, inputs):
    """Merge sinusoidal (Transformer-style) position embeddings.

    If `custom_position_ids` is True, `inputs` is a two-element list
    `[inputs, position_ids]` where the second tensor holds the position
    ids; otherwise positions 0..seq_len-1 are generated.
    """
    if self.custom_position_ids:
        # Fix: unpack FIRST. `inputs` is the list [inputs, position_ids]
        # here; the original took K.shape(inputs)[1] on the list itself,
        # which cannot be converted to a tensor when the two elements
        # have different shapes.
        inputs, position_ids = inputs
        seq_len = K.shape(inputs)[1]
        if 'float' not in K.dtype(position_ids):
            position_ids = K.cast(position_ids, K.floatx())
    else:
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]

    # Sinusoidal encoding: sin/cos pairs at geometrically spaced
    # frequencies 10000^(-2i/d).
    indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
    indices = K.pow(10000.0, -2 * indices / self.output_dim)
    embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
    embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1)
    embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim))

    if self.merge_mode == 'add':
        return inputs + embeddings
    elif self.merge_mode == 'mul':
        return inputs * (embeddings + 1.0)
    else:
        # Concatenation needs an explicit batch dimension.
        if not self.custom_position_ids:
            embeddings = K.tile(embeddings, [batch_size, 1, 1])
        return K.concatenate([inputs, embeddings])
def get_updates(self, loss, params):
    """Compute parameter updates with gradient accumulation.

    `cond` is 1.0 on iterations where
    `iterations % grad_accum_steps == 0` and 0.0 otherwise; it gates
    both the parent optimizer's variable updates and the reset of the
    gradient accumulators via linear blending.
    """
    # Update criterion as a 0/1 float mask.
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())
    # Raw gradients of the current step.
    grads = self.get_gradients(loss, params)
    # One accumulator variable per parameter, same shape and dtype.
    self.accum_grads = [
        K.zeros(
            K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i
        ) for i, p in enumerate(params)
    ]
    # Temporarily patch K.update so the parent's updates take effect
    # only when cond == 1; otherwise each variable keeps its old value.
    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    K.update = new_update
    updates = super(NewOptimizer, self).get_updates(loss, params)
    K.update = old_update  # restore the real backend function
    # Accumulate gradients: when cond == 1 restart from g, otherwise
    # add g to the running sum.
    with tf.control_dependencies(updates):
        accum_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for g, ag in zip(grads, self.accum_grads)
        ]
    return accum_updates
def call(self, inputs):
    """Split `inputs` along the batch axis into `self.parts` pieces.

    A list input is split element-wise and the results are flattened
    into one list. `self.parts` is either an int (equal-sized chunks)
    or a sequence of relative weights (proportional chunks).
    """
    if isinstance(inputs, list):
        pieces = []
        for item in inputs:
            pieces.extend(self.call(item))
        return pieces

    batch_size = K.shape(inputs)[0]
    if np.ndim(self.parts) > 0:
        # Proportional split: cut points at cumulative fractions of
        # the batch size.
        total = sum(self.parts)
        float_batch = K.cast(batch_size, 'float64')
        cuts = [
            K.cast(c * float_batch / total, 'int32')
            for c in np.cumsum(self.parts).astype('float64')
        ]
    else:
        # Equal split: ceil-sized stride so every element is covered.
        stride = K.cast(
            tf.math.ceil(batch_size / self.parts), K.dtype(batch_size)
        )
        cuts = [stride * (k + 1) for k in range(self.parts)]

    last = len(cuts) - 1
    chunks = []
    for k in range(len(cuts)):
        if k == 0:
            chunks.append(inputs[:cuts[0]])
        elif k == last:
            # Final chunk takes everything past the previous cut.
            chunks.append(inputs[cuts[-2]:])
        else:
            chunks.append(inputs[cuts[k - 1]:cuts[k]])
    return chunks
def get_gradients(self, loss, params):
    """Return the averaged accumulated gradients for `params`.

    Lazily creates a zero accumulator for any parameter not yet seen,
    then returns each accumulator divided by `grad_accum_steps`.
    """
    # Ensure every parameter has an accumulator variable.
    for param in params:
        if param not in self.accum_grads:
            self.accum_grads[param] = K.zeros(
                K.int_shape(param), dtype=K.dtype(param)
            )
    # Average over the accumulation window.
    return [
        self.accum_grads[param] / self.grad_accum_steps
        for param in params
    ]
def get_updates(self, loss, params):
    """AdaFactor update rule (Shazeer & Stern, 2018).

    Keeps factored second-moment estimates (row/column means) for
    parameters whose shape `self.factored_shape` can factor, and full
    estimates otherwise; optionally clips the update by its RMS and
    scales it by the parameter's RMS.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.weights = [self.iterations]
    lr = self.learning_rate
    for i, (p, g) in enumerate(zip(params, grads)):
        # epsilon1 regularizes the squared gradient.
        g2 = K.square(g) + self.epsilon1
        shape, dtype = K.int_shape(p), K.dtype(p)
        factored_shape = self.factored_shape(shape)
        if factored_shape is None:
            # Unfactored second moment v.
            v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
            self.weights.append(v)
            v_t = self.beta2 * v + (1.0 - self.beta2) * g2
            self.updates.append(K.update(v, v_t))
        else:
            # Factored second moment: keep only row/column means.
            shape1, axis1, shape2, axis2 = factored_shape
            vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
            vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
            self.weights.extend([vr, vc])
            # Fix: include the (1 - beta2) factor, matching the
            # unfactored branch above; without it the moving averages
            # are not convex combinations and grow without bound.
            vr_t = self.beta2 * vr + \
                (1.0 - self.beta2) * K.mean(g2, axis=axis1, keepdims=True)
            vc_t = self.beta2 * vc + \
                (1.0 - self.beta2) * K.mean(g2, axis=axis2, keepdims=True)
            self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
            # Outer-product reconstruction of the full second moment.
            v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
        # Raw update direction.
        u = g / K.sqrt(v_t)
        # Update clipping by RMS (threshold d in the paper).
        if self.clipping_threshold is not None:
            # Fix: RMS is sqrt(mean(square(u))); the original computed
            # K.mean(K.sum(K.square(u))) — the squared L2 norm.
            u_rms = K.sqrt(K.mean(K.square(u)))
            d = self.clipping_threshold
            u = u / K.maximum(1.0, u_rms / d)
        # First-moment smoothing (momentum) when beta1 > 0.
        if self.beta1 > 0.0:
            m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
            self.weights.append(m)
            m_t = self.beta1 * m + (1.0 - self.beta1) * u
            self.updates.append(K.update(m, m_t))
            u = m_t
        # Relative step size: scale the update by the parameter RMS.
        if self.multiply_by_parameter_scale:
            # Fix: use RMS(p), not K.mean(K.sum(K.square(p))).
            u = u * K.maximum(K.sqrt(K.mean(K.square(p))), self.epsilon2)
        # Apply the update.
        self.updates.append(K.update(p, p - lr * u))
    return self.updates
def get_updates(self, loss, params):
    """Add gradient-accumulation bookkeeping to the wrapped optimizer.

    Builds per-parameter accumulators, lets the wrapped optimizer
    produce its own updates (reading gradients through
    `self.get_gradients`), then appends the accumulator updates and
    registers the accumulators as weights.
    """
    # Gradients of the current step.
    grads = self.get_gradients(loss, params)
    # Per-parameter gradient accumulators.
    self.accum_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    # When `self.cond` is true the accumulator is reset to zero,
    # otherwise the current gradient is added.
    # NOTE(review): `self.cond` is defined elsewhere — presumably true
    # once per accumulation cycle; confirm its definition.
    accum_updates = [
        K.update(ag, K.switch(self.cond, ag * 0, ag + g))
        for g, ag in zip(grads, self.accum_grads)
    ]
    # Inherit the wrapped optimizer's update rule, routing its gradient
    # reads through this class's `get_gradients`.
    self.optimizer.get_gradients = self.get_gradients
    super(GradientAccumulation, self).get_updates(loss, params)
    self.updates.extend(accum_updates)
    self.weights.extend(self.accum_grads)
    return self.updates
def call(self, inputs):
    """Look up position embeddings and merge them with `inputs`.

    When `custom_position_ids` is set, the layer receives
    `[inputs, position_ids]` and gathers embeddings at the given ids;
    otherwise the first `seq_len` rows of the table are used.
    """
    if self.custom_position_ids:
        inputs, position_ids = inputs
        if K.dtype(position_ids) != 'int32':
            position_ids = K.cast(position_ids, 'int32')
        pos_embeddings = K.gather(self.embeddings, position_ids)
    else:
        shape = K.shape(inputs)
        batch_size, seq_len = shape[0], shape[1]
        pos_embeddings = K.expand_dims(self.embeddings[:seq_len], 0)
        if self.merge_mode != 'add':
            # Concatenation needs an explicit batch dimension.
            pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
    if self.merge_mode == 'add':
        return inputs + pos_embeddings
    return K.concatenate([inputs, pos_embeddings])
def get_updates(self, loss, params):
    """Lookahead wrapper around the parent optimizer's updates.

    Every `steps_per_slow_update` iterations the slow weights move a
    fraction `slow_step_size` toward the fast weights, and the fast
    weights are then reset to the slow weights.
    """
    updates = super(new_optimizer, self).get_updates(loss, params)
    k = self.steps_per_slow_update
    alpha = self.slow_step_size
    # Synchronization criterion: true once every k iterations.
    sync = K.equal(self.iterations % k, 0)
    slow_vars = [
        K.zeros(
            K.int_shape(param), dtype=K.dtype(param),
            name='slow_var_%s' % idx
        )
        for idx, param in enumerate(params)
    ]
    # Slow weights update must run after the fast (inner) updates.
    with tf.control_dependencies(updates):
        slow_updates = []
        for fast, slow in zip(params, slow_vars):
            new_slow = K.switch(sync, slow + alpha * (fast - slow), slow)
            slow_updates.append(K.update(slow, new_slow))
    # Copy-back must run after the slow weights have moved.
    with tf.control_dependencies(slow_updates):
        copy_updates = [
            K.update(fast, K.switch(sync, slow, fast))
            for fast, slow in zip(params, slow_vars)
        ]
    return copy_updates
def call(self, inputs):
    """Embed integer ids and project the result with the kernel.

    Casts `inputs` to int32 if needed, gathers rows from
    `self._embeddings`, then multiplies by `self._project_kernel`.
    """
    ids = inputs
    if K.dtype(ids) != 'int32':
        ids = K.cast(ids, 'int32')
    embedded = K.gather(self._embeddings, ids)
    return K.dot(embedded, self._project_kernel)