def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    # Bias-corrected learning rate at step t.
    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

    # First- and second-moment accumulators, one per parameter.
    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    if self.amsgrad:
        vhats = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
            for (i, p) in enumerate(params)
        ]
    else:
        vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        # The second moment tracks (g - m_t)^2, i.e. the gradient's variance
        # around its running mean, rather than plain g^2.
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
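# To make the update rule above concrete, here is a minimal single-tensor
# NumPy sketch of one step (an illustration, not the optimizer itself; note
# the centred second moment (g - m)^2 instead of plain g^2):
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-7):
    """One parameter update mirroring get_updates above."""
    m = beta1 * m + (1. - beta1) * g                   # first moment
    v = beta2 * v + (1. - beta2) * np.square(g - m)    # centred second moment
    lr_t = lr * np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)  # bias correction
    p = p - lr_t * m / (np.sqrt(v) + eps)
    return p, m, v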
def get_updates(self, loss, params):
    # Only let updates take effect every `grad_accum_steps` iterations.
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())

    # Fetch gradients and create one accumulator per parameter.
    grads = self.get_gradients(loss, params)
    self.accum_grads = [
        K.zeros(shape=K.int_shape(p), dtype=K.dtype(p), name='accum_grad_{}'.format(i))
        for i, p in enumerate(params)
    ]

    # Temporarily monkey-patch K.update so the parent optimizer's updates
    # become no-ops except on accumulation boundaries (cond == 1).
    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    K.update = new_update
    updates = super(NewOptimizer, self).get_updates(loss, params)
    K.update = old_update

    # Accumulate gradients; on boundary steps (cond == 1) the accumulator
    # is reset to the fresh gradient.
    with K.control_dependencies(updates):
        acc_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for ag, g in zip(self.accum_grads, grads)
        ]

    return acc_updates
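# Hedged sketch of how the accumulation mixin above is typically wired up
# (the factory name and defaults are assumptions; a full implementation must
# also make the parent consume the *averaged* accumulated gradients, e.g. by
# overriding get_gradients, which this skeleton omits):
from keras.optimizers import Adam

def extend_with_gradient_accumulation(BaseOptimizer):
    class NewOptimizer(BaseOptimizer):
        def __init__(self, grad_accum_steps=4, *args, **kwargs):
            super(NewOptimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps

        # get_updates as defined above goes here, unchanged.

    return NewOptimizer

AdamAcc = extend_with_gradient_accumulation(Adam)
optimizer = AdamAcc(grad_accum_steps=4, learning_rate=2e-5)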
def dense_loss(self, y_true, y_pred):
    """y_true must be in one-hot form."""
    # Derive the mask from y_pred and cast it to float.
    mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
    mask = K.cast(mask, K.floatx())
    # Score of the target path.
    y_true, y_pred = y_true * mask, y_pred * mask
    target_score = self.path_score(y_pred, y_true)
    # Compute log Z recursively along the sequence.
    init_states = [y_pred[:, 0]]
    y_pred = K.concatenate([y_pred, mask], axis=2)
    input_length = K.int_shape(y_pred[:, 1:])[1]
    log_norm, _, _ = K.rnn(
        self.log_norm_step,
        y_pred[:, 1:],
        init_states,
        input_length=input_length
    )  # log Z vector at the final step
    log_norm = K.logsumexp(log_norm, 1)  # logsumexp reduces it to a scalar
    # Loss is -log p = log Z - target path score.
    return log_norm - target_score
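# For context, a hedged sketch of the companion `path_score` used above (an
# assumption about its shape, not the library's verbatim code; self.trans is
# taken to be the [num_labels, num_labels] transition matrix):
def path_score(self, y_pred, y_true):
    # Emission score: per-position scores of the labelled tags.
    point_score = K.sum(y_pred * y_true, axis=[1, 2])  # [batch]
    # Transition score of consecutive label pairs under self.trans.
    pairs = K.expand_dims(y_true[:, :-1], 3) * K.expand_dims(y_true[:, 1:], 2)
    trans_score = K.sum(pairs * self.trans, axis=[1, 2, 3])  # [batch]
    return point_score + trans_score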
def call(self, x, mask=None):
    x0 = x
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
    # x = x0 * mask if mask is not None else x0
    # Identity Lambda drops the Keras mask so it is not fed into conv1d.
    x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)
    x = self.conv1d(x0)
    # Split the 2*o_dim channels into the output half and the gate half.
    x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
    if self.dropout_rate is not None:
        g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
    g = K.sigmoid(g)
    # If mask is None, fall back to an all-ones mask.
    mask = mask if mask is not None else K.ones_like(x)
    if self.skip_connection:
        if K.int_shape(x0)[-1] != self.o_dim:
            x0 = self.conv1d_1x1(x0)
        return (x0 * (1 - g) + x * g) * mask
    return x * g * mask
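# Hedged sketch of how the gated layer above is typically constructed (names
# follow the call() body; sublayer weight tracking is elided):
class DGCNN(Layer):
    def __init__(self, o_dim=None, k_size=3, dilation_rate=1,
                 skip_connection=True, dropout_rate=None, **kwargs):
        super(DGCNN, self).__init__(**kwargs)
        self.o_dim = o_dim
        self.k_size = k_size
        self.dilation_rate = dilation_rate
        self.skip_connection = skip_connection
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        super(DGCNN, self).build(input_shape)
        if self.o_dim is None:
            self.o_dim = input_shape[-1]
        # 2 * o_dim filters: first half is the output, second half the gate.
        self.conv1d = Conv1D(self.o_dim * 2, self.k_size, padding='same',
                             dilation_rate=self.dilation_rate)
        self.conv1d_1x1 = Conv1D(self.o_dim, 1)  # projection for the skip path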
train_generator = data_generator(data=train_data, batch_size=batch_size)
valid_generator = data_generator(data=valid_data, batch_size=batch_size)
train_transfer_generator = data_generator(
    data=train_data,
    batch_size=batch_size,
    transfer=True,
    data_augmentation=True
)

# Load the pre-trained model (3 layers).
teacher = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    num_hidden_layers=num_hidden_layers,
    model='bert'
)

# Classifier head.
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)  # take the [CLS] vector
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
teacher_model.summary()


class FastbertClassifierLayer(Layer):
# Load the pre-trained model (12 layers) as the predecessor.
predecessor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    prefix='Predecessor-'
)

# Load the pre-trained model (3 layers) as the successor.
successor = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
    num_hidden_layers=3,
    prefix='Successor-'
)

# Classifier head shared by both models: dense + CRF.
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)

opt = Adam(learning_rate=lr)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.outputs))
predecessor_model.compile(
    # Equivalent to predecessor_model.layers[-1].layers[-1].sparse_loss.
    loss=CRF.sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy]
)
predecessor_model.summary()
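# Hedged continuation (not in the original excerpt): the successor presumably
# reuses the same classifier head, mirroring the predecessor wiring.
successor_model = Model(successor.inputs, classifier(successor.outputs))
successor_model.compile(
    loss=CRF.sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy]
)
successor_model.summary()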
def call(self, inputs, mask=None):
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        # Push masked positions towards -inf so softmax ignores them.
        inputs = inputs - (1.0 - mask) * 1e12
    return K.softmax(inputs, 1)


# Build the model.
model = build_transformer_model(
    config_path,
    checkpoint_path,
)

inputs = [
    Input(shape=K.int_shape(model.inputs[0])[1:]),
    Input(shape=K.int_shape(model.inputs[1])[1:])
]

output = model(inputs)
output = SinCosPositionEmbedding(K.int_shape(output)[-1])(output)
output = Dropout(0.5)(output)
output = Dense(384, activation='tanh')(output)
att = AttentionPooling1D(name='attention_pooling_1')(output)
output = ConcatSeq2Vec()([output, att])
output = DGCNN(dilation_rate=1, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=2, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=5, dropout_rate=0.1)(output)
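# Hedged sketch of an attention-pooling layer built on the masked softmax
# above (a plausible composition; the real AttentionPooling1D may differ):
class AttentionPooling1D(Layer):
    def __init__(self, h_dim=None, **kwargs):
        super(AttentionPooling1D, self).__init__(**kwargs)
        self.h_dim = h_dim

    def build(self, input_shape):
        super(AttentionPooling1D, self).build(input_shape)
        if self.h_dim is None:
            self.h_dim = input_shape[-1]
        self.k_dense = Dense(self.h_dim, use_bias=False, activation='tanh')
        self.o_dense = Dense(1, use_bias=False)

    def call(self, x, mask=None):
        x0 = x
        w = self.o_dense(self.k_dense(x0))  # [batch, seq_len, 1] logits
        if mask is not None:
            m = K.expand_dims(K.cast(mask, K.floatx()), 2)
            w = w - (1.0 - m) * 1e12  # mask out padding positions
        w = K.softmax(w, 1)           # normalize over the sequence axis
        return K.sum(w * x0, 1)       # weighted sum -> [batch, h_dim]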
def compute_output_shape(self, input_shape):
    if self._mode == 'embedding':
        return super(Embedding, self).compute_output_shape(input_shape)
    # In 'dense' mode the layer projects back onto the vocabulary.
    return input_shape[:2] + (K.int_shape(self.embeddings)[0],)
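# For context, a hedged sketch of the matching call() for this dual-mode
# embedding (tied weights: 'dense' mode multiplies by the transposed
# embedding matrix to produce vocabulary logits):
def call(self, inputs, mode='embedding'):
    self._mode = mode
    if mode == 'embedding':
        return super(Embedding, self).call(inputs)
    return K.dot(inputs, K.transpose(self.embeddings))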
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(self.model, test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )


teacher = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False
)

x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x)(x_in)
# Softmax classifier head.
x = Dense(num_labels, activation='softmax')(x)
teacher_classifier = Model(x_in, x)

teacher_model = Model(teacher.input, teacher_classifier(teacher.output))
teacher_model.summary()
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate),
    metrics=['sparse_categorical_accuracy']
)

student = build_transformer_model(
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            self.model.save_weights(self.model_name)
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(self.model, test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )


bert = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False
)

x_in = Input(shape=K.int_shape(bert.output)[1:])
x = Lambda(lambda x: x)(x_in)
# Softmax classifier head.
x = Dense(num_labels, activation='softmax')(x)
bert_classifier = Model(x_in, x)

teacher_model = Model(bert.input, bert_classifier(bert.output))
teacher_model.summary()
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate),
    metrics=['sparse_categorical_accuracy']
)

if __name__ == '__main__':
    teacher_model_name = './best_teacher_model.weights'
    teacher_evaluator = Evaluator(teacher_model_name)
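    # Hedged continuation of the entry point (not in the original excerpt):
    # a typical fit call, assuming a bert4keras-style forfit() iterator on the
    # generator and an `epochs` variable defined elsewhere.
    teacher_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[teacher_evaluator]
    )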