# Ejemplo n.º 1
def main():
    """Train a BERT+CRF opinion-term extractor with a polarity softmax head.

    Builds label sequences, sanity-checks the encode/decode round trip,
    assembles a Keras model (BERT + POS-tag embedding -> CRF term head +
    softmax polarity head) with manually added losses, and trains it while
    an F1-based callback checkpoints the best weights.

    NOTE(review): depends on module-level globals not visible in this chunk
    (df_label, df_review, encode_seq, decode_seq, bert_text_to_seq,
    cal_opinion_metrics, split_viewpoints, pos_tag, token_dict, config_path,
    checkpoint_path, MAX_LEN, BATCH_SIZE, EPOCHS, POS_TAG_DIM, ...).
    """
    seq_id, seq_O, seq_P, id_to_label, id_to_term = encode_seq(
        df_label=df_label, maxlen=MAX_LEN)

    class Evaluation(Callback):
        """Per-epoch validation: decode predictions, compute opinion F1,
        and save weights whenever the best F1 so far improves."""

        def __init__(self, val_data, interval=1):
            # FIX: initialize the Keras Callback base class; the original
            # skipped super().__init__(), leaving base state unset.
            super().__init__()
            self.val_data = val_data
            self.interval = interval
            self.best_f1 = 0.

            # Gold (id, term, polarity, start, end) tuples restricted to
            # the validation review ids (val_data[0]).
            self.true_vp_val = [
                (row["id"], row["OpinionTerms"], row["Polarities"],
                 row['O_start'], row['O_end']) for rowid, row in df_label[
                     df_label['id'].isin(self.val_data[0])].iterrows()
            ]

        # FIX: mutable default `log={}` replaced with None (the argument is
        # unused; name kept as-is since Keras passes it positionally).
        def on_epoch_end(self, epoch, log=None):
            if epoch % self.interval == 0:
                # pred_model is the inference model defined later in main();
                # the closure resolves it at call time. CRF probabilities.
                o_out, p_out = pred_model.predict(
                    self.val_data[1:4], batch_size=BATCH_SIZE)
                o_pred = np.argmax(o_out, axis=2)
                p_pred = np.argmax(p_out, axis=2)

                # Raw review text for each validation id, needed to map
                # token spans back to surface strings.
                texts = [
                    df_review[df_review['id'] == i]["Reviews"].values[0]
                    for i in self.val_data[0]
                ]

                pred_vp_val = decode_seq(self.val_data[0], o_pred, p_pred,
                                         id_to_label, texts)

                precision, recall, f1 = cal_opinion_metrics(
                    pred_vp_val, self.true_vp_val)
                if f1 > self.best_f1:
                    self.best_f1 = f1
                    self.model.save_weights(
                        f'./model_op/op_model_0924_viteb.weights')
                    print(f'best = {f1}')

    tokenizer = BertTokenizer(token_dict)

    seq_input, seq_seg = bert_text_to_seq(list(df_review["Reviews"]),
                                          tokenizer,
                                          maxlen=MAX_LEN)

    # Gold tuples over the full labelled set for a baseline metric print.
    true_vp = [(row["id"], row["OpinionTerms"], row["Polarities"],
                row['O_start'], row['O_end'])
               for rowid, row in df_label.iterrows()]

    # Sanity check: decoding the gold integer label sequences should score
    # ~perfectly against the gold tuples. Must run BEFORE to_categorical.
    pred_vp = decode_seq(seq_id, seq_O, seq_P, id_to_label,
                         list(df_review["Reviews"]))

    cal_opinion_metrics(pred_vp, true_vp)

    # One-hot encode the integer label sequences for training targets.
    seq_O = to_categorical(seq_O)

    seq_P = to_categorical(seq_P)

    # POS-tag each review, then map tags to integer ids via the saved table.
    df_review['pos_tag'] = df_review['Reviews'].progress_apply(pos_tag)

    with open('./data/postag2id_0922_laptop_make_up.pkl', 'rb') as f:
        postag2id = pickle.load(f)

    df_review['pos_tag'] = df_review['pos_tag'].progress_apply(
        lambda postag: [postag2id[x] for x in postag])

    seq_postag = np.array(df_review['pos_tag'].values.tolist())

    view_train, view_val = split_viewpoints(seq_id, seq_input, seq_seg, seq_O,
                                            seq_P, seq_postag)

    print(view_val[0])
    print('------------------- 保存验证集的id ---------------------')
    print('保存final 验证集的val ids')

    # np.save('./data/final_makeup_laptop_val_ids', view_val[0])
    print('------------------- 保存完毕 ---------------------------')
    # exit()
    bert_model = load_trained_model_from_checkpoint(config_path,
                                                    checkpoint_path,
                                                    seq_len=None)
    # Fine-tune every BERT layer rather than freezing the encoder.
    for layer in bert_model.layers:
        layer.trainable = True

    # Model inputs: token ids, segment ids, target one-hots, POS-tag ids.
    x1_in = Input(shape=(MAX_LEN, ), name='x1_in')
    x2_in = Input(shape=(MAX_LEN, ), name='x2_in')
    o_in = Input(shape=(
        MAX_LEN,
        len(id_to_term) + 1,
    ), name='o_in')
    p_in = Input(shape=(
        MAX_LEN,
        len(id_to_label) + 1,
    ), name='p_in')

    pos_tag_in = Input(shape=(MAX_LEN, ), name='pos_tag_in')
    pos_tag_emb = Embedding(len(postag2id), POS_TAG_DIM,
                            trainable=True)(pos_tag_in)

    # Concatenate BERT token features with the POS-tag embedding.
    x = bert_model([x1_in, x2_in])
    x = Concatenate()([x, pos_tag_emb])

    # p_out: polarity distribution per token; o_out: CRF-tagged term spans.
    p_out = Dense(len(id_to_label) + 1,
                  activation='softmax')(x)
    crf = CRF(len(id_to_term) + 1)
    o_out = crf(x)
    # Compute the CRF loss directly; wrapping it in a Lambda layer first
    # breaks the computation graph (original author's note).
    loss_seq_O = crf.loss_function(o_in, o_out)
    loss_seq_O = Lambda(lambda x: K.mean(x))(loss_seq_O)
    # loss_seq_O = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])), name='loss_seq_O')([o_in, o_out])

    loss_p = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])),
                    name='loss_c')([p_in, p_out])

    # train_model takes the targets as inputs (loss added via add_loss);
    # pred_model shares weights but only needs the feature inputs.
    train_model = Model([x1_in, x2_in, pos_tag_in, o_in, p_in], [o_out, p_out])
    pred_model = Model([x1_in, x2_in, pos_tag_in], [o_out, p_out])
    # Clear any stale loss collections before registering the custom losses
    # (workaround for older Keras internals).
    train_model._losses = []
    train_model._per_input_losses = {}
    train_model.add_loss(loss_seq_O)
    train_model.add_loss(loss_p)

    print(view_train[0].shape[0])

    total_steps, warmup_steps = calc_train_steps(
        num_example=view_train[0].shape[0],
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        warmup_proportion=0.1,
    )
    # optimizer = Adam(lr=1e-5)
    optimizer = AdamWarmup(total_steps, warmup_steps, lr=5e-5, min_lr=1e-6)

    # No loss= argument: both losses were registered via add_loss above.
    train_model.compile(optimizer=optimizer)
    # Expose the two loss components as named training metrics.
    train_model.metrics_tensors.append(loss_seq_O)
    train_model.metrics_names.append('loss_seq_O')
    train_model.metrics_tensors.append(loss_p)
    train_model.metrics_names.append('loss_p')
    train_model.summary()

    eval_callback = Evaluation(val_data=view_val)

    # view_train[0] holds review ids; the model consumes elements 1..5.
    train_model.fit(view_train[1:],
                    epochs=EPOCHS,
                    shuffle=True,
                    batch_size=BATCH_SIZE,
                    callbacks=[eval_callback])