Exemple #1
0
def eval_datasets(grt_df, sys_df) -> Tuple[Metrics, Metrics, Metrics]:
    """Sum per-predicate evaluation counts over a system/ground-truth pair.

    Each (system roles, ground-truth roles) pair produced by
    ``yield_paired_predicates(sys_df, grt_df)`` is scored with ``evaluate``,
    and the ``as_tuple()`` values of its three results are added
    element-wise into running float32 totals.

    Args:
        grt_df: Ground-truth data, passed second to
            ``yield_paired_predicates``.
        sys_df: System output, passed first to ``yield_paired_predicates``.

    Returns:
        Three ``Metrics`` objects built from the summed counts, in the order
        (unlabelled argument, labelled argument, unlabelled role).
    """
    # One row of three counters per metric family:
    # row 0 = unlabelled argument, row 1 = labelled argument,
    # row 2 = unlabelled role.
    totals = np.zeros((3, 3), dtype=np.float32)

    for _key, sys_roles, grt_roles in yield_paired_predicates(sys_df, grt_df):
        arg_score, qna_score, role_score = evaluate(sys_roles, grt_roles)
        # Iterating `totals` yields row views, so `+=` updates it in place.
        for row, score in zip(totals, (arg_score, qna_score, role_score)):
            row += np.array(score.as_tuple())

    arg_metrics, qna_metrics, role_metrics = (Metrics(*row) for row in totals)
    return arg_metrics, qna_metrics, role_metrics
    def dev_test(self, dev_x, dev_y, word2id, tag2id):
        """Predict tags for an evaluation set and print a score report.

        The data is split into batches with ``get_batches`` using
        ``self.bilstm.batch_size``. For every sequence only the first
        ``seq_len`` positions are collected, so positions beyond the recorded
        sequence length are ignored. Gold and predicted tag names are
        gathered into two flat lists and handed to ``Metrics``.

        Args:
            dev_x: Input sequences, forwarded to ``get_batches``.
            dev_y: Gold tag sequences, forwarded to ``get_batches``.
            word2id: Word vocabulary, forwarded to ``get_batches``.
            tag2id: Mapping from tag name to tag id; inverted here to turn
                ids back into tag names.

        Returns:
            None. The report is produced by ``Metrics.report_scores()``.
        """
        batches_x, batches_y, batches_seq_len = get_batches(
            dev_x, dev_y, word2id, tag2id, self.bilstm.batch_size)

        # Reverse lookup: tag id -> tag name.
        id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}

        gold_tags = []
        predicted_tags = []
        for batch_no, batch_x in enumerate(batches_x):
            batch_y = batches_y[batch_no]
            seq_lens = batches_seq_len[batch_no]
            batch_pred = self.pred_labels(batch_x, batch_y, seq_lens)

            for sent_no, sent_pred in enumerate(batch_pred):
                # Only the first seq_lens[sent_no] positions are scored.
                for pos in range(seq_lens[sent_no]):
                    predicted_tags.append(id2tag[sent_pred[pos]])
                    gold_tags.append(id2tag[batch_y[sent_no][pos]])

        report = Metrics(gold_tags, predicted_tags)
        report.report_scores()
def main():
    """Train four sequence taggers in turn and report each one's test scores.

    The models are run in this order: HMM, CRF, BiLSTM, BiLSTM-CRF. Each is
    trained on the 'train' split and scored on the 'test' split through
    ``Metrics(...).report_scores()`` (HMM, CRF) or the model's own
    ``dev_test`` (BiLSTM variants). The 'dev' split is only passed to the
    two BiLSTM-based models' ``train`` calls.
    """
    print('读取数据...')
    train_words, train_tags, word2id, tag2id = build_corpus('train')
    # NOTE(review): the keyword is spelled `maek_vocab` at this call site;
    # it has to match build_corpus's signature -- confirm it is not a typo
    # for `make_vocab`.
    dev_words, dev_tags = build_corpus('dev', maek_vocab=False)
    test_words, test_tags = build_corpus('test', maek_vocab=False)

    print('训练HMM模型...')
    hmm = HMMModel(len(tag2id), len(word2id))
    hmm.train(train_words, train_tags, word2id, tag2id)
    hmm_predictions = hmm.test(test_words, word2id, tag2id)
    hmm_report = Metrics(test_tags, hmm_predictions)
    hmm_report.report_scores()

    print('训练CRF模型...')
    crf = CRFModel(max_iterations=90)
    crf.train(train_words, train_tags)
    crf_predictions = crf.test(test_words)
    crf_report = Metrics(test_tags, crf_predictions)
    crf_report.report_scores()

    print('训练BiLSTM模型...')
    # The neural models use the vocabularies returned by extend_maps.
    word2id, tag2id = extend_maps(word2id, tag2id)
    # Both BiLSTM variants are trained with the same arguments; the meaning
    # of the trailing 0.8 is defined by the models' train() methods.
    train_args = (train_words, train_tags, dev_words, dev_tags,
                  word2id, tag2id, 0.8)
    eval_args = (test_words, test_tags, word2id, tag2id)

    bilstm = BiLSTM(len(word2id), len(tag2id))
    bilstm.train(*train_args)
    bilstm.dev_test(*eval_args)
    bilstm.close_sess()

    print('训练BiLSTM-CRF模型...')
    bilstm_crf = BiLSTM_CRF(len(word2id), len(tag2id))
    bilstm_crf.train(*train_args)
    bilstm_crf.dev_test(*eval_args)
    bilstm_crf.close_sess()
# Size of each word-embedding vector (used as Embedding output_dim below).
DIM = 256
# Passed as the first positional argument to each CuDNNGRU layer below.
filters = 256
# num_words and maxlen are not assigned in this part of the file; they must
# already be defined by the time this line runs.
print('num_words = {}, maxlen = {}'.format(num_words, maxlen))

# Dataset and labels
fact = np.load(r"../data/train_word_seg_data{}_numwords{}.npy".format(
    maxlen, num_words))
labels = np.load(r"../data/train_label_from_zero_onehot.npy")
# Hold out 10% of the samples; random_state=1 makes the split reproducible.
fact_train, fact_test, labels_train, labels_test = train_test_split(
    fact, labels, test_size=0.1, random_state=1)
# Drop the names of the full arrays and run a collection pass so only the
# split copies stay referenced from here on.
del labels
del fact
gc.collect()
print("data have been loaded")

# Constructed with no arguments; its use is not visible in this part of the
# file. NOTE(review): presumably a Keras callback passed to fit() -- confirm.
metrics = Metrics()
# Model input: one sequence of length maxlen per sample.
data_input = Input(shape=[maxlen])
# input_dim is num_words + 1; mask_zero is given as 0 (falsy).
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=DIM,
                     mask_zero=0,
                     name='Embedding')(data_input)
# Two stacked bidirectional GRU layers, both returning the full sequence.
x = Bidirectional(CuDNNGRU(filters, return_sequences=True))(word_vec)
x = Bidirectional(CuDNNGRU(filters, return_sequences=True))(x)
# Pool the sequence output, normalise, then project to one sigmoid output per
# label column (labels_train.shape[1]).
x = GlobalMaxPooling1D()(x)
x = BatchNormalization()(x)
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.summary()
# Rebinds `model` to the 2-GPU wrapper; the model built above is no longer
# reachable through this name after this line.
model = multi_gpu_model(model, gpus=2)
# Adam optimizer with learning rate 1e-4.
adam = keras.optimizers.adam(lr=0.0001)