Example no. 1
def assert_eval(input_entity_id, input_rel_id, input_entity_relation_value, input_text_raw):
    # Debug helper: decode predicted entity and relation ids for each sample
    # in the batch and print them next to the gold annotations.
    i_batch_num = input_entity_id.shape[0]
    for ib in range(i_batch_num):
        print(input_text_raw[ib])
        # Map BIO label ids back to tag strings, then collect entity spans.
        entity_id = [entity_bio_id2encoder[e] for e in input_entity_id[ib]]
        entity_e_list = extract_entity(entity_id)
        print(entity_e_list)
        # Index each entity by the position of its last token (end - 1).
        entity_map = {e_value[1] - 1: e_value for e_value in entity_e_list}
        print(entity_map)

        rel_matrix = input_rel_id[ib].numpy()
        rel_list = []
        for iv, o_rel_id_row in enumerate(rel_matrix):
            if iv not in entity_map:
                continue
            sub_iv = entity_map[iv]
            for jv, o_rel in enumerate(o_rel_id_row):
                # Skip "no relation", self-loops, and positions that do not
                # end an entity.
                if o_rel == 0:
                    continue
                if iv == jv:
                    continue
                if jv not in entity_map:
                    continue
                obj_jv = entity_map[jv]
                # (subject type, start, last index, object type, start,
                #  last index, relation id)
                one = (int(sub_iv[2]), sub_iv[0], sub_iv[1] - 1,
                       int(obj_jv[2]), obj_jv[0], obj_jv[1] - 1, o_rel)
                rel_list.append(one)
        print(rel_list)

        print(input_entity_relation_value[ib])
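
# All of these snippets rely on an extract_entity helper that is not shown.
# Below is a minimal sketch of such a BIO decoder, assuming "B-X"/"I-X"/"O"
# tag strings and (start, end_exclusive, type) output spans as consumed
# above; this is an assumption, not the original implementation.
def extract_entity(tags):
    spans, start, e_type = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            if start is not None:
                spans.append((start, i, e_type))
            start, e_type = i, tag[2:]
        elif tag.startswith("I-") and start is not None and tag[2:] == e_type:
            continue
        else:
            if start is not None:
                spans.append((start, i, e_type))
            start, e_type = None, None
    if start is not None:
        spans.append((start, len(tags), e_type))
    return spans

print(extract_entity(["B-PER", "I-PER", "O", "B-LOC"]))
# [(0, 2, 'PER'), (3, 4, 'LOC')]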
Example no. 2
def evaluation(input_sentence_list, input_y, input_model: PointerNetworkModel):
    # Map characters to ids; unknown characters fall back to id 1.
    input_sentence_id = [[char2id.get(c, 1) for c in sentence]
                         for sentence in input_sentence_list]

    eval_batch_num = 100
    # Number of batches; the "+ 1" covers a trailing partial batch, and the
    # empty-batch guard below skips the extra batch on exact multiples.
    batch_value = len(input_sentence_list) // eval_batch_num + 1
    predict_res = []
    for b in range(batch_value):
        batch_data = input_sentence_id[b * eval_batch_num:b * eval_batch_num +
                                       eval_batch_num]
        if len(batch_data) == 0:
            continue
        batch_data = tf.keras.preprocessing.sequence.pad_sequences(
            batch_data, padding="post", maxlen=max_len)
        start_logits, end_logits, mask = input_model(batch_data)
        # mask = tf.expand_dims(mask, -1)
        # Zero out argmax predictions on padding positions.
        mask = tf.cast(mask, dtype=tf.float32).numpy()
        start_logits_argmax = tf.argmax(start_logits, axis=-1).numpy() * mask
        end_logits_argmax = tf.argmax(end_logits, axis=-1).numpy() * mask

        for i, start_row in enumerate(start_logits_argmax):
            predict_row = []
            t_iv = 0  # first position not yet covered by an emitted span
            for j, start_v in enumerate(start_row):
                if j < t_iv:
                    continue
                if start_v == 0:
                    continue
                # Pair the start tag with the nearest end tag of the same
                # type at or after it; spans are (start, end_exclusive, tag).
                for k, end_v in enumerate(end_logits_argmax[i]):
                    if k < j:
                        continue
                    if start_v == end_v:
                        predict_row.append((j, k + 1, start_v))
                        t_iv = k + 1
                        break
            predict_res.append(predict_row)
    assert len(predict_res) == len(input_y)
    hit_num = 0.0
    predict_num = 0.0
    true_num = 0.0
    for i, predict_row in enumerate(predict_res):
        true_set = set([(si, ei, tag2id[e])
                        for si, ei, e in extract_entity(input_y[i])])
        predict_num += len(predict_row)
        true_num += len(true_set)
        print(predict_row)
        print(true_set)
        for p in predict_row:
            if p in true_set:
                hit_num += 1

    # Small epsilons keep the metrics defined when a count is zero.
    recall = (hit_num + 1e-8) / (true_num + 1e-3)
    precision = (hit_num + 1e-8) / (predict_num + 1e-3)
    f1_value = 2 * recall * precision / (recall + precision)
    print("recall {0}, precision {1} f1-value {2}".format(
        recall, precision, f1_value))
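
# Worked micro-example of the start/end decoding loop above, detached from
# the model: label ids mark span boundaries, 0 means "no boundary".
start_row = [2, 0, 0, 1, 0, 0]   # a type-2 span starts at 0, a type-1 at 3
end_row = [0, 2, 0, 0, 0, 1]     # matching ends at indices 1 and 5
spans, t_iv = [], 0
for j, s in enumerate(start_row):
    if j < t_iv or s == 0:
        continue
    for k, e in enumerate(end_row):
        if k >= j and s == e:
            spans.append((j, k + 1, s))   # end index stored exclusively
            t_iv = k + 1
            break
print(spans)   # [(0, 2, 2), (3, 6, 1)]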
Example no. 3
def evaluation(input_char_id, input_word_id, input_entity_relation_value, input_model):
    # Decode entity and relation logits into ids for the whole batch.
    o_entity_logits, o_rel_logits, _ = input_model(input_char_id, input_word_id)
    o_entity_id = tf.argmax(o_entity_logits, axis=-1)
    o_rel_ids = tf.argmax(o_rel_logits, axis=-1)
    i_batch_num = o_entity_logits.shape[0]
    hit_num = 0.0
    real_count = 0.0
    predict_count = 0.0
    for ib in range(i_batch_num):
        entity_id = [entity_bio_id2encoder[e] for e in o_entity_id[ib].numpy()]
        entity_e_list = extract_entity(entity_id)
        print("entity_num {}".format(len(entity_e_list)))

        # Index each entity by the position of its last token (end - 1).
        entity_map = {e_value[1] - 1: e_value for e_value in entity_e_list}
        # print("entity", entity_e_list)
        o_rel_id = o_rel_ids[ib].numpy()
        rel_list = []
        real_count += len(input_entity_relation_value[ib])

        real_data_set = set(input_entity_relation_value[ib])
        for iv, o_rel_id_row in enumerate(o_rel_id):
            if iv not in entity_map:
                continue
            sub_iv = entity_map[iv]
            for jv, o_rel in enumerate(o_rel_id_row):
                if o_rel == 0:
                    continue
                if iv == jv:
                    continue
                if jv not in entity_map:
                    continue
                obj_jv = entity_map[jv]
                # Keep only (subject type, relation, object type) triples
                # licensed by the schema.
                if (int(sub_iv[2]), o_rel, int(obj_jv[2])) not in triple_regularity:
                    continue
                one = (int(sub_iv[2]), sub_iv[0], sub_iv[1] - 1,
                       int(obj_jv[2]), obj_jv[0], obj_jv[1] - 1, o_rel)
                rel_list.append(one)
                predict_count += 1
                if one in real_data_set:
                    hit_num += 1
        print("relation", rel_list)
        print("real", input_entity_relation_value[ib])
    res = {
        "hit_num": hit_num,
        "real_count": real_count,
        "predict_count": predict_count
    }

    return res
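
# A sketch of turning the returned counts into corpus-level metrics, using
# the same epsilon smoothing as the pointer-network evaluation above;
# eval_batches and model are hypothetical names, not from the original.
totals = {"hit_num": 0.0, "real_count": 0.0, "predict_count": 0.0}
for char_id, word_id, gold_relations in eval_batches:
    res = evaluation(char_id, word_id, gold_relations, model)
    for k in totals:
        totals[k] += res[k]
recall = (totals["hit_num"] + 1e-8) / (totals["real_count"] + 1e-3)
precision = (totals["hit_num"] + 1e-8) / (totals["predict_count"] + 1e-3)
f1_value = 2 * recall * precision / (recall + precision)
print("recall {0}, precision {1} f1-value {2}".format(recall, precision, f1_value))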
Example no. 4
def sent2features(sent):
    # Assumed function header: the original excerpt begins mid-function with
    # this return statement, and sent2features is called below.
    return [word2features(sent, i) for i in range(len(sent))]


X_train = [sent2features(s) for s in sentence_list]
y_train = [sent2labels(s) for s in label_list]
crf_mode = CRFNerModel()
crf_mode.fit(X_train, y_train)

for ti, test_sentence_in in enumerate(test_sentence_list):
    X_test = [sent2features(s) for s in test_sentence_in]

    predict_labels = crf_mode.predict_list(X_test)

    for ii, p_label in enumerate(predict_labels):
        # Decode the predicted BIO tags and print every span in context.
        pred_entity = extract_entity(p_label)
        # print(test_sentence_list[1][ii])

        for s, e, _ in pred_entity:
            print("contract num {} sentence num {}".format(ti, ii),
                  test_sentence_in[ii])
            print("contract num {} info".format(ti), test_sentence_in[ii][s:e])

schema_list = [{
    "entity_name": "owner_subject",
    "entity_cn_name": "甲方"  # "Party A"
}, {
    "entity_name": "contract_name",
    "entity_cn_name": "合同名称"  # "contract name"
}, {
    "entity_name": "other_subject",
Example no. 5
    def predict(self,
                inputs,
                batch_size=None,
                verbose=0,
                steps=None,
                callbacks=None,
                max_queue_size=10,
                workers=1,
                use_multiprocessing=False):
        # Keras-style predict signature; only `inputs` is used below.

        i_batch_num = inputs.shape[0]
        input_id = self.embed(inputs)

        # Event path: pooled features -> LSTM -> multi-label event scores.
        sentence_maxpool = tf.reduce_max(input_id, axis=1)
        sentence_feature = self.event_lstm_layer(sentence_maxpool)
        sentence_feature = tf.reshape(sentence_feature, (i_batch_num, -1))
        # Entity path: an LSTM tagger applied to each batch element.
        sentence_entity_feature = tf.map_fn(
            lambda x: self.entity_lstm_layer(x), input_id, dtype=tf.float32)
        sentence_entity_label = self.entity_output(sentence_entity_feature)
        event_label = self.event_classifier(sentence_feature)

        batch_res = []
        pg_entity_label = tf.argmax(sentence_entity_label, axis=-1)
        for bi, pg_inner_label in enumerate(pg_entity_label):
            sentence_value = input_id[bi]
            batch_event_label = event_label[bi]
            event_with_argument_list = []
            batch_event_label_value = batch_event_label.numpy()
            # Events whose sigmoid score exceeds 0.5 are predicted present.
            batch_event_res = [
                i for i, bel in enumerate(batch_event_label_value) if bel > 0.5
            ]

            # Index 0 holds a dummy "empty" entity so that role slots with
            # no candidate can still be filled with a placeholder.
            entity_mask = [[0 for _ in range(max_len)]]
            entity_list = [(0, 0, 0, 0)]
            entity_sentence_loc = [0]
            pg_inner_label = pg_inner_label.numpy()
            for si, sentence_inner_label in enumerate(pg_inner_label):
                # Map label ids back to tag strings for this sentence.
                sentence_inner_label_value = [
                    id2entity_label[li] for li in sentence_inner_label
                ]
                pred_entity = extract_entity(sentence_inner_label_value)

                for ss, se, ee in pred_entity:
                    # Binary mask over the token positions the span covers.
                    entity_one_mask = [
                        1 if ss <= vi < se else 0 for vi in range(max_len)
                    ]
                    entity_mask.append(entity_one_mask)
                    entity_list.append((si, ss, se, argument_role2id[ee]))
                    entity_sentence_loc.append(si)

            entity_mask_value = tf.cast(entity_mask, dtype=tf.float32)
            entity_mask_value = tf.expand_dims(entity_mask_value, axis=-1)
            entity_sentence_loc = tf.cast(entity_sentence_loc, dtype=tf.int64)

            # Gather each entity's sentence representation, zero out tokens
            # outside the span, and max-pool to one vector per entity.
            entity_loc_sentence = tf.gather(sentence_value,
                                            entity_sentence_loc)
            entity_feature = tf.multiply(entity_loc_sentence,
                                         entity_mask_value)
            entity_feature = tf.reduce_max(entity_feature, axis=1)

            event_info_collect = dict()
            event_path_feature = []
            event_id_list = []

            def func(input_dict, input_path, ind, input_e_id):
                # Depth-first expansion over role slots: one entity index per
                # slot; roles with no candidate contribute the dummy entity 0.
                if len(input_dict) == ind:
                    event_path_feature.append(input_path)
                    event_id_list.append(input_e_id)
                    return
                if not input_dict[ind]:
                    func(input_dict, input_path + [0], ind + 1, input_e_id)
                else:
                    for inner_ind in input_dict[ind]:
                        func(input_dict, input_path + [inner_ind], ind + 1,
                             input_e_id)

            for ei in batch_event_res:
                # Collect candidate entities for each role of event type ei,
                # then enumerate every role-filling combination via func.
                argument_role = event2argument[ei]
                event_info_collect.setdefault(
                    ei, {ini: []
                         for ini in range(len(argument_role))})
                for eii, et in enumerate(entity_list):
                    if et[3] in argument_role:
                        event_info_collect[ei][argument_role[et[3]]].append(
                            eii)

                func(event_info_collect[ei], [], 0, ei)
            if event_id_list:
                event_id_list_embed = tf.cast(event_id_list, dtype=tf.int64)
                event_path_feature_pad = tf.keras.preprocessing.sequence.pad_sequences(
                    event_path_feature,
                    padding="post",
                    maxlen=max_argument_len)
                event_argument_feature = tf.map_fn(
                    lambda x: tf.gather(entity_feature, x),
                    event_path_feature_pad,
                    dtype=tf.float32)
                event_argument_feature = tf.reshape(
                    event_argument_feature,
                    (event_argument_feature.shape[0], -1))
                event_id_feature = self.event_embed(event_id_list_embed)

                event_feature = tf.concat(
                    [event_id_feature, event_argument_feature], axis=1)

                event_trigger_label = self.event_is_valid(event_feature)

                event_trigger_label_value = event_trigger_label.numpy()
                for i, e_trigger_value in enumerate(event_trigger_label_value):
                    # Keep only argument combinations that the validity
                    # classifier scores above 0.5.
                    if e_trigger_value[0] > 0.5:
                        event_path_feature_value = [
                            entity_list[ev] for ev in event_path_feature[i]
                        ]
                        event_with_argument_list.append(
                            (event_id_list[i], event_path_feature_value))

            batch_res.append(event_with_argument_list)
        return batch_res
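
# Worked micro-example of the recursive path expansion in predict(): a
# hypothetical event with two role slots, where role 0 has two candidate
# entities (indices 3 and 5) and role 1 has none (dummy index 0).
paths, ids = [], []

def expand(role_candidates, path, ind, event_id):
    if ind == len(role_candidates):
        paths.append(path)
        ids.append(event_id)
        return
    for cand in role_candidates[ind] or [0]:
        expand(role_candidates, path + [cand], ind + 1, event_id)

expand({0: [3, 5], 1: []}, [], 0, event_id=7)
print(paths)   # [[3, 0], [5, 0]]
print(ids)     # [7, 7]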
Example no. 6
        if s not in char2id:
            # Grow the character vocabulary on the fly.
            char2id[s] = len(char2id)
        sentence_id.append(char2id[s])
    if len(sentence_id) > max_len:
        max_len = len(sentence_id)
    msra_train_id.append(sentence_id)

tag2id = {"pad": 0}
tag_list = msra_data.train_tag_list
msra_start_data = []
msra_end_data = []
msra_span_data = []
for tag in tag_list:
    # Pointer-style labels: mark each entity's first token in tag_start and
    # its last token in tag_end with the entity-type id.
    tag_start = np.zeros(len(tag))
    tag_end = np.zeros(len(tag))
    et = extract_entity(tag)
    for si, ei, e in et:
        if e not in tag2id:
            tag2id[e] = len(tag2id)
        tag_start[si] = tag2id[e]
        tag_end[ei - 1] = tag2id[e]
    msra_start_data.append(tag_start)
    msra_end_data.append(tag_end)

train_data = tf.keras.preprocessing.sequence.pad_sequences(msra_train_id,
                                                           padding="post",
                                                           maxlen=max_len)
label_start_data = tf.keras.preprocessing.sequence.pad_sequences(
    msra_start_data, padding="post", maxlen=max_len)
label_end_data = tf.keras.preprocessing.sequence.pad_sequences(msra_end_data,
                                                               padding="post",
                                                               maxlen=max_len)
Example no. 7
        if len(test_train) == 0:
            continue

        test_train_dataset, (train_data_t, label_data_t,
                             mask_data_t) = generate(test_train_data,
                                                     test_train_label)

        # 4 label ids: O / B-E / I-E / pad (see id2label below).
        att_model = ATTBertModel(bert_model_name, 4)
        att_model.fit(train_data_t, label_data_t, mask_data_t)

        # _, (train_data_v, label_data_v, mask_data_v) = generate(test_eval_data, test_eval_label)
        logits, text_lens = att_model.predict(train_data_t, mask_data_t)
        print(label_data_t)
        paths = []
        for logit, text_len in zip(logits, text_lens):
            # Viterbi-decode each sequence up to its true (unpadded) length.
            viterbi_path, _ = viterbi_decode(logit[:text_len],
                                             att_model.transition_params)
            paths.append(viterbi_path)
        # Both id 0 and the padding id 3 decode to "O".
        id2label = {0: "O", 1: "B-E", 2: "I-E", 3: "O"}
        paths2label = [[id2label[p] for p in path] for path in paths]
        extract_info = [[(ee[0], test_train_data[i][ee[0]:ee[1]])
                         for ee in extract_entity(path)]
                        for i, path in enumerate(paths2label)]
        print(extract_info)

        print(hard_score_res_v2(test_train_label, extract_info))

    break
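
# Standalone micro-example of the decode step, assuming viterbi_decode comes
# from tensorflow_addons.text (the add-ons port of tf.contrib.crf): it takes
# per-step emission scores and a tag-transition matrix.
import numpy as np
from tensorflow_addons.text import viterbi_decode
score = np.array([[2.0, 0.0], [0.0, 2.0], [2.0, 0.0]])  # [seq_len, num_tags]
trans = np.array([[1.0, -1.0], [-1.0, 1.0]])            # [num_tags, num_tags]
path, path_score = viterbi_decode(score, trans)
print(path)        # [0, 0, 0]: transitions outweigh the middle emission
print(path_score)  # 6.0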