Ejemplo n.º 1
0
 def __init__(self,
              entity2relations_dict='data/entity2relations_dict.pkl',
              seqPair2similarity_dict='data/seqPair2similarity_dict.pkl'):
     """Load both cache dictionaries and create the similarity model.

     Args:
         entity2relations_dict: Path handed to ``self._load_dict`` for the
             entity-to-relations cache; also remembered on the instance.
         seqPair2similarity_dict: Path handed to ``self._load_dict`` for
             the sequence-pair similarity cache; also remembered.
     """
     relations_path = entity2relations_dict
     similarity_path = seqPair2similarity_dict

     # Caches are loaded in the same order as before: relations first.
     self._entity2relations = self._load_dict(relations_path)
     self._seqPair2similarity = self._load_dict(similarity_path)

     # Keep the paths so the caches can be written back later.
     self._relation_paths_dict_path = relations_path
     self._similarity_dict_path = similarity_path

     # The model is only used for inference, so switch it to PREDICT.
     self._model = BertSim()
     setattr(self._model, 'mode', tf.estimator.ModeKeys.PREDICT)
Ejemplo n.º 2
0
    def __init__(self):
        """Load the fine-tuned similarity model and its tokenizer.

        The model is placed on the first CUDA device when one is available
        and on the CPU otherwise, so construction no longer fails on hosts
        without a GPU.
        """
        # Load the fine-tuned text-matching model.
        self.simmer = BertSim()
        self.tokenizer = BertTokenizer.from_pretrained(BERT_ID)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # map_location remaps the checkpoint tensors onto the chosen device,
        # which is required when a GPU-saved checkpoint is read on a CPU host.
        state_dict = torch.load('../data/model/similarity.pt',
                                map_location=self.device)
        self.simmer.load_state_dict(state_dict)
        self.simmer.to(self.device)
        print('bert相似度匹配模型加载完成')

        print('tuple extractor loaded')
Ejemplo n.º 3
0
 def __init__(self):
     """Restore the pickle caches (if readable) and set up the model.

     Each cache falls back to an empty dict when its file cannot be
     opened or unpickled. The similarity model is switched to PREDICT
     mode.
     """
     # NOTE: pickle.load can execute arbitrary code, so these cache files
     # must come from a trusted source.
     # ``except Exception`` keeps the best-effort fallback but, unlike a
     # bare ``except``, lets KeyboardInterrupt/SystemExit propagate.
     try:
         with open('../data/entity2relation_dic.pkl', 'rb') as cache_file:
             self.entity2relations_dic = pickle.load(cache_file)
     except Exception:
         self.entity2relations_dic = {}
     try:
         with open('../data/sentencepair2sim_dic.pkl', 'rb') as cache_file:
             self.sentencepair2sim = pickle.load(cache_file)
     except Exception:
         self.sentencepair2sim = {}
     self.simmer = BertSim()
     self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
     print('tuples extractor loaded')
 def __init__(self):
     """Restore the entity-relation cache and set up the similarity model.

     The cache falls back to an empty dict when its file cannot be opened
     or unpickled. The similarity model is switched to PREDICT mode.
     """
     # Load the cache. NOTE: pickle.load can execute arbitrary code, so the
     # cache file must come from a trusted source.
     # ``except Exception`` keeps the best-effort fallback but, unlike a
     # bare ``except``, lets KeyboardInterrupt/SystemExit propagate.
     try:
         with open('../data/entity2relation_dic.pkl', 'rb') as cache_file:
             self.entity2relations_dic = pickle.load(cache_file)
     except Exception:
         self.entity2relations_dic = {}

     # Load the fine-tuned, TensorFlow-based text-matching model.
     self.simmer = BertSim()
     self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
     print('bert相似度匹配模型加载完成')
     # The simple/complex question classifier is currently disabled:
     #self.question_classify_model = get_model()
     # The message below is still printed although nothing is loaded.
     print('问题分类模型加载完成')
     print('tuples extractor loaded')
Ejemplo n.º 5
0
class TupleExtractor(object):
    '''Build candidate (entity, relation, ...) tuples and score them.

    Similarity scores are memoised in ``self.sentencepair2sim`` under the
    key ``question + generated_sentence``; both caches are written back to
    disk at the end of :meth:`GetCandidateAns`.
    '''

    _ENTITY_CACHE_PATH = '../data/entity2relation_dic.pkl'
    _SIM_CACHE_PATH = '../data/sentencepair2sim_dic.pkl'

    def __init__(self):
        '''Load both pickle caches and create the model in PREDICT mode.'''
        self.entity2relations_dic = self._load_cache(self._ENTITY_CACHE_PATH)
        self.sentencepair2sim = self._load_cache(self._SIM_CACHE_PATH)
        self.simmer = BertSim()
        self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
        print('tuples extractor loaded')

    @staticmethod
    def _load_cache(path):
        '''Return the object pickled at *path*, or ``{}`` if unreadable.'''
        # NOTE: pickle.load can execute arbitrary code; only trusted cache
        # files may be placed at these paths.
        # ``except Exception`` keeps the best-effort fallback without
        # swallowing KeyboardInterrupt/SystemExit like a bare ``except``.
        try:
            with open(path, 'rb') as cache_file:
                return pickle.load(cache_file)
        except Exception:
            return {}

    @staticmethod
    def _dump_cache(cache, path):
        '''Pickle *cache* to *path*, closing the file afterwards.'''
        with open(path, 'wb') as cache_file:
            pickle.dump(cache, cache_file)

    @staticmethod
    def _ratio(numerator, denominator):
        '''Return ``numerator / denominator``, or 0.0 for a zero denominator.'''
        return numerator / denominator if denominator else 0.0

    def extract_tuples(self, candidate_entitys, question):
        '''Generate and score every candidate tuple for the given entities.

        Args:
            candidate_entitys: Mapping from entity to a sequence whose
                first item is used as the mention text.
            question: The question text.

        Returns:
            Dict mapping ``(entity, *relation_path)`` to the feature list
            ``[entity, mention, similarity]``.
        '''
        candidate_tuples = {}

        for entity in candidate_entitys:
            starttime = time.time()

            # All relation paths for this entity.
            relations = GetRelationPaths(entity)

            mention = candidate_entitys[entity][0]
            for r in relations:
                # Candidate tuple: the entity followed by its relation path.
                this_tuple = tuple([entity] + r)
                # Relation names with their first and last character
                # removed (presumably enclosing brackets; confirm).
                predicates = [relation[1:-1] for relation in r]

                # Synthetic sentence "<mention>的<rel1>的<rel2>...".
                human_question = '的'.join([mention] + predicates)

                # Only a missing key triggers a model call; the score is
                # stored once, right after it is computed.
                cache_key = question + human_question
                try:
                    sim2 = self.sentencepair2sim[cache_key]
                except KeyError:
                    sim2 = self.simmer.predict(question, human_question)[0][1]
                    self.sentencepair2sim[cache_key] = sim2

                # Features: entity, its first stored feature, similarity.
                score = [entity] + list(candidate_entitys[entity][0:1])
                score.append(sim2)

                candidate_tuples[this_tuple] = score
            print('====查询候选关系并计算特征耗费%.2f秒====' % (time.time() - starttime))

        return candidate_tuples

    def GetCandidateAns(self, corpus):
        '''Attach candidate tuples to every sample and report coverage.

        Starting from each sample's filtered candidate entities, candidate
        answers are retrieved as tuples ``(entity, relation1, relation2)``
        so they can be compared with the gold tuple. The result is stored
        under ``'candidate_tuples'`` in each sample, coverage statistics
        are printed and both caches are pickled to disk.

        Args:
            corpus: Sequence of dicts with the keys ``'question'``,
                ``'gold_tuple'``, ``'gold_entitys'`` and
                ``'candidate_entity_filter'``.

        Returns:
            The same *corpus* object, updated in place.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0
        for i, dic in enumerate(corpus):
            question = dic['question']
            gold_tuple = dic['gold_tuple']
            gold_entitys = dic['gold_entitys']
            candidate_entitys = dic['candidate_entity_filter']

            candidate_tuples = self.extract_tuples(candidate_entitys, question)
            print(i)
            print(question)
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples

            # Is the gold tuple contained in one of the candidate tuples?
            # NOTE: the length comparison means a gold tuple with repeated
            # elements can never match; this is unchanged from before.
            gold_set = set(gold_tuple)
            covered = any(
                len(gold_tuple) == len(gold_set.intersection(thistuple))
                for thistuple in candidate_tuples)
            single_entity = len(gold_tuple) <= 3 and len(gold_entitys) == 1

            if covered:
                true_num += 1
            if single_entity:
                hop2_num += 1
                if covered:
                    hop2_true_num += 1

        # _ratio avoids ZeroDivisionError for an empty corpus or a corpus
        # without single-entity questions.
        print('所有问题里,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(true_num, len(corpus)))
        print('单实体问题中,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(hop2_true_num, hop2_num))
        print('平均每个问题的候选答案数量为:%.3f' %
              self._ratio(all_tuples_num, len(corpus)))
        self._dump_cache(self.entity2relations_dic, self._ENTITY_CACHE_PATH)
        self._dump_cache(self.sentencepair2sim, self._SIM_CACHE_PATH)
        return corpus
Ejemplo n.º 6
0
class TupleExtractor(object):
    '''Build candidate (entity, relation, ...) tuples and score them in batches.

    All candidate paths of a question are turned into sentence pairs, sent
    to the similarity model through its input/output queues in batches and
    the resulting scores are attached to the candidate tuples.
    '''

    _ENTITY_CACHE_PATH = '../data/entity2relation_dic.pkl'

    def __init__(self):
        '''Load the entity-relation cache and create the model in PREDICT mode.'''
        # Load the cache. NOTE: pickle.load can execute arbitrary code; only
        # a trusted cache file may be placed at this path.
        # ``except Exception`` keeps the best-effort fallback without
        # swallowing KeyboardInterrupt/SystemExit like a bare ``except``.
        try:
            with open(self._ENTITY_CACHE_PATH, 'rb') as cache_file:
                self.entity2relations_dic = pickle.load(cache_file)
        except Exception:
            self.entity2relations_dic = {}

        # Load the fine-tuned, TensorFlow-based text-matching model.
        self.simmer = BertSim()
        self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
        print('bert相似度匹配模型加载完成')
        # The simple/complex question classifier is currently disabled:
        #self.question_classify_model = get_model()
        # The message below is still printed although nothing is loaded.
        print('问题分类模型加载完成')
        print('tuples extractor loaded')

    @staticmethod
    def _ratio(numerator, denominator):
        '''Return ``numerator / denominator``, or 0.0 for a zero denominator.'''
        return numerator / denominator if denominator else 0.0

    def extract_tuples(self, candidate_entitys, question):
        '''Generate and score every candidate tuple for the given entities.

        Args:
            candidate_entitys: Mapping from entity to a list whose first
                item is used as the mention text.
            question: The question text.

        Returns:
            Dict mapping ``(entity, *relation_path)`` to the feature list
            ``[entity] + candidate_entitys[entity] + [similarity]``.
        '''
        candidate_tuples = {}

        # Query the relation paths exactly once per entity and reuse the
        # result for both the model inputs and the output tuples, so the
        # i-th score always belongs to the i-th path.
        entity_relations = [(entity, list(GetRelationPaths(entity)))
                            for entity in candidate_entitys]

        # Sentence pairs for every candidate path, in path order.
        inputs = []
        for entity, relations in entity_relations:
            mention = candidate_entitys[entity][0]
            for r in relations:
                # Relation names with their first and last character
                # removed (presumably enclosing brackets; confirm).
                predicates = [relation[1:-1] for relation in r]
                human_question = '的'.join([mention] + predicates)
                inputs.append((question, human_question))

        # Feed all paths to the model batch by batch to obtain the scores.
        print('====共有{}个候选路径===='.format(len(inputs)))
        bert_scores = []
        batch_size = 128
        starttime = time.time()
        for begin in range(0, len(inputs), batch_size):
            self.simmer.input_queue.put(inputs[begin:begin + batch_size])
            prediction = self.simmer.output_queue.get()
            # Column 1 of each prediction row is used as the score.
            bert_scores.extend(row[1] for row in prediction)
        print('====为所有路径计算特征耗费%.2f秒====' % (time.time() - starttime))

        index = 0
        for entity, relations in entity_relations:
            for r in relations:
                # Candidate tuple: the entity followed by its relation path.
                this_tuple = tuple([entity] + r)
                score = [entity] + candidate_entitys[entity]
                score.append(bert_scores[index])
                index += 1
                candidate_tuples[this_tuple] = score
            print('====得到实体%s的所有候选路径及其特征====' % (entity))

        return candidate_tuples

    def GetCandidateAns(self, corpus):
        '''Attach candidate tuples to every sample and report coverage.

        Starting from each sample's filtered candidate entities, candidate
        answers are retrieved as tuples ``(entity, relation1, relation2)``
        so they can be compared with the gold tuple. The result is stored
        under ``'candidate_tuples'`` in each sample, coverage statistics
        are printed and the entity-relation cache is pickled to disk.

        Args:
            corpus: Sequence of dicts with the keys ``'question'``,
                ``'gold_tuple'``, ``'gold_entitys'`` and
                ``'candidate_entity_filter'``.

        Returns:
            The same *corpus* object, updated in place.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0
        for i, dic in enumerate(corpus):
            question = dic['question']
            gold_tuple = dic['gold_tuple']
            gold_entitys = dic['gold_entitys']
            candidate_entitys = dic['candidate_entity_filter']
            print(i)
            print(question)
            candidate_tuples = self.extract_tuples(candidate_entitys, question)
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples

            # Is the gold tuple contained in one of the candidate tuples?
            # NOTE: the length comparison means a gold tuple with repeated
            # elements can never match; this is unchanged from before.
            gold_set = set(gold_tuple)
            covered = any(
                len(gold_tuple) == len(gold_set & set(thistuple))
                for thistuple in candidate_tuples)
            single_entity = len(gold_tuple) <= 3 and len(gold_entitys) == 1

            if covered:
                true_num += 1
            if single_entity:
                hop2_num += 1
                if covered:
                    hop2_true_num += 1

        # _ratio avoids ZeroDivisionError for an empty corpus or a corpus
        # without single-entity questions.
        print('所有问题里,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(true_num, len(corpus)))
        print('单实体问题中,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(hop2_true_num, hop2_num))
        print('平均每个问题的候选答案数量为:%.3f' %
              self._ratio(all_tuples_num, len(corpus)))
        with open(self._ENTITY_CACHE_PATH, 'wb') as cache_file:
            pickle.dump(self.entity2relations_dic, cache_file)
        return corpus
Ejemplo n.º 7
0
import numpy as np
import pandas as pd
import urllib.request
import urllib.parse
import tensorflow as tf
from db import load_data_kudu
from global_config import Logger

sys.path.append('/home/mqq/zwshi/bert/')
from similarity import BertSim
# 模块导入 https://blog.csdn.net/xiongchengluo1129/article/details/80453599

# Module-level logger created from the project Logger with the file name
# "recommend_articles.log" and the level string "info".
loginfo = Logger("recommend_articles.log", "info")
# Path of the tab-separated file that dataset_test() reads line by line.
# NOTE(review): the name shadows the (Python 2) builtin ``file``.
file = "./NERdata/q_t_a_testing_predict.txt"

# Shared BertSim instance, created and switched to PREDICT mode at import time.
bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.PREDICT)


def dataset_test():
    '''
    Use the entity + attribute from the training question-answer pairs to
    query the knowledge base and measure the upper bound on QA accuracy.

    NOTE(review): only the beginning of this function is visible here
    (counter setup and per-line field splitting); the description above is
    translated from the original docstring and should be confirmed against
    the full body.
    :return:
    '''
    with open(file) as f:
        # Counters; they are not updated in the portion shown here.
        total = 0
        recall = 0
        correct = 0

        for line in f:
            # Each line must hold exactly five tab-separated fields, otherwise
            # this unpacking raises ValueError. The line is not stripped, so
            # the last field keeps any trailing newline.
            question, entity, attribute, answer, ner = line.split("\t")
Ejemplo n.º 8
0
class TupleExtractor(object):
    '''Build candidate (entity, relation, ...) tuples and score them.

    Scoring uses a PyTorch similarity model loaded from
    ``../data/model/similarity.pt`` together with its tokenizer.
    '''

    def __init__(self):
        '''Load the fine-tuned similarity model and its tokenizer.

        The model is placed on the first CUDA device when one is available
        and on the CPU otherwise, so construction no longer fails on hosts
        without a GPU.
        '''
        # Load the fine-tuned text-matching model.
        self.simmer = BertSim()
        self.tokenizer = BertTokenizer.from_pretrained(BERT_ID)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # map_location remaps the checkpoint tensors onto the chosen device,
        # which is required when a GPU-saved checkpoint is read on a CPU host.
        state_dict = torch.load('../data/model/similarity.pt',
                                map_location=self.device)
        self.simmer.load_state_dict(state_dict)
        self.simmer.to(self.device)
        print('bert相似度匹配模型加载完成')

        print('tuple extractor loaded')

    @staticmethod
    def _ratio(numerator, denominator):
        '''Return ``numerator / denominator``, or 0.0 for a zero denominator.'''
        return numerator / denominator if denominator else 0.0

    def extract_tuples(self, candidate_entitys, question, entity2relations):
        '''Generate and score every candidate tuple for the given entities.

        Args:
            candidate_entitys: Mapping from entity to a list whose first
                item is used as the mention text.
            question: The question text.
            entity2relations: Mapping from entity to its relation paths
                (each path a list of relation names).

        Returns:
            Dict mapping ``(entity, *relation_path)`` to the feature list
            ``[entity] + candidate_entitys[entity] + [similarity]``.
        '''
        candidate_tuples = {}
        count, st = 0, time.time()

        for entity, entity_feats in candidate_entitys.items():
            mention = entity_feats[0]
            for r in entity2relations[entity]:
                # Relation names with their first and last character
                # removed (presumably enclosing brackets; confirm).
                predicates = [relation[1:-1] for relation in r]
                human_question = '的'.join([mention] + predicates)
                logits = predict(self.simmer, self.tokenizer, self.device,
                                 question, human_question)
                # Element [0][1] of the model output is used as the score.
                sim = logits[0][1].item()

                # Key: the entity followed by its one- or two-relation path.
                this_tuple = tuple([entity] + r)
                # Value: [entity, mention, feats..., similarity].
                candidate_tuples[this_tuple] = (
                    [entity] + entity_feats + [sim])
                count += 1

        print('====共有{}个候选路径===='.format(count))
        print('====为所有路径计算特征耗费%.2f秒====' % (time.time() - st))

        return candidate_tuples

    def get_candidate_ans(self, corpus):
        '''Attach candidate tuples to every sample and report coverage.

        Starting from each sample's filtered candidate entities, candidate
        answers are retrieved as tuples ``(entity, relation1, relation2)``
        so they can be compared with the gold tuple. Relation paths for all
        samples are fetched first; then each sample is scored, the result
        is stored under ``'candidate_tuples'`` and coverage statistics are
        printed.

        Args:
            corpus: List of dicts with the keys ``'question'``,
                ``'gold_entities'``, ``'gold_relations'`` and
                ``'candidate_entity_filter'``.

        Returns:
            The same *corpus* object, updated in place.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0

        # Phase 1: look up the relation paths of every candidate entity.
        relation_list, st = [], time.time()
        for i, item in enumerate(corpus):
            print(i)
            entity_relation = dict()
            for e in item['candidate_entity_filter']:
                ret = get_relation_paths(e)
                entity_relation[e] = ret
                print('实体: %s查找到%d候选路径' % (e, len(ret)))
            relation_list.append(entity_relation)
            print()
        print('查询时间开销:%.2fs' % (time.time() - st))

        # Phase 2: score the candidate tuples and collect the statistics.
        for i, dic in enumerate(corpus):
            question = dic['question']
            gold_entities = dic['gold_entities']
            gold_relations = dic['gold_relations']
            gold_tuple = tuple(gold_entities + gold_relations)
            candidate_entitys = dic['candidate_entity_filter']
            print(i)
            print(question)
            candidate_tuples = self.extract_tuples(candidate_entitys, question,
                                                   relation_list[i])
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples
            # A no-op for a plain list; kept in case corpus[i] yields a copy.
            corpus[i] = dic

            # Is the gold tuple contained in one of the candidate tuples?
            # NOTE: the length comparison means a gold tuple with repeated
            # elements can never match; this is unchanged from before.
            gold_set = set(gold_tuple)
            covered = any(
                len(gold_tuple) == len(gold_set & set(thistuple))
                for thistuple in candidate_tuples)
            single_entity = len(gold_tuple) <= 3 and len(gold_entities) == 1

            if covered:
                true_num += 1
            if single_entity:
                hop2_num += 1
                if covered:
                    hop2_true_num += 1

        # _ratio avoids ZeroDivisionError for an empty corpus or a corpus
        # without single-entity questions.
        print('所有问题里,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(true_num, len(corpus)))
        print('单实体问题中,候选答案能覆盖标准查询路径的比例为:%.3f' %
              self._ratio(hop2_true_num, hop2_num))
        print('平均每个问题的候选答案数量为:%.3f' %
              self._ratio(all_tuples_num, len(corpus)))

        return corpus
Ejemplo n.º 9
0
from similarity import BertSim
import tensorflow as tf

# Create the BertSim model, switch it to TRAIN mode and call its train()
# entry point. These statements run at import time.
bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.TRAIN)
bs.train()
Ejemplo n.º 10
0
class AnswerCandidate(Candidate):
    """Generate candidate answers (entity plus relation path) for questions.

    Relation paths are read from an entity-to-relations dict and every
    candidate is scored with the similarity model; scores are memoised in
    a dict keyed by the concatenation of the two compared sequences.
    Neither dict is written back automatically: call
    :meth:`cache_similarity_query` / :meth:`cache_relation_paths` to
    persist them.
    """

    def __init__(self,
                 entity2relations_dict='data/entity2relations_dict.pkl',
                 seqPair2similarity_dict='data/seqPair2similarity_dict.pkl'):
        """Load both dicts via ``self._load_dict`` and create the model.

        Args:
            entity2relations_dict: Path of the entity-to-relations dict;
                also used by :meth:`cache_relation_paths`.
            seqPair2similarity_dict: Path of the similarity cache; also
                used by :meth:`cache_similarity_query`.
        """
        self._entity2relations = self._load_dict(entity2relations_dict)
        self._seqPair2similarity = self._load_dict(seqPair2similarity_dict)
        self._similarity_dict_path = seqPair2similarity_dict
        self._relation_paths_dict_path = entity2relations_dict
        self._model = BertSim()
        self._model.mode = tf.estimator.ModeKeys.PREDICT

    def _similarity_of(self, faked, seq):
        """Return the (memoised) model prediction for the pair (faked, seq)."""
        k = faked + seq
        if k not in self._seqPair2similarity:
            self._seqPair2similarity[k] = self._model.predict(faked, seq)
        return self._seqPair2similarity[k]

    def _relation_paths_of(self, entity):
        """Return the stored relation paths of *entity*, or ``[]`` if unknown."""
        if entity not in self._entity2relations:
            return []
        return self._entity2relations[entity]

    def _candidates_of(self, entity2feats, question):
        """Build the candidate answers for the given entities.

        Args:
            entity2feats: Mapping from entity to a sequence whose first
                item is used as the mention text.
            question: The question text.

        Returns:
            Dict mapping ``(entity, *relations)`` to
            ``[entity, mention, similarity]``.
        """
        answer2feats = {}
        # The per-entity features get their own name so they are not
        # overwritten by the per-answer feature list built below.
        for entity, entity_feats in entity2feats.items():
            relation_paths = self._relation_paths_of(entity)
            if not relation_paths:
                continue
            mention = entity_feats[0]
            for relations in relation_paths:
                answer = (entity, *relations)
                # Relation names with their first and last character
                # removed (presumably enclosing brackets; confirm).
                predicates = [spo[1:-1] for spo in relations]
                # Synthetic sentence "<mention>的<rel1>的<rel2>...".
                hypothesis = '的'.join([mention] + predicates)
                answer2feats[answer] = [
                    entity, mention,
                    self._similarity_of(hypothesis, question)
                ]
        return answer2feats

    def candidates_of(self, subject2feats: Dict[str, list], question: str):
        """Public wrapper around :meth:`_candidates_of`."""
        return self._candidates_of(subject2feats, question)

    def add_candidates_to_corpus(self, corpus: Corpus):
        """Attach candidate answers to every sample and print coverage.

        For each sample the candidates are stored under
        ``'candidate_answer'``. A sample counts as covered when one
        candidate contains every element of its gold tuple.

        Args:
            corpus: Iterable, sized collection of sample dicts with the
                keys ``'question'``, ``'gold_tuple'``, ``'gold_entitys'``
                and ``'subject_linked'``; must expose ``name``.

        Returns:
            The same *corpus* object, updated in place.
        """
        num_answers = 0
        num_2hop = 0
        num_cover = {'all': 0, '2hop': 0}
        for i, sample in enumerate(corpus):
            question = sample['question']
            gold_answer = sample['gold_tuple']
            gold_entities = sample['gold_entitys']
            subject_linked = sample['subject_linked']
            candidate_answers = self._candidates_of(subject_linked, question)
            num_answers += len(candidate_answers)
            sample['candidate_answer'] = candidate_answers

            single_entity = len(gold_answer) <= 3 and len(gold_entities) == 1
            ever_cover = False
            for answer in candidate_answers:
                if set(answer).issuperset(gold_answer):
                    ever_cover = True
                    print('* Question: ({}){}\n*\tAnswer: {}'.format(
                        i, question, answer))
                    break
            if ever_cover:
                num_cover['all'] += 1
            if single_entity:
                num_2hop += 1
                if ever_cover:
                    num_cover['2hop'] += 1

        # Guard the ratios: an empty corpus, or one without single-entity
        # questions, previously raised ZeroDivisionError here.
        total = len(corpus)
        cover_all = num_cover['all'] / total if total else 0.0
        cover_2hop = num_cover['2hop'] / num_2hop if num_2hop else 0.0
        avg_answers = num_answers / total if total else 0.0
        print("* For {}".format(corpus.name))
        print('* Cover ratio in all questions: {:.2f}'.format(cover_all))
        print('* Cover ratio in single-entity questions: {:.2f}'.format(
            cover_2hop))
        print('* Averaged candidates per question: {:.2f}'.format(
            avg_answers))
        return corpus

    def cache_similarity_query(self):
        """Pickle the similarity cache to the path given at construction."""
        with open(self._similarity_dict_path, 'wb') as f:
            pickle.dump(self._seqPair2similarity, f)

    def cache_relation_paths(self):
        """Pickle the entity-to-relations dict to its construction path."""
        with open(self._relation_paths_dict_path, 'wb') as f:
            pickle.dump(self._entity2relations, f)