def get_wmd(model, s1, s2):
    """
    Compute the Word Mover's Distance (WMD) between two documents using a gensim word-vector model.
    :param model: gensim word-vector model
    :param s1: sentence 1
    :param s2: sentence 2
    :return: the WMD between the two segmented sentences
    """
    words_s1, tag_s1 = seg_doc(s1)
    words_s2, tag_s2 = seg_doc(s2)

    wmd = model.wmdistance(words_s1, words_s2)
    return wmd
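A minimal usage sketch for get_wmd, assuming a word2vec-format vector file (the file name below is hypothetical) and that seg_doc is importable from the surrounding module:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)  # hypothetical vector file

s1 = u'光合作用的场所是叶绿体'
s2 = u'叶绿体是光合作用的场所'
print(get_wmd(wv, s1, s2))  # smaller values mean the two sentences are closer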
Example #2
    def _sort_retrieval_docs(self, query, triple_docs):
        """
        Re-rank the retrieved triples.
        :param query: question sentence
        :param triple_docs: triples returned by retrieval
        :return: triples above the score threshold, sorted by score
        """
        self.debug('>>> start _sort_retrieval_docs <<<')
        filter_triple_docs = []
        query_words, query_tags = seg_doc(query)
        for doc_item in triple_docs:
            target_field = self.query_fields[0]
            attribute = doc_item.get(target_field, "")
            attribute_words = attribute.strip().split()

            # A proper relevance classifier is needed here; for now Word Mover's Distance is used
            score = self._is_similarity(query_words,
                                        attribute_words)  # WMD-based similarity between the query and the attribute
            if score > DEFAULT_WMD_THRESHOLD:  # drop triples with low scores
                doc_item['score'] = score
                filter_triple_docs.append(doc_item)
        filter_triple_docs.sort(key=lambda x: x['score'],
                                reverse=False)  # re-rank by score (ascending)
        self.debug('>>> end _sort_retrieval_docs <<<')
        return filter_triple_docs
Example #3
    def _sort_docs_by_object(self, sentence, triple_docs):
        """
        Re-rank triple_docs by Word Mover's Distance.
        :param sentence: input sentence
        :param triple_docs: retrieved triples
        :return: triples above the score threshold, sorted by score
        """
        self.debug('>>> start _sort_docs_by_object <<<')
        words, tags = seg_doc(sentence)
        _words = [w.strip() for w in words if w.strip()]
        match_triple_docs = list()  # triple_docs that pass the match threshold
        for doc_item in triple_docs:
            item_str = doc_item.get("attribute_date", "")
            item_index = doc_item.get("attribute_date_index", "")
            item_words = [w.strip() for w in item_index.split()]
            distance = calculate_wmd(_words, item_words)  # WMD between the sentence and the attribute value
            score = exp(-distance / 19.0)  # map the distance to a (0, 1] similarity score
            doc_item['score'] = score
            if doc_item['score'] > DEFAULT_WMD_THRESHOLD:  # keep triples whose score exceeds the threshold
                self.debug('choose item_str=%s, score=%s', item_str,
                           doc_item['score'])
                match_triple_docs.append(doc_item)
            else:
                self.debug("filter item_str=%s, score=%s", item_str,
                           doc_item['score'])
        if match_triple_docs:  # sort by score in descending order
            match_triple_docs.sort(key=lambda x: x['score'], reverse=True)
        self.debug('>>> end _sort_docs_by_object <<<')
        return match_triple_docs
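The exp(-distance / 19.0) step above turns an unbounded WMD distance into a similarity score in (0, 1] that can be compared against DEFAULT_WMD_THRESHOLD; a standalone sketch of that mapping (the 19.0 scale comes from the code above, the sample distances are arbitrary):

from math import exp

def wmd_to_score(distance, scale=19.0):
    # identical documents (distance 0) score 1.0; larger distances decay toward 0
    return exp(-distance / scale)

for d in (0.0, 5.0, 19.0, 60.0):
    print(d, round(wmd_to_score(d), 3))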
def generate_training_data(path):
    """Segment every corpus file under path and write the results into path/seg_corpus."""
    file_names = os.listdir(path)

    for name in file_names:
        if os.path.isdir(os.path.join(path, name)):
            print('%s is directory' % name)
            continue
        if name.startswith('seg_'):  # file is already segmented; just normalize whitespace
            if os.path.exists(os.path.join(path, 'seg_corpus/%s' % name)):
                print("seg_corpus/%s already exists, skipping %s" % (name, name))
                continue
            with codecs.open(os.path.join(path, name), mode='r', encoding='utf-8') as fr:
                lines = fr.readlines()

            with codecs.open(os.path.join(path, 'seg_corpus/%s' % name), mode='w', encoding='utf-8') as fw:
                print('start generate seg_corpus/%s' % name)
                for line in tqdm(lines):
                    if line.strip():
                        words = line.strip().split()
                        fw.write(' '.join([w.strip() for w in words if w.strip()]))
                        fw.write('\n')
        else:  # raw file; run word segmentation before writing
            if os.path.exists(os.path.join(path, 'seg_corpus/seg_%s' % name)):
                print("seg_corpus/seg_%s already exists, skipping %s" % (name, name))
                continue
            with codecs.open(os.path.join(path, name), mode='r', encoding='utf-8') as fr:
                lines = fr.readlines()

            with codecs.open(os.path.join(path, 'seg_corpus/seg_%s' % name), mode='w', encoding='utf-8') as fw:
                print('start generate seg_corpus/seg_%s' % name)
                for line in tqdm(lines):
                    if line.strip():
                        words, tags = seg_doc(line.strip())
                        fw.write(' '.join([w.strip() for w in words if w.strip()]))
                        fw.write('\n')
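A hedged usage example for generate_training_data; the corpus directory is hypothetical, and the seg_corpus subdirectory must exist beforehand because the function opens output files inside it without creating it:

import os

corpus_dir = './corpus'  # hypothetical corpus directory containing raw and seg_ files
os.makedirs(os.path.join(corpus_dir, 'seg_corpus'), exist_ok=True)
generate_training_data(corpus_dir)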
Example #5
def save_seg_books_txt(corpus_path, target_path):
    with codecs.open(corpus_path, mode='r', encoding='utf-8') as fr:
        doc = fr.read()
    if doc:
        with codecs.open(target_path, mode='w', encoding='utf-8') as fw:
            words, flags = seg_doc(doc)
            fw.write(" ".join(words))
    else:
        logger.warn('read from %s, got nothing', corpus_path)
Example #6
    def _sort_docs_by_subject(self, sentence, triple_docs):
        """
        Rank triple_docs by longest common substring with the sentence.
        :param sentence: input sentence
        :param triple_docs: retrieved triples
        :return: triples above the match threshold, sorted by length
        """
        self.debug('>>> start _sort_docs_by_subject <<<')
        words, tags = seg_doc(sentence)
        _words = [w.strip() for w in words if w.strip()]
        chosen_triple_docs = []
        for doc_item in triple_docs:
            item_str = doc_item.get('attribute_date', "")
            item_index = doc_item.get("attribute_date_index", "")
            item_words = [w.strip() for w in item_index.split()]

            sub_string, length = longest_common_substring(
                _words, item_words)  # longest common substring of _words and item_words
            scores = [
                len(sub_string) / float(len(item_words)),
            ]
            if item_str in self.entity_synonym:  # if the attribute value has synonyms, also match the sentence against each synonym
                for extend_str in self.entity_synonym[item_str]:
                    syn_words, syn_tags = seg_doc(extend_str)
                    extend_str_words = [w.strip() for w in syn_words if w.strip()]
                    sub_string, length = longest_common_substring(
                        _words, extend_str_words)
                    scores.append(
                        len(sub_string) / float(len(extend_str_words)))
            doc_item['score'] = max(
                scores)  # final score is the best match over the attribute value and its synonyms
            doc_item['length'] = len(item_words)
            if doc_item['score'] >= TRIPLE_MATCH_THRESHOLD:  # keep triples whose match exceeds the threshold
                self.debug('choose item_str=%s, score=%s, length=%s', item_str,
                           doc_item['score'], doc_item['length'])
                chosen_triple_docs.append(doc_item)
            else:  # drop triples below the threshold
                self.debug("filter item_str=%s, score=%s, length=%s", item_str,
                           doc_item['score'], doc_item['length'])
        if chosen_triple_docs:  # sort by length in descending order
            chosen_triple_docs.sort(key=lambda x: x['length'], reverse=True)
        self.debug('>>> end _sort_docs_by_subject <<<')
        return chosen_triple_docs
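The longest_common_substring helper is not included in this snippet; the scoring above only assumes it returns the longest contiguous common run of tokens plus its length, so a minimal dynamic-programming sketch compatible with that usage might look like this:

def longest_common_substring(a, b):
    # dp[i][j] = length of the common token run ending at a[i - 1] and b[j - 1]
    best_len, best_end = 0, 0
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best_len:
                    best_len, best_end = dp[i][j], i
    return a[best_end - best_len:best_end], best_len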
def clear_p_content(content):
    # Clean the regular expressions from the template page and extract keywords from each one
    logger.debug('>>> start clear_p_content <<<')
    ret_content = list()
    keywords_list = list()
    for c in content:  # rewrite each template's Java-style regex into Python syntax
        c_text = c.get_text()
        ret_content.append(c_text.replace('(?<', '(?P<'))
        clear_c_text = c_text.replace('(?<title>(.*)?)', '').\
            replace('?<title>', '').\
            replace('.{0,4}', '').\
            replace('.{0,6}', '').\
            replace('(.*)?', '')
        # extract the Chinese strings from the regex, used to filter out irrelevant templates
        words_str = clear_c_text.replace('(', ' ').replace(')', ' ').replace(
            '?', ' ').replace('|', ' ')
        words, tags = seg_doc(words_str)
        keywords = " ".join(set([w for w in words if w.strip()]))
        keywords_list.append(keywords)
    logger.debug('>>> end clear_p_content <<<')
    return ret_content, keywords_list
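The (?< to (?P< replacement converts Java-style named capture groups into the syntax Python's re module expects; a small sketch with a hypothetical template pattern:

import re

java_style = u'(?<title>(.*)?)的作用是.{0,4}'
python_style = java_style.replace('(?<', '(?P<')

m = re.search(python_style, u'叶绿体的作用是进行光合作用')
if m:
    print(m.group('title'))  # 叶绿体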
Example #8
def write2mongodb(path):
    logger.debug('>>> start write2mongodb <<<')
    triple_docs = load_xlsx(path, start_row=1, start_col=1)
    logger.debug('load from %s, got triple_docs=%s', path, len(triple_docs))
    for doc in triple_docs:
        query = doc[0]
        query_words, query_tags = seg_doc(query)
        answer = doc[1]

        triple_subject = ""
        triple_predicate = ""
        triple_object = ""

        info = {
            "query": query,
            "answer": answer,
            "query_index": " ".join(query_words),
            "triple_subject": triple_subject,
            "triple_predicate": triple_predicate,
            "triple_object": triple_object
        }
        logger.debug('info=%s', json.dumps(info))
        collection.insert_one(info)  # insert_one: Collection.insert was removed in PyMongo 4
Example #9
    def _seg_words(self, sentence):
        words, flags = seg_doc(sentence)
        return words
import json

from pymongo import MongoClient
from tqdm import tqdm

# NOTE: the source module of these constants is not shown in the snippet; 'config' is an assumption
from config import (MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME,
                    MONGODB_BIOLOGY_TRIPLE, MONGODB_BIOLOGY_NODE)
from logger import BaseLogger
from utils import seg_doc

client = MongoClient(MONGODB_HOST, MONGODB_PORT)
db = client.get_database(MONGODB_DBNAME)
t_collection = db.get_collection(MONGODB_BIOLOGY_TRIPLE)
n_collection = db.get_collection(MONGODB_BIOLOGY_NODE)

logger = BaseLogger()

node_docs = n_collection.find()  # read every document from the MONGODB_BIOLOGY_NODE collection

logger.debug('start extract triple and write to %s', MONGODB_BIOLOGY_TRIPLE)
for doc in tqdm(node_docs):  # for each node, read its attribute values and write them into MONGODB_BIOLOGY_TRIPLE
    _id = str(doc['_id'])
    for key in doc.keys():
        if key not in ['_id', 'update_time', 'label', 'create_time']:  # skip bookkeeping fields
            if key == 'name':  # the 'name' attribute is a plain string
                triple = {"node_id": _id,
                          "attribute_name": key,
                          "attribute_date": doc[key]}
            else:  # other attributes are lists and need to be joined into one string
                triple = {"node_id": _id,
                          "attribute_name": key,
                          "attribute_date": "\n".join(doc[key])}
            words, tags = seg_doc(triple['attribute_date'])
            triple['attribute_date_index'] = " ".join([w for w in words if w.strip()])
            logger.debug('triple=%s', json.dumps(triple, ensure_ascii=False))
            t_collection.insert_one(triple)
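Since attribute_date_index stores the space-joined segmentation of attribute_date, a downstream reader can recover the token lists without re-segmenting; a minimal, hypothetical read-side sketch:

def load_triples_for_node(node_id):
    # fetch the triples written above for one node and recover their token lists
    for triple in t_collection.find({"node_id": node_id}):
        tokens = triple["attribute_date_index"].split()
        yield triple["attribute_name"], tokens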