"""
@Project : DuReader
@Module : similar_info.py
@Author : Deco [[email protected]]
@Created : 8/15/18 1:33 PM
@Desc :
"""
import pprint
import time

import numpy as np

from work4.elasticsearch2.extract_person import search_data_match
from work4.logger_setup import define_logger
from work4.similar_sentence.one_vs_group import st_st_similarity, most_similar

logger = define_logger('work4.similar_sentence.similar_info')

label_dict = {
    '梅西 跑动和进球': '梅西 跑动数据, 梅西 跑动速度; 梅西 刷新 历史进球纪录',
    '梅西 暑假': '世界足坛 明星们 过 暑假;'
}
# labels = ['梅西 跑动和进球', '梅西 暑假']

label_st = {
    '梅西 跑动和进球': [
        '9球,梅西刷新自己的甘伯杯历史进球纪录; ',
        '阿根廷跑动数据:梅西7.6公里; 对阵克罗地亚全场84%的时间梅西的跑动速度均在7km/h以下。'
    ],
    '梅西 暑假': ['梅西逗狗,二弟玩鹰!来看看世界足坛明星们都是怎么过暑假的']
}
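# ----------------------------------------------------------------------
# Sketch (not from the repo): one way label_st could be consumed — pick
# the label whose example sentences best match a query. The module
# presumably uses the word2vec-based st_st_similarity/most_similar
# helpers imported above; the character-level Jaccard metric here is
# only a self-contained stand-in.
from typing import Dict, List


def char_jaccard(a: str, b: str) -> float:
    """Crude character-overlap similarity, a placeholder metric."""
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if (sa or sb) else 0.0


def best_label(query: str, label_sentences: Dict[str, List[str]]) -> str:
    """Return the label whose example sentences best match the query."""
    return max(
        label_sentences,
        key=lambda lab: max(char_jaccard(query, st)
                            for st in label_sentences[lab]),
    )


# Example: best_label('梅西的暑假是怎么过的', label_st) -> '梅西 暑假'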
""" @Project : DuReader @Module : prepare_paragraphs.py @Author : Deco [[email protected]] @Created : 8/20/18 3:41 PM @Desc : """ import pickle import pprint import time from work4.elasticsearch2.extract_person import search_data_match from work4.logger_setup import define_logger logger = define_logger('work3.prepare_paragraphs') if __name__ == '__main__': asked_sentences = search_data_match() logger.info('The context sentences for the questions:') time.sleep(0.5) pprint.pprint(asked_sentences) with open('paragraphs_messi.pkl', 'wb') as handle: pickle.dump(asked_sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)
""" @Project : DuReader @Module : dataset.py @Created : 7/23/18 5:50 PM @Desc : """ import json from collections import Counter import jieba import numpy as np from work4.logger_setup import define_logger logger = define_logger('work3.dataset') class BRCDataset: """ This module implements the APIs for loading and using baidu reading comprehension dataset """ def __init__(self, max_p_num, max_p_len, max_q_len, train_files=[], dev_files=[], test_files=[]): # p: paragraph? q: question? self.logger = logger self.max_p_num = max_p_num
import os  # needed for base_dir and the model path below
import time
from concurrent.futures import ProcessPoolExecutor
from io import StringIO

import jieba
import numpy as np
import gensim
from cachetools import cached, TTLCache
from gensim.models.word2vec import LineSentence

from work4.logger_setup import define_logger

base_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
cache = TTLCache(maxsize=100, ttl=300)
logger = define_logger('work4.similar_sentence.one_vs_group')


# Convert sentences to vectors
def model_load():
    """Load the pretrained Chinese-Wikipedia word2vec model."""
    fn = os.path.join(base_dir, "wiki-word2vec/data/wiki.zh.model")
    model0 = gensim.models.Word2Vec.load(fn)
    logger.debug('The model was loaded.')
    return model0


model = model_load()


def avg_pooling(word_vectors: list) -> list:
    # Body truncated in the excerpt; the element-wise mean below is the
    # standard average-pooling implementation the name suggests, and is
    # an assumption rather than the original code.
    return np.mean(np.asarray(word_vectors), axis=0).tolist()
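# ----------------------------------------------------------------------
# Sketch (not from the repo): how avg_pooling plus cosine similarity
# yields a sentence-to-sentence score. The jieba tokenization and
# model.wv lookups mirror the setup above; st_st_similarity presumably
# works along these lines, but that is an assumption.
def sentence_vector_demo(sentence: str, w2v_model) -> list:
    """Average the word2vec vectors of the in-vocabulary tokens."""
    tokens = [w for w in jieba.cut(sentence) if w in w2v_model.wv]
    return avg_pooling([w2v_model.wv[w] for w in tokens])


def cosine_demo(u: list, v: list) -> float:
    """Cosine similarity between two sentence vectors."""
    u, v = np.asarray(u), np.asarray(v)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))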
import os
import pickle
import pprint
from importlib import import_module

from work4.logger_setup import define_logger

# base_dir is not shown in this excerpt; defining it as in one_vs_group.py
# (the project root three levels up) is an assumption.
base_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

module_dataset = import_module('.dataset', package='work3')
module_vocab = import_module('.vocab', package='tensorflow2')
module_rc_model = import_module('.rc_model', package='work3')
BRCDataset = getattr(module_dataset, 'BRCDataset')
Vocab = getattr(module_vocab, 'Vocab')
RCModel = getattr(module_rc_model, 'RCModel')

# Change the working directory, because the parent and grandparent
# directories are needed later
os.chdir(os.path.join(base_dir, 'work3'))
# Set an environment variable to control TensorFlow's log level
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

logger = define_logger('work3.ask_and_answer_messi')

with open('paragraphs_messi.pkl', 'rb') as handle:
    asked_sentences = pickle.load(handle)

logger.info('The context sentences for the questions:')
pprint.pprint(asked_sentences)

# The question means 'What did Messi do'; the original also notes the
# variant '梅西怎么了' ('What happened to Messi')
questions = ['梅西做了什么'] * len(asked_sentences)


def parse_args():
    """
    Parses command line arguments.
    """
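# ----------------------------------------------------------------------
# Sketch (not from the repo): the excerpt cuts off at parse_args, so the
# body below is a generic argparse skeleton. The flag names are
# illustrative; the project's actual options are not shown here.
import argparse


def parse_args_sketch():
    """A minimal stand-in for the truncated parse_args."""
    parser = argparse.ArgumentParser(
        description='Reading comprehension on the DuReader dataset')
    parser.add_argument('--predict', action='store_true',
                        help='predict answers for the test set')  # illustrative
    parser.add_argument('--gpu', type=str, default='0',
                        help='which GPU device to use')  # illustrative
    return parser.parse_args()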