Example #1
"""
@Project   : DuReader
@Module    : similar_info.py
@Author    : Deco [[email protected]]
@Created   : 8/15/18 1:33 PM
@Desc      : 
"""
import pprint
import time

import numpy as np

from work4.elasticsearch2.extract_person import search_data_match
from work4.logger_setup import define_logger
from work4.similar_sentence.one_vs_group import st_st_similarity, most_similar

logger = define_logger('work4.similar_sentence.similar_info')

label_dict = {
    '梅西 跑动和进球': '梅西 跑动数据, 梅西 跑动速度; 梅西 刷新 历史进球纪录',
    '梅西 暑假': '世界足坛 明星们 过 暑假;'
}
# labels = ['梅西 跑动和进球', '梅西 暑假']

label_st = {
    '梅西 跑动和进球': [
        '9球,梅西刷新自己的甘伯杯历史进球纪录; ',
        '阿根廷跑动数据:梅西7.6公里; 对阵克罗地亚全场84%的时间梅西的跑动速度均在7km/h以下。'
    ],
    '梅西 暑假': ['梅西逗狗,二弟玩鹰!来看看世界足坛明星们都是怎么过暑假的']
}
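
# Minimal, self-contained sketch of the idea behind the imported
# most_similar / st_st_similarity (their signatures are not shown in this
# snippet): pick the label whose sentence vector is closest to a query
# vector by cosine similarity. The 3-d vectors are toy values, not real
# word2vec output.
def _cosine(u, v):
    # Cosine similarity between two sentence vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


_query_vec = np.array([0.2, 0.7, 0.1])
_group_vecs = {
    '梅西 跑动和进球': np.array([0.3, 0.6, 0.2]),
    '梅西 暑假': np.array([0.9, 0.1, 0.0]),
}
_best_label = max(_group_vecs, key=lambda k: _cosine(_query_vec, _group_vecs[k]))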
Example #2
"""
@Project   : DuReader
@Module    : prepare_paragraphs.py
@Author    : Deco [[email protected]]
@Created   : 8/20/18 3:41 PM
@Desc      : 
"""
import pickle
import pprint
import time

from work4.elasticsearch2.extract_person import search_data_match
from work4.logger_setup import define_logger

logger = define_logger('work3.prepare_paragraphs')

if __name__ == '__main__':

    asked_sentences = search_data_match()
    logger.info('The context sentences for the questions:')
    time.sleep(0.5)
    pprint.pprint(asked_sentences)

    with open('paragraphs_messi.pkl', 'wb') as handle:
        pickle.dump(asked_sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)
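
    # Round-trip sanity check (illustrative addition, not in the original):
    # the dump written above can be read back with pickle.load, which is
    # exactly what Example #5 below does.
    with open('paragraphs_messi.pkl', 'rb') as handle:
        restored = pickle.load(handle)
    assert restored == asked_sentences  # holds for plain list/dict data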
"""
@Project   : DuReader
@Module    : dataset.py
@Created   : 7/23/18 5:50 PM
@Desc      :
"""

import json
from collections import Counter

import jieba
import numpy as np
from work4.logger_setup import define_logger
logger = define_logger('work3.dataset')


class BRCDataset:
    """
    Implements the APIs for loading and using the Baidu reading
    comprehension dataset.
    """
    def __init__(self,
                 max_p_num,
                 max_p_len,
                 max_q_len,
                 train_files=[],
                 dev_files=[],
                 test_files=[]):
        # p: paragraph, q: question
        self.logger = logger
        self.max_p_num = max_p_num
        self.max_p_len = max_p_len
        self.max_q_len = max_q_len
        # Remaining assignments inferred from the parameter list; the
        # original snippet is truncated here
        self.train_files = train_files
        self.dev_files = dev_files
        self.test_files = test_files
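
# Illustrative construction only (hypothetical sizes and path, not from the
# repo); the keyword names match the __init__ signature above.
#     brc_data = BRCDataset(max_p_num=5, max_p_len=500, max_q_len=60,
#                           train_files=['path/to/search.train.json'])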
Example #4
import os
import time
from concurrent.futures import ProcessPoolExecutor
from io import StringIO

import jieba
import numpy as np
import gensim
from cachetools import cached, TTLCache
from gensim.models.word2vec import LineSentence

from work4.logger_setup import define_logger

base_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
cache = TTLCache(maxsize=100, ttl=300)
logger = define_logger('work4.similar_sentence.one_vs_group')

# Convert sentences to vectors


def model_load():
    fn = os.path.join(base_dir, "wiki-word2vec/data/wiki.zh.model")
    model0 = gensim.models.Word2Vec.load(fn)
    logger.debug('The model was loaded.')
    return model0


model = model_load()


def avg_pooling(word_vectors: list) -> list:
    # Element-wise mean of the word vectors -> one sentence vector
    return np.mean(word_vectors, axis=0).tolist()
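
# How avg_pooling is typically combined with the model above: tokenize with
# jieba, keep in-vocabulary tokens, average their vectors. The membership
# test assumes gensim 4's KeyedVectors interface (model.wv); this helper is
# an illustrative addition, not part of the original module.
def sentence_vector(sentence: str) -> list:
    vectors = [model.wv[tok] for tok in jieba.lcut(sentence) if tok in model.wv]
    return avg_pooling(vectors) if vectors else []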
Example #5
import os
import pickle
import pprint
from importlib import import_module

from work4.logger_setup import define_logger

# base_dir assumed: project root, as in one_vs_group.py above
base_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

module_dataset = import_module('.dataset', package='work3')
module_vocab = import_module('.vocab', package='tensorflow2')
module_rc_model = import_module('.rc_model', package='work3')
BRCDataset = getattr(module_dataset, 'BRCDataset')
Vocab = getattr(module_vocab, 'Vocab')
RCModel = getattr(module_rc_model, 'RCModel')

os.chdir(os.path.join(base_dir, 'work3'))
# Change the working directory, since the parent and grandparent
# directories are used later
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Set the environment variable that controls TensorFlow's log level

logger = define_logger('work3.ask_and_answer_messi')

with open('paragraphs_messi.pkl', 'rb') as handle:
    asked_sentences = pickle.load(handle)

logger.info('The context sentences for the questions:')
pprint.pprint(asked_sentences)

questions = ['梅西做了什么'] * len(asked_sentences)
# alternative question: '梅西怎么了' ("What happened to Messi?")


def parse_args():
    """
    Parses command line arguments.
    """