Example #1
import jieba

def tokenize_with_jieba(input_file,
                        stopwords_file=None,
                        for_search=False,
                        output_file=None):
    sentences = read_file(input_file)
    sentences_segs = []
    for sentence in sentences:
        if for_search:
            seg_list = jieba.cut_for_search(sentence)
        else:
            seg_list = jieba.cut(sentence)
        sentences_segs.append(list(seg_list))

    # If a stopwords file was passed in, remove stop words
    if stopwords_file:
        # use a set for O(1) membership tests
        stopwords = set(read_file(stopwords_file))
        segs = []
        for seg_list in sentences_segs:
            segs.append([seg for seg in seg_list
                         if seg not in stopwords])
    else:
        segs = sentences_segs

    if output_file:
        # serialize each token list via repr(), one list per line
        segs_out = [str(x) for x in segs]
        write_file(output_file, segs_out, mode='w', encoding='utf-8')
    return segs
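
A minimal usage sketch for the helper above; the file names are hypothetical, and read_file/write_file are the repository's own I/O helpers (assumed to map one line to one list element):

# Hypothetical file names; for_search=True would switch to jieba's
# search-engine mode (finer-grained, overlapping tokens).
segs = tokenize_with_jieba('questions.txt',
                           stopwords_file='stopwords.txt',
                           output_file='questions_segs.txt')
print(segs[0])  # token list of the first sentence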
Example #2
import os

def run_prediction(input_file_path, output_file_path):

    logger.info("run prediction, params: top - %s, "
                "file_questions - %s, file_answers - %s" %
                (str(conf.top), conf.file_questions, conf.file_answers))

    # Read the training set (reference answers)
    logger.info("read answers from: %s" % conf.file_answers)
    answers = read_file(conf.file_answers)

    # Tokenizer selection
    logger.info("seg_name is: %s" % conf.seg_name)
    if conf.seg_name == 'api':
        seg = ApiSeg(conf.file_stopwords,
                     api_token=conf.api_token,
                     inner=conf.inner)
    elif conf.seg_name == 'jieba':
        # jieba-based tokenization with stop-word removal
        seg = JiebaSeg(conf.file_stopwords)
    elif conf.seg_name == 'lac':
        seg = LacSeg(clear_sw=True)
    else:
        raise ValueError(
            "invalid conf.seg_name; expected one of ['jieba', 'lac', 'api']")

    # Train the model
    ss = SentenceSimilarity(seg, model_path=conf.model_path)
    if not os.path.exists(os.path.join(conf.model_path, 'tfidf.model')):
        logger.info("refresh model, read questions from: %s" %
                    conf.file_questions)
        questions = read_file(conf.file_questions)
        ss.set_sentences(questions)
    logger.info("starting train model.")
    ss.TfidfModel()

    # Read the test set
    logger.info("read dev_sentences from: %s" % input_file_path)
    dev_sentences = read_file(input_file_path)

    logger.info("test is running, result will write to %s" % output_file_path)
    with open(output_file_path, 'w', encoding='utf-8') as file_result:
        for sentence in dev_sentences:
            top_answers = ss.similarity(sentence, top=conf.top)
            logger.info(top_answers)
            top_answers_index = [pair[0] for pair in top_answers]
            answer_candidates = []
            for a in top_answers_index:
                mid_answer = answers[a]
                answer_candidates.append((mid_answer, len(mid_answer)))
            logger.info(answer_candidates)
            # among the top-N candidates, keep the longest answer
            answer = max(answer_candidates, key=lambda x: x[1])[0]
            # strip surrounding spaces plus ASCII and full-width commas
            file_result.write(answer.strip(" ,，") + '\n')
    logger.info("run prediction success.")
Example #3
 def train(self):
     """Train the BM25 model."""
     if self.fresh:
         logger.info("re-tokenizing and building the model")
         segs = self.initialize()
     else:
         logger.info("loading existing tokenization from %s and "
                     "building the model" % conf.file_questions_segs)
         segs = read_file(conf.file_questions_segs)
         # parse the stored token lists safely; assumes a module-level
         # "import ast" (safer than the original eval())
         segs = [ast.literal_eval(x) for x in segs]
     self.model = BM25Model = bm25.BM25(segs)
     self.average_idf = (sum(float(v) for v in BM25Model.idf.values()) /
                         len(BM25Model.idf))
     logger.info("BM25 model built successfully")
Example #4
def run_prediction(input_file_path, output_file_path):
    model = BM25Model()
    model.train()

    logger.info('predict most similarity questions in %s' % input_file_path)
    sim_results = model.predict(input_file_path)

    logger.info("read reference answers from %s" % conf.file_answers)
    answers = read_file(conf.file_answers)

    logger.info("result will write to %s" % output_file_path)
    logger.info("max_len_answer is the longest answer of top %s" %
                str(conf.top))
    with open(output_file_path, 'w', encoding='utf-8') as file_result:
        for top_sims in sim_results:
            answer_list = []
            for sim in top_sims:
                answer = answers[sim[0]]
                answer_list.append((answer, len(answer)))
            # keep the longest of the top-N candidate answers
            max_len_answer = max(answer_list, key=lambda x: x[1])[0]
            file_result.write(max_len_answer + '\n')
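
A hypothetical end-to-end call; the input file holds one question per line and the output file receives one predicted answer per line:

# Hypothetical paths for illustration only
run_prediction('data/dev_questions.txt', 'data/dev_predictions.txt')

Note that both prediction paths (TF-IDF in Example #2 and BM25 here) share the same tie-break: among the top-N most similar questions, the longest reference answer is returned.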
Example #5
 def __init__(self, file_stopwords):
     # load the stop-word list once at construction time
     self.stopwords = read_file(file_stopwords)
Example #6
 def __init__(self, file_stopwords, api_token, inner):
     # stop-word list plus credentials/flags for the remote tokenizer API
     self.stopwords = read_file(file_stopwords)
     self.api_token = api_token
     self.inner = inner
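
The two constructors above only load state; the tokenize-and-filter step lives elsewhere in each class. A hypothetical sketch of what the jieba-backed companion method could look like (the real method body is not shown in these excerpts; assumes a module-level "import jieba"):

 def cut(self, sentence):
     # tokenize one sentence and drop stop words (hypothetical sketch)
     return [tok for tok in jieba.cut(sentence)
             if tok not in self.stopwords]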