コード例 #1
0
def calc_vsm_perform(similarity_func=calc_inner_product):
    """Measure how often the vector-space model picks the right passage.

    For every training question a tf-idf weighted query vector is built
    (``idf * (1 + log10(tf))`` per word, with idf 0 for words missing from
    the module-level ``idf`` mapping) and handed to ``similarity_func``.
    The value at ``[0][0]`` of its result is taken as the predicted passage
    id and compared with the question's ``pid``.

    The segmented training set is cached: it is read from
    ``preprocess_path`` when that file exists, otherwise it is built from
    ``train_path`` and written to ``preprocess_path``.

    Args:
        similarity_func: one of ``calc_cosine``, ``calc_inner_product`` or
            ``calc_jaccard`` (matched by function name).

    Returns:
        The fraction of questions whose predicted pid equals the labelled
        pid, ``0.0`` when the training set is empty, or ``None`` when
        ``similarity_func`` is not one of the supported functions.
    """
    from collections import Counter

    supported_names = (
        calc_cosine.__name__,
        calc_inner_product.__name__,
        calc_jaccard.__name__,
    )
    # getattr keeps callables without a __name__ (e.g. functools.partial)
    # on the "unsupported function" path instead of raising AttributeError.
    if getattr(similarity_func, '__name__', None) not in supported_names:
        print('错误的输入相似度计算函数...')
        return None

    print('正在加载训练集的预处理文件...')
    if file_exists(preprocess_path):
        # Load the already segmented training set.
        res_lst = read_json(preprocess_path)
    else:
        # Segment the raw training set once and cache the result.
        res_lst = read_json(train_path)
        for question in res_lst:
            question['question'] = seg_line(question['question'])
        write_json(preprocess_path, res_lst)

    print('正在计算相似度...')
    if not res_lst:
        # Nothing to evaluate; avoids a ZeroDivisionError below.
        return 0.0

    total = len(res_lst)
    res = {}  # qid -> whether the prediction was correct
    for item in res_lst:
        term_counts = Counter(item['question'])
        query_dic = {
            word: idf.get(word, 0) * (1 + log(tf, 10))
            for word, tf in term_counts.items()
        }
        pred_pid = similarity_func(query_dic)[0][0]
        res[item['qid']] = int(pred_pid) == item['pid']
        print('进度: %.2f%%' % (len(res) / total * 100))
    # Booleans sum as 0/1, so this is the number of correct predictions.
    return sum(res.values()) / len(res)
コード例 #2
0
def calc_line(query, similarity_func=calc_inner_product):  # Score a single query
    """Run one raw query string through the chosen similarity function.

    The query is segmented with ``seg_line`` and every resulting word is
    weighted by its entry in the module-level ``idf`` mapping (0 when the
    word is not present there).

    Args:
        query: the raw query text.
        similarity_func: one of ``calc_cosine``, ``calc_inner_product`` or
            ``calc_jaccard`` (matched by function name).

    Returns:
        Whatever ``similarity_func`` returns for the weighted query, or
        ``None`` (after printing an error) for an unsupported function.
    """
    func_name = similarity_func.__name__
    supported_names = (
        calc_cosine.__name__,
        calc_inner_product.__name__,
        calc_jaccard.__name__,
    )
    if func_name in supported_names:
        weights = {}
        for token in seg_line(query):
            weights[token] = idf.get(token, 0)
        return similarity_func(weights)

    print('错误的输入相似度计算函数...')
    return None
コード例 #3
0
def load_data():  # Load the question-classification training and test data
    """Read the labelled question files and return segmented text + labels.

    Both ``train_question_path`` and ``test_question_path`` are read as
    UTF-8 text with one ``label<TAB>question`` record per line.  Each
    question is segmented with ``seg_line`` and re-joined with single
    spaces.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)`` where the ``x``
        lists hold the space-joined segmented questions and the ``y``
        lists hold the corresponding labels.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    train_x, train_y, test_x, test_y = [], [], [], []
    splits = (
        (train_question_path, train_x, train_y),
        (test_question_path, test_x, test_y),
    )
    for path, x_data, y_data in splits:
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, raw_line in enumerate(f, 1):
                # Lines of at most one character have always been skipped.
                if len(raw_line) <= 1:
                    continue
                record = raw_line.strip()
                # Whitespace-only lines carry no record either; skip them
                # instead of failing on the unpacking below.
                if not record:
                    continue
                # Split on the first tab only, so a question that itself
                # contains a tab is kept intact rather than raising.
                label, sep, question = record.partition('\t')
                if not sep:
                    raise ValueError(
                        '%s:%d: expected "label<TAB>question", got %r'
                        % (path, line_no, record))
                x_data.append(' '.join(seg_line(question)))
                y_data.append(label)
    return train_x, train_y, test_x, test_y
コード例 #4
0
def evaluate():
    """Score answer extraction on the labelled training data.

    For every record returned by ``get_train_labels`` the first answer
    sentence is segmented, ``get_ans`` extracts a predicted answer from it
    (given the question and its label), and the prediction is compared
    with the reference answer using ``bleu1``.

    Returns:
        A 2-tuple of the mean ``bleu1`` score over all records and the
        result of ``exact_match`` over all predictions and references.
    """
    records = get_train_labels()
    bleu_total = 0
    predictions = []
    references = []
    for record in records:
        sentence_words = seg_line(record['answer_sentence'][0])
        reference = record['answer']
        predicted = get_ans(record['question'], record['label'],
                            sentence_words)
        # Accumulate in record order so the floating-point total is the
        # same as a plain running sum.
        bleu_total += bleu1(predicted, reference)
        predictions.append(predicted)
        references.append(reference)
    mean_bleu = bleu_total / len(records)
    return mean_bleu, exact_match(predictions, references)
コード例 #5
0
def load_train_dev(dev=0.1,
                   update=False):  # Build the train/dev sets and write them in rank-svm format
    """Generate ranking features for the training data and split off a dev set.

    For every query in ``train_path`` a ``TfidfVectorizer`` is fitted on the
    segmented sentences of the query's passage, and one feature line
    ``"<value> qid:<qid> <features>"`` is produced per passage sentence.
    ``value`` is 3 when the sentence is one of the query's segmented answer
    sentences and 0 otherwise.  Queries are ordered by qid and the last
    ``int(dev * number_of_queries)`` of them form the dev set.  Both sets
    are written, one feature line per row, to ``train_feature_path`` and
    ``dev_feature_path``.

    Args:
        dev: fraction of the queries to hold out as the dev set.
        update: regenerate the files even when both already exist.

    Returns:
        ``(train_features, dev_features)``, each a list with one list of
        feature lines per query; or ``None`` when both feature files already
        exist and ``update`` is false (nothing is regenerated in that case).
    """
    if (file_exists(train_feature_path) and file_exists(dev_feature_path)
            and not update):
        return None

    seg_passages = load_seg_passages()
    res_lst = read_json(train_path)
    groups = []  # one (qid, feature lines) pair per query
    for item in res_lst:  # every entry of train.json describes one query
        qid, pid = item['qid'], item['pid']
        q_words = seg_line(item['question'])
        ans_words_lst = [seg_line(line) for line in item['answer_sentence']]
        passage = seg_passages[str(pid)]

        tf_idf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        tf_idf_vec.fit_transform(' '.join(word_lst) for word_lst in passage)

        features = []
        for word_lst in passage:
            # Ranking target for this sentence.  TODO: revisit the value.
            value = 3 if word_lst in ans_words_lst else 0
            feature = ' '.join(get_features(q_words, word_lst, tf_idf_vec))
            features.append('%d qid:%d %s' % (value, qid, feature))
        groups.append((qid, features))

    # Sort by qid directly rather than re-parsing it from the feature line.
    groups.sort(key=lambda group: int(group[0]))
    feature_lst = [features for _, features in groups]

    # Use an explicit split index: with a negative-index slice a dev size
    # of 0 would turn into ``[:-0]`` / ``[-0:]``, i.e. an empty training
    # set and everything in the dev set.
    total = len(feature_lst)
    dev_num = min(max(int(dev * total), 0), total)
    split_at = total - dev_num
    train_features = feature_lst[:split_at]
    test_features = feature_lst[split_at:]

    # Export the training and dev sets.
    with open(train_feature_path, 'w', encoding='utf-8') as f1, \
            open(dev_feature_path, 'w', encoding='utf-8') as f2:
        f1.write('\n'.join(
            line for group in train_features for line in group))
        f2.write('\n'.join(
            line for group in test_features for line in group))
    return train_features, test_features
コード例 #6
0
def predict(similarity_func=calc_inner_product):
    # Predict on the test set; vsm_init() must have been run before this
    # function is called.
    """Predict a passage id for every test question and save the result.

    Each entry read from ``test_path`` has its ``'question'`` replaced by
    the segmented word list, and gets a ``'pid'`` set to the integer value
    found at ``[0][0]`` of the ``similarity_func`` result for the
    idf-weighted query (0 for words missing from ``idf``).  The updated
    entries are written to ``test_predict_path``.

    Args:
        similarity_func: one of ``calc_cosine``, ``calc_inner_product`` or
            ``calc_jaccard`` (matched by function name).

    Returns:
        ``None``.  For an unsupported function an error is printed and
        nothing is read or written.
    """
    func_name = similarity_func.__name__
    supported_names = (
        calc_cosine.__name__,
        calc_inner_product.__name__,
        calc_jaccard.__name__,
    )
    if func_name not in supported_names:
        print('错误的输入相似度计算函数...')
        return None

    entries = read_json(test_path)
    for entry in entries:
        tokens = seg_line(entry['question'])  # word segmentation
        entry['question'] = tokens
        weights = {}
        for token in tokens:
            weights[token] = idf.get(token, 0)
        ranking = similarity_func(weights)
        entry['pid'] = int(ranking[0][0])
    write_json(test_predict_path, entries)