def calc_vsm_perform(similarity_func=calc_inner_product):
    """Evaluate VSM retrieval accuracy on the training set.

    Loads (or builds and caches) the segmented training questions, turns
    each question into a tf-idf weighted query vector, ranks passages with
    ``similarity_func`` and compares the top-1 passage id to the gold pid.

    :param similarity_func: one of calc_cosine / calc_inner_product /
        calc_jaccard; any other function aborts with a message.
    :return: fraction of questions whose top-1 passage matches the gold
        pid, or None when ``similarity_func`` is invalid.
    """
    if similarity_func.__name__ not in [
            calc_cosine.__name__, calc_inner_product.__name__,
            calc_jaccard.__name__
    ]:
        print('错误的输入相似度计算函数...')
        return
    print('正在加载训练集的预处理文件...')
    if file_exists(preprocess_path):
        res_lst = read_json(preprocess_path)  # cached pre-segmented training set
    else:
        res_lst = read_json(train_path)  # raw training set: segment then cache
        for question in res_lst:
            question['question'] = seg_line(question['question'])
        write_json(preprocess_path, res_lst)
    print('正在计算相似度...')
    res = {}
    for item in res_lst:
        q_words, pid = {}, item['pid']
        for word in item['question']:
            q_words[word] = q_words.get(word, 0) + 1
        # tf-idf weight: idf * (1 + log10(tf)); words absent from the idf
        # table contribute weight 0.
        query_dic = {
            word: idf.get(word, 0) * (1 + log(tf, 10))
            for word, tf in q_words.items()
        }
        pred_pid = similarity_func(query_dic)[0][0]
        res[item['qid']] = int(pred_pid) == pid
        print('进度: %.2f%%' % (len(res) / len(res_lst) * 100))
    # res values are booleans, so summing them counts correct predictions
    # (replaces the original len(list(filter(...))) construction).
    return sum(res.values()) / len(res)
def calc_line(query, similarity_func=calc_inner_product):
    """Score a single query string against the passage collection.

    Segments ``query``, weights each token by its idf (0 for tokens not in
    the idf table) and delegates ranking to ``similarity_func``.

    :param query: raw query string.
    :param similarity_func: one of calc_cosine / calc_inner_product /
        calc_jaccard; any other function aborts with a message.
    :return: the ranking produced by ``similarity_func``, or None when the
        function is invalid.
    """
    valid_names = {
        calc_cosine.__name__, calc_inner_product.__name__,
        calc_jaccard.__name__
    }
    if similarity_func.__name__ not in valid_names:
        print('错误的输入相似度计算函数...')
        return None
    weights = {token: idf.get(token, 0) for token in seg_line(query)}
    return similarity_func(weights)
def load_data():
    """Load the question-classification train and test sets.

    Each non-blank file line has the form '<label>\\t<text>'; the text is
    word-segmented and re-joined with single spaces.

    :return: (x_train, y_train, x_test, y_test).
    """
    x_train, y_train = [], []
    x_test, y_test = [], []
    for path, x_data, y_data in ((train_question_path, x_train, y_train),
                                 (test_question_path, x_test, y_test)):
        with open(path, 'r', encoding='utf-8') as f:
            for raw in f:
                if len(raw) <= 1:  # skip blank lines
                    continue
                label, text = raw.strip().split('\t')
                x_data.append(' '.join(seg_line(text)))
                y_data.append(label)
    return x_train, y_train, x_test, y_test
def evaluate():
    """Evaluate answer extraction on the labelled training data.

    For every question, extracts an answer from the segmented first answer
    sentence and compares it with the gold answer.

    :return: (mean BLEU-1 over all questions, exact-match score).
    """
    res_lst = get_train_labels()
    bleu_total = 0
    predictions, references = [], []
    for item in res_lst:
        candidate_words = seg_line(item['answer_sentence'][0])
        gold = item['answer']
        predicted = get_ans(item['question'], item['label'], candidate_words)
        bleu_total += bleu1(predicted, gold)
        predictions.append(predicted)
        references.append(gold)
    return bleu_total / len(res_lst), exact_match(predictions, references)
def load_train_dev(dev=0.1, update=False):
    """Build rank-SVM train/dev feature files from the training set.

    For every query, each sentence of its gold passage becomes one rank-svm
    line '<rank> qid:<qid> <features>': sentences that appear among the gold
    answer sentences get rank 3, all others rank 0. Lines are grouped per
    query, sorted by qid, split into train/dev and written to
    ``train_feature_path`` / ``dev_feature_path``.

    :param dev: fraction of queries to reserve for the dev split.
    :param update: force rebuilding even when the feature files exist.
    :return: (train_features, dev_features) as lists of per-query line
        lists, or None when cached files are reused — callers must handle
        the None case.
    """
    if file_exists(train_feature_path) and file_exists(
            dev_feature_path) and not update:
        # Cached feature files are reused; NOTE this path returns None.
        return
    seg_passages, res_lst, feature_lst = load_seg_passages(), read_json(
        train_path), []
    for item in res_lst:  # one entry per query in train.json
        qid, pid = item['qid'], item['pid']
        q_words = seg_line(item['question'])
        ans_words_lst = [seg_line(line) for line in item['answer_sentence']]
        features = []
        # Fit tf-idf on the sentences of this query's gold passage only.
        tf_idf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        tf_idf_vec.fit_transform(' '.join(word_lst)
                                 for word_lst in seg_passages[str(pid)])
        for word_lst in seg_passages[str(pid)]:
            rank = 3 if word_lst in ans_words_lst else 0  # ranking value todo
            feature = ' '.join(get_features(q_words, word_lst, tf_idf_vec))
            features.append('%d qid:%d %s' % (rank, qid, feature))
        feature_lst.append(features)
    feature_lst.sort(
        key=lambda lst: int(lst[0].split()[1].split(':')[1]))  # sort by qid
    dev_num = int(dev * len(feature_lst))
    # Split by explicit index: the original `[:-dev_num]` slice emptied the
    # TRAIN split whenever dev_num == 0 (tiny dataset or dev=0), because
    # lst[:-0] == []. With an index, dev_num == 0 yields an empty dev split
    # instead, which is the intended behavior.
    split = len(feature_lst) - dev_num
    train_features, test_features = feature_lst[:split], feature_lst[split:]
    with open(train_feature_path, 'w', encoding='utf-8') as f1, \
            open(dev_feature_path, 'w', encoding='utf-8') as f2:
        f1.write('\n'.join(feature for per_query in train_features
                           for feature in per_query))
        f2.write('\n'.join(feature for per_query in test_features
                           for feature in per_query))
    return train_features, test_features
def predict(similarity_func=calc_inner_product):
    """Predict a passage id for every test question and write the result.

    Requires vsm_init() to have been executed beforehand. Each test question
    is word-segmented, converted to an idf-weighted bag of words, ranked
    with ``similarity_func``, and the top passage id is stored in its 'pid'
    field before the whole list is written to ``test_predict_path``.

    :param similarity_func: one of calc_cosine / calc_inner_product /
        calc_jaccard; any other function aborts with a message.
    """
    allowed = {
        calc_cosine.__name__, calc_inner_product.__name__,
        calc_jaccard.__name__
    }
    if similarity_func.__name__ not in allowed:
        print('错误的输入相似度计算函数...')
        return
    test_lst = read_json(test_path)
    for q_item in test_lst:
        words = seg_line(q_item['question'])  # word segmentation
        q_item['question'] = words
        weights = {word: idf.get(word, 0) for word in words}
        q_item['pid'] = int(similarity_func(weights)[0][0])
    write_json(test_predict_path, test_lst)