# Debug probes, kept for reference:
#print(all_doc_slices[:3])
#print(len(all_que_slices))
#print(all_que_slices[28])
# Optional truncation for quick local runs:
#all_que_slices = all_que_slices[:3]
#que_docs = que_docs[:3]

# Fetch the judged documents for each training query.
#docs_have = preprocess.FetchDocs(c_data, que_docs, rel_docs)
docs_have = preprocess.FetchDocs_for_tr(c_data, que_docs, rel_docs)

'''
3. Data fragmentation (snippet slicing) mechanism
'''
import matchTool

ma = matchTool.Match(processed_docs, word2vec, 1)
#print(ma.idfmax)

# Alternative similarity back-ends, kept for comparison runs:
#sni1_score_list = ma.getwordsim1(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 2)
#sni2_score_list = ma.getwordsim1(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 3)
#sni1_score_list = ma.getsimwmd(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 2)
#sni2_score_list = ma.getsimwmd(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 3)
#ma = matchTool.Match(processed_docs, [], 0)
#ma.getsimlmj(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 1)

# BM25 over the document snippets; the last argument appears to select the
# score-integration variant.
sni1_score_list = ma.getsimbm25(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 3)
sni2_score_list = ma.getsimbm25(all_doc_slices, all_que_slices, rel_docs, docs_have, file_index, 2)
#sni1_score_list = ma.getsimbm25forfullsearch(all_doc_slices, all_que_slices, file_index, 2)
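# For reference, a minimal sketch of the per-snippet scoring that getsimbm25
# presumably performs: classic BM25 of the query tokens against one snippet's
# tokens. This helper and its k1/b defaults are illustrative assumptions and
# are not taken from the actual matchTool implementation.
def bm25_snippet_score(query_tokens, snippet_tokens, idf, avg_len, k1=1.5, b=0.75):
    """Score one snippet against a query with the standard BM25 formula (sketch)."""
    snippet_len = len(snippet_tokens)
    score = 0.0
    for term in set(query_tokens):
        tf = snippet_tokens.count(term)
        if tf == 0:
            continue
        # BM25 term weight: idf * saturated tf with length normalization.
        score += idf.get(term, 0.0) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * snippet_len / avg_len))
    return score

# Example: bm25_snippet_score(["ocean", "current"], "the ocean current model".split(),
#                             {"ocean": 2.0, "current": 1.4}, avg_len=10.0)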
import ast  # literal_eval is a safer parse than eval for repr'd dict lines

for line in lines:
    doc_data = ast.literal_eval(line.replace('\n', ''))
    origin_docs.append(doc_data['content'])
    rel_list.append(doc_data['r'])
    docno_list.append(doc_data['doc_no'])
    if doc_data['r'] != "0":
        #print(doc_data['r'])
        all_rela += 1  # count the judged-relevant documents

processed_docs = Stem_voca([tokenize(doc) for doc in origin_docs])
all_doc_slices = Puncut(origin_docs)

bm25Model = matchTool.Match(processed_docs, idf, [], 0)
average_idf = sum(float(v) for v in bm25Model.idf.values()) / len(bm25Model.idf)

# bm25Sim(self, qu_doc, doc_slice, doc_index, limit_ratio, filter_score)
for doc_index, doc_slice in enumerate(all_doc_slices):
    #limit_ratio = (len(qu_doc) * 1.0 / (len(processed_docs[doc_index]) + 0.000001)) * 10
    sim_score_snippets, rel_snippet_ratio = bm25Model.bm25Sim(qu_doc, doc_slice, doc_index, 0.1, average_idf)
    # Integrate the per-snippet scores into one document score (see the sketch below).
    score = integScore(sim_score_snippets, rel_snippet_ratio, 3)
    score_list.append((str(doc_index), score))

'''
#bm25Model = bm25.BM25(processed_docs, idf, 3)
score_dict = {}
scores = bm25Model.get_scores_for_vn(qu_doc, average_idf)
for idx, score_item in enumerate(scores):
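# integScore's definition is not shown here. A plausible (purely hypothetical)
# reading, given its arguments, is that it averages the strongest snippet scores
# and weights them by the fraction of snippets that matched the query at all:
def integ_score_sketch(sim_score_snippets, rel_snippet_ratio, top_n=3):
    """Combine per-snippet scores into one document score (illustrative only)."""
    if not sim_score_snippets:
        return 0.0
    top = sorted(sim_score_snippets, reverse=True)[:top_n]
    return (sum(top) / len(top)) * rel_snippet_ratio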
import ast  # literal_eval is a safer parse than eval for repr'd dict lines

documents = []
with open("../04_groups/" + test_dict['index'], "r", encoding='utf-8') as target_object:
    lines = list(target_object.readlines())
    for line in lines:
        doc_data = ast.literal_eval(line.replace('\n', ''))
        # Keep only the judged-relevant documents (r != "0") as the expansion corpus.
        if doc_data['r'] != "0":
            documents.append(doc_data['content'])

preprocess = pp.PreProc(list(que_doc), documents)
que_docs, processed_docs = preprocess.Normal()

ma = matchTool.Match(processed_docs, idf, [], 0)
average_idf = sum(float(v) for v in ma.idf.values()) / len(ma.idf)

# Generate expansion words for the query from the relevant documents
# (see the co-occurrence sketch below).
extend_word = ma.ewGen_occ_trec(que_doc, preprocess.corpus_words, documents, 0.5, 0.3)
#ex_words = select_words(ex_words, 2)
#print(test_dict['index'], " extend words:", ex_words)
print("finish:", test_dict['index'])
ex_word_dict[test_dict['index']] = extend_word
#print(ex_word_dict)

with open("../ex/extend_conf" + str(start) + "_" + str(end), "w", encoding='utf-8') as extend_object:
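# ewGen_occ_trec's internals are not shown; its name suggests it ranks candidate
# expansion words by co-occurrence with the query in the relevant documents.
# A minimal sketch under that assumption (the helper name, the ratio threshold,
# and top_k are illustrative, not the real parameters):
from collections import Counter

def expansion_words_sketch(query_tokens, docs_tokens, min_cooc_ratio=0.5, top_k=10):
    """Rank non-query terms by the share of query-matching docs they occur in."""
    query_set = set(query_tokens)
    hit_docs = [set(toks) for toks in docs_tokens if query_set & set(toks)]
    if not hit_docs:
        return []
    counts = Counter(term for doc in hit_docs for term in doc - query_set)
    return [term for term, cnt in counts.most_common()
            if cnt / len(hit_docs) >= min_cooc_ratio][:top_k]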