Example 1
#print(all_doc_slices[:3])
#print(len(all_que_slices))
#print(all_que_slices[28])

#all_que_slices = all_que_slices[:3]
#que_docs = que_docs[:3]
#docs_have = preprocess.FetchDocs(c_data, que_docs,rel_docs)

docs_have = preprocess.FetchDocs_for_tr(c_data, que_docs, rel_docs)
'''
3. Data fragmentation (slicing) mechanism
'''
import matchTool

ma = matchTool.Match(processed_docs, word2vec, 1)
#print(ma.idfmax)
#sni1_score_list = ma.getwordsim1(all_doc_slices,all_que_slices,rel_docs,docs_have,file_index,2)
#sni2_score_list = ma.getwordsim1(all_doc_slices,all_que_slices,rel_docs,docs_have,file_index,3)
#sni1_score_list = ma.getsimwmd(all_doc_slices,all_que_slices,rel_docs,docs_have,file_index,2)
#sni2_score_list = ma.getsimwmd(all_doc_slices,all_que_slices,rel_docs,docs_have,file_index,3)

#ma = matchTool.Match(processed_docs,[],0)
#ma.getsimlmj(all_doc_slices,all_que_slices,rel_docs,docs_have,file_index,1)

sni1_score_list = ma.getsimbm25(all_doc_slices, all_que_slices, rel_docs,
                                docs_have, file_index, 3)
sni2_score_list = ma.getsimbm25(all_doc_slices, all_que_slices, rel_docs,
                                docs_have, file_index, 2)

#sni1_score_list = ma.getsimbm25forfullsearch(all_doc_slices,all_que_slices,file_index,2)
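The getsimbm25 calls above belong to the author's matchTool module, whose source is not shown on this page. For orientation only, here is a minimal self-contained sketch of the standard BM25 scoring such a method typically wraps; build_idf, bm25_score, and the k1/b defaults are illustrative assumptions, not matchTool's actual API.

import math
from collections import Counter

def build_idf(corpus):
    # Standard BM25 idf over a list of tokenized documents.
    N = len(corpus)
    df = Counter()
    for doc in corpus:
        df.update(set(doc))
    # The +1 inside the log keeps idf non-negative for very common terms.
    return {t: math.log((N - n + 0.5) / (n + 0.5) + 1) for t, n in df.items()}

def bm25_score(query, doc, idf, avgdl, k1=1.5, b=0.75):
    # Score one tokenized document slice against a tokenized query.
    tf = Counter(doc)
    score = 0.0
    for term in query:
        if term in tf:
            f = tf[term]
            norm = f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc) / avgdl))
            score += idf.get(term, 0.0) * norm
    return score

corpus = [["bm25", "ranks", "slices"], ["slices", "of", "documents"]]
idf = build_idf(corpus)
avgdl = sum(len(d) for d in corpus) / len(corpus)
print(bm25_score(["bm25", "slices"], corpus[0], idf, avgdl))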
Example 2
    for line in lines:
        # Each line stores one serialized dict; eval() is kept from the
        # original, though ast.literal_eval is safer (see sketch below).
        doc_data = eval(line.replace('\n', ''))
        origin_docs.append(doc_data['content'])
        rel_list.append(doc_data['r'])
        docno_list.append(doc_data['doc_no'])
        if doc_data['r'] != "0":
            #print(doc_data['r'])
            all_rela += 1

processed_docs = Stem_voca([tokenize(doc) for doc in origin_docs])
all_doc_slices = Puncut(origin_docs)

bm25Model = matchTool.Match(processed_docs, idf, [], 0)
# Mean idf over the vocabulary, passed as filter_score below.
average_idf = sum(map(lambda k: float(bm25Model.idf[k]),
                      bm25Model.idf.keys())) / len(bm25Model.idf.keys())

#bm25Sim(self, qu_doc, doc_slice, doc_index, limit_ratio, filter_score)
for doc_index, doc_slice in enumerate(all_doc_slices):
    #limit_ratio = (len(qu_doc) * 1.0 / (len(processed_docs[doc_index]) + 0.000001)) * 10
    sim_score_snippets, rel_snippet_ratio = bm25Model.bm25Sim(
        qu_doc, doc_slice, doc_index, 0.1, average_idf)
    score = integScore(sim_score_snippets, rel_snippet_ratio, 3)
    score_list.append((str(doc_index), score))

'''
#bm25Model = bm25.BM25(processed_docs,idf,3)
score_dict = {}
scores = bm25Model.get_scores_for_vn(qu_doc,average_idf)
for idx,score_item in enumerate(scores):
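The eval() call above executes whatever the line contains, which is risky if the input files are not fully trusted. ast.literal_eval from the standard library is the usual drop-in for parsing Python-literal dicts like these. A minimal sketch, using a made-up line that matches the keys (content, r, doc_no) accessed above:

import ast

# Hypothetical line format, inferred from the keys used in the loop above.
line = "{'doc_no': 'DOC-001', 'r': '1', 'content': 'some document text'}\n"

# literal_eval accepts only Python literals, so a corrupted or malicious
# line raises an error instead of executing arbitrary code.
doc_data = ast.literal_eval(line.strip())
print(doc_data['r'] != "0")  # True: counted as a relevant document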
Example 3
    documents = []

    with open("../04_groups/" + test_dict['index'], "r",
              encoding='utf-8') as target_object:
        lines = list(target_object.readlines())
        for line in lines:
            doc_data = eval(line.replace('\n', ''))

            if doc_data['r'] != "0":
                documents.append(doc_data['content'])

    preprocess = pp.PreProc(list(que_doc), documents)
    que_docs, processed_docs = preprocess.Normal()

    ma = matchTool.Match(processed_docs, idf, [], 0)
    average_idf = sum(map(lambda k: float(ma.idf[k]), ma.idf.keys())) / len(
        ma.idf.keys())

    extend_word = ma.ewGen_occ_trec(que_doc, preprocess.corpus_words,
                                    documents, 0.5, 0.3)
    #ex_words = select_words(ex_words, 2)
    #print(test_dict['index']," extend words:",ex_words)
    print("finish:", test_dict['index'])
    ex_word_dict[test_dict['index']] = extend_word

#print(ex_word_dict)

with open("../ex/extend_conf" + str(start) + "_" + str(end),
          "w",
          encoding='utf-8') as extend_object:
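ewGen_occ_trec and its two threshold arguments (0.5, 0.3) are internal to matchTool and are not shown on this page. The sketch below illustrates one common way to generate expansion words: rank corpus terms by how often they co-occur with the query terms in relevant documents. The name expansion_words and the parameters min_cooccur_ratio and top_k are invented for illustration and do not reflect matchTool's actual logic.

from collections import Counter

def expansion_words(query_terms, documents, min_cooccur_ratio=0.5, top_k=10):
    # Rank candidate terms by the fraction of query-matching documents
    # in which they appear (a simple co-occurrence heuristic).
    query = set(query_terms)
    cooccur = Counter()
    hits = 0
    for doc in documents:  # each doc is a list of tokens
        terms = set(doc)
        if terms & query:
            hits += 1
            cooccur.update(terms - query)
    if hits == 0:
        return []
    ranked = sorted(((t, c / hits) for t, c in cooccur.items()
                     if c / hits >= min_cooccur_ratio),
                    key=lambda x: -x[1])
    return [t for t, _ in ranked[:top_k]]

docs = [["bm25", "retrieval", "ranking"], ["retrieval", "evaluation"]]
print(expansion_words(["retrieval"], docs, 0.5, 5))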