Code Example #1
import copy

import ProcDoc


def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount,
             general_model, background_model, topN):
    # Interpolation weights; lambda_q + lambda_fb + lambda_bg = 1.0
    lambda_bg = 0.1
    lambda_fb = 0.8
    lambda_ir_fb = 0.2
    lambda_q = 0.1
    specific_model = {}
    for q_key, docs_point_list in query_docs_point_dict.items():
        feedback_doc = {}
        feedback_doc_wc = {}
        # Extract the top-N (pseudo-)relevant feedback documents
        for doc_name, point in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])
        # generate specific model
        specific_model = specific_modeling(dict(feedback_doc))
        # generate significant model
        significant_model = significant_modeling(general_model, specific_model,
                                                 feedback_doc, feedback_doc_wc)
        '''
        ir_feedback_doc = {}
        ir_feedback_doc_wc = {}
        # Extract the bottom-N irrelevant feedback documents
        for doc_name, point in docs_point_list[len(docs_point_list)-topN:]:
            ir_feedback_doc[doc_name] = doc_unigram[doc_name]
            ir_feedback_doc_wc[doc_name] = doc_wordcount[doc_name]
        # generate specific model
        ir_specific_model = specific_modeling(dict(ir_feedback_doc))
        # generate significant model
        ir_significant_model = significant_modeling(general_model, ir_specific_model,
                                                    ir_feedback_doc, ir_feedback_doc_wc)
        '''
        for word, fb_w_prob in significant_model.items():
            original_prob = query_model[q_key].get(word, 0.0)
            # update query unigram: mix original, feedback, and background
            query_model[q_key][word] = (lambda_q * original_prob) + (
                lambda_fb * fb_w_prob) + (lambda_bg * background_model[word])
        '''
        for word, ir_fb_w_prob in ir_significant_model.items():
            if word in query_model[q_key]:
                query_model[q_key][word] = (1 - lambda_ir_fb) * query_model[q_key][word] + lambda_ir_fb * ir_fb_w_prob
        '''
        query_model[q_key] = ProcDoc.softmax(dict(query_model[q_key]))
    query_model, query_IDs = ProcDoc.dict2np(query_model)
    # plot_diagram.plotModel(general_model, specific_model, significant_model, feedback_doc_wc, feedback_doc)

    return [query_model, query_IDs]
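
The update inside feedback() is a three-way linear interpolation: each word's
new query probability mixes the original query model, the significant
(feedback) model, and the background model, with weights
lambda_q + lambda_fb + lambda_bg = 0.1 + 0.8 + 0.1 = 1.0. A minimal
self-contained sketch of that mixture, using made-up toy distributions in
place of the real models:

# Toy illustration of the query-update mixture in feedback() above.
# The three distributions are invented for demonstration only.
lambda_q, lambda_fb, lambda_bg = 0.1, 0.8, 0.1

query_model = {"taiwan": 0.5, "election": 0.5}
significant_model = {"election": 0.6, "vote": 0.4}
background_model = {"taiwan": 0.2, "election": 0.1, "vote": 0.05}

for word, fb_w_prob in significant_model.items():
    original_prob = query_model.get(word, 0.0)
    query_model[word] = (lambda_q * original_prob
                         + lambda_fb * fb_w_prob
                         + lambda_bg * background_model[word])

# "election": 0.1*0.5 + 0.8*0.6 + 0.1*0.1  = 0.54
# "vote":     0.1*0.0 + 0.8*0.4 + 0.1*0.05 = 0.325
print(query_model)

Note that words present only in the original query (here "taiwan") keep their
old probability; feedback() then renormalizes the whole model with
ProcDoc.softmax.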
Code Example #2
import ProcDoc

query_lambda = 0
doc_lambda = 0.9
#remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl", "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]
remove_list = []

document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
word_emb_path = "data/word2vec_dict.pkl"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)
# background_model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing: linear interpolation with the background model
for doc_idx in range(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (
        1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
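
Both this example and the next cut off inside the loop that pools word counts
into collection. A plausible completion, assuming the loop builds a
maximum-likelihood "general model" over the whole corpus; this is an
illustrative reconstruction, not the repository's actual code:

# Hypothetical completion, with toy counts standing in for the
# ProcDoc.doc_preprocess output.
doc_wordcount = {"doc1": {"taiwan": 3, "election": 1},
                 "doc2": {"election": 2, "vote": 4}}

collection = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        collection[word] = collection.get(word, 0) + count

# Normalize pooled counts into a unigram distribution.
total_count = float(sum(collection.values()))
general_model = {w: c / total_count for w, c in collection.items()}
print(general_model)  # {'taiwan': 0.3, 'election': 0.3, 'vote': 0.4}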
Code Example #3
import ProcDoc

query_lambda = 0.36
doc_lambda = 0.82
remove_list = [
    "update_embedded_query_expansion_ci.pkl",
    "update_embedded_query_expansion_qi.pkl", "collection_embedded.pkl",
    "query_embedded.pkl", "collection_total_similarity.pkl"
]

document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW"

# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)
# background_model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing: linear interpolation with the background model
# (a vectorized numpy sketch of this step follows the example)
for doc_idx in range(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (
        1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
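
The document-smoothing loop shared by the last two examples is a linear
interpolation between each document's unigram model and the corpus-wide
background model. A minimal numpy sketch of the same step, vectorized over
all documents at once; the matrix shapes and values here are assumptions for
illustration, not the repository's data:

import numpy as np

doc_lambda = 0.82

# Toy stand-ins: 3 documents over a 4-word vocabulary.
doc_mdl = np.array([[0.7, 0.3, 0.0, 0.0],
                    [0.0, 0.5, 0.5, 0.0],
                    [0.25, 0.25, 0.25, 0.25]])
background_model_np = np.array([0.4, 0.3, 0.2, 0.1])

# Broadcasting applies the background model to every document row at once,
# replacing the explicit per-row loop.
doc_mdl = (1 - doc_lambda) * doc_mdl + doc_lambda * background_model_np

print(doc_mdl.sum(axis=1))  # each row still sums to 1.0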