def calculate(pred_relevance, split_idx):
    rel_query_model = pred_relevance
    with open("query_model.pkl", "rb") as file:
        query_model = Pickle.load(file)[:split_idx]
    with open("query_list.pkl", "rb") as file:
        query_list = Pickle.load(file)[:split_idx]
    with open("doc_model.pkl", "rb") as file:
        doc_model = Pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        doc_list = Pickle.load(file)
    background_model = ProcDoc.read_background_dict()
    qry_eval = evaluate.evaluate_model(True)
    ''' document smoothing: interpolate each document model with the background model '''
    for doc_idx in range(doc_model.shape[0]):
        doc_vec = doc_model[doc_idx]
        doc_model[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model
    mAP_list = []
    # work in log space so the dot product below is a query-likelihood score
    doc_model = np.log(doc_model)
    for rel_qry_lambda in np.linspace(0., 1., num=11):
        ''' query smoothing: interpolate the original query model with the predicted relevance model '''
        with open("query_model.pkl", "rb") as file:
            query_model = Pickle.load(file)[:split_idx]
        X = T.matrix()
        Y = (1 - rel_qry_lambda) * X + rel_qry_lambda * rel_query_model
        f = theano.function([X], Y)
        query_model = f(query_model)
        # rank all documents for every query in one matrix product
        result = np.argsort(-np.dot(query_model, doc_model.T), axis=1)
        query_docs_ranking = {}
        for q_idx in range(len(query_list)):
            docs_ranking = []
            for doc_idx in result[q_idx]:
                docs_ranking.append(doc_list[doc_idx])
            query_docs_ranking[query_list[q_idx]] = docs_ranking
        mAP = qry_eval.mean_average_precision(query_docs_ranking)
        mAP_list.append(mAP)
    return max(mAP_list)
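# Usage sketch (not from the original source): `calculate` expects a predicted
# relevance model broadcastable against the (split_idx, V) query matrix. The
# uniform array below is only a hypothetical placeholder for the output of a
# trained relevance estimator; 51253 is the vocabulary size used elsewhere in
# this repo.
if __name__ == "__main__":
    pred_relevance = np.full((1, 51253), 1.0 / 51253, dtype="float32")
    print calculate(pred_relevance, split_idx=16)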
def __init__(self, query_model):
    self.query_model = copy.deepcopy(query_model)
    self.vocabulary_size = 51253
    smoothing = 0.1
    with open("test_query_list.pkl", "rb") as file:
        self.query_list = pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        self.doc_list = pickle.load(file)
    with open("doc_model.pkl", "rb") as file:
        doc_model = pickle.load(file)
    self.background_model = ProcDoc.read_background_dict()
    ''' smoothing: interpolate each document model with the background model '''
    for d_idx, doc_vec in enumerate(doc_model):
        doc_model[d_idx] = (1 - smoothing) * doc_vec + smoothing * self.background_model
    self.doc_model = doc_model
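# Design note (a sketch assuming doc_model is a dense NumPy array of shape
# (num_docs, vocabulary_size) and the background model a length-V vector;
# `smooth_documents` is a hypothetical helper name, not from the original):
# the per-row loop above is one broadcasted Jelinek-Mercer interpolation,
# which NumPy can do in a single expression.
def smooth_documents(doc_model, background_model, smoothing=0.1):
    # mix each document model with the collection-wide background model
    return (1.0 - smoothing) * doc_model + smoothing * background_model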
def __init__(self, query_model, isSpoken=False):
    smoothing = 0.0
    with open("test_query_list.pkl", "rb") as file:
        self.query_list = pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        self.doc_list = pickle.load(file)
    # load the word-count document models (spoken or written version)
    if isSpoken:
        with open("doc_model_wc_s.pkl", "rb") as file:
            doc_model = pickle.load(file)
    else:
        with open("doc_model_wc.pkl", "rb") as file:
            doc_model = pickle.load(file)
    self.query_model = copy.deepcopy(query_model)
    self.vocabulary_size = 51253
    self.doc_model = copy.deepcopy(doc_model)
    self.background_model = ProcDoc.read_background_dict()
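# Usage sketch (hypothetical: `Evaluator` stands in for whichever class owns
# this __init__, and the pickle filename mirrors the loads above; both names
# are assumptions, not the original code).
if __name__ == "__main__":
    with open("test_query_model.pkl", "rb") as file:
        query_model = pickle.load(file)
    evaluator = Evaluator(query_model, isSpoken=True)  # use the spoken-document models
    print len(evaluator.doc_list), "documents loaded"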
doc_lambda = 0.9
#remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl", "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]
remove_list = []
document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
word_emb_path = "data/word2vec_dict.pkl"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)

# background model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()

# document smoothing
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model_np

# general model: aggregate word counts across the whole collection
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count
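# Sketch (assumption: the collection counts above are eventually normalized
# into a collection-level unigram model; this shows the standard normalization,
# not necessarily the original script's exact continuation).
def collection_unigram(collection):
    # turn raw collection word counts into P(w | collection)
    total = float(sum(collection.values()))
    return {word: count / total for word, count in collection.items()}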
print 'Building a model whose optimizer=adadelta, activation function=softmax'
filepath = "Epochs/with_gamma_single_TF_Spk"
filename = []
emb_model = load_model("NN_Model/RLE_SWLM_E_S.h5")
with open("model/UM/test_query_model_short.pkl", "rb") as file:
    query_model = Pickle.load(file)
with open("model/UM/test_query_list_short.pkl", "rb") as file:
    query_list = Pickle.load(file)
with open("model/test_query_model_short.pkl", "rb") as file:
    query_TF = Pickle.load(file)
with open("model/log_doc_model_s.pkl", "rb") as file:
    doc_model = Pickle.load(file)
with open("model/doc_list_s.pkl", "rb") as file:
    doc_list = Pickle.load(file)
bg_md = ProcDoc.read_background_dict()
# interpolate the original query model with the embedding model's prediction,
# then rescale each query by its term-frequency mass
query_model = 0.5 * query_model + 0.5 * np.array(emb_model.predict_on_batch(query_model))
query_model *= query_TF.sum(axis=1).reshape(16, 1) * 10
max_mAP = ["", 0]
evl = evaluate.evaluate_model(False)
for dir_item in os.listdir(filepath):
    # join dir path and file name
    dir_item_path = os.path.join(filepath, dir_item)
    # check whether it is a regular file before reading
    if os.path.isfile(dir_item_path):
        # NOTE: this overrides the directory walk and pins a single checkpoint
        dir_item_path = "Epochs/with_gamma_single_TF_Spk/54_log_gamma_shuffle_spk_weights-30-0.11.hdf5"
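        # Sketch of how the checkpoint sweep typically continues (assumptions:
        # the .hdf5 files are weight checkpoints loadable into emb_model,
        # ranking reuses the dot-product scheme from the rest of this repo, and
        # evl exposes mean_average_precision as it does elsewhere here; this is
        # not necessarily the original continuation of the script).
        emb_model.load_weights(dir_item_path)
        smoothed_qry = np.array(emb_model.predict_on_batch(query_model))
        result = np.argsort(-np.dot(smoothed_qry, doc_model.T), axis=1)
        query_docs_ranking = {}
        for q_idx in xrange(len(query_list)):
            query_docs_ranking[query_list[q_idx]] = [doc_list[d] for d in result[q_idx]]
        mAP = evl.mean_average_precision(query_docs_ranking)
        # keep the best-scoring checkpoint seen so far
        if mAP > max_mAP[1]:
            max_mAP = [dir_item_path, mAP]
print max_mAP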