def main(args):
    exp_path = args["exp_path"]
    isTraining = args["isTraining"]
    model_name = args["model_name"]

    if isTraining:
        data_path = "data/Train"
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    else:
        data_path = "data/Test"
        rel_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

    # Read data
    with open(data_path + "/qry_IDs.pkl", "rb") as f:
        qry_IDs = pickle.load(f)
    with open(data_path + "/doc_IDs.pkl", "rb") as f:
        doc_IDs = pickle.load(f)
    qry_tf = np.load(data_path + "/x_qry_tf_mdl.npy")
    doc = np.load(data_path + "/doc_mdl.npy")
    #mean = np.load(exp_path + "/mean.npy")
    #stdv = np.load(exp_path + "/stdv.npy")
    #valid_idx = np.nonzero(stdv)

    # Load model
    model = load_model(exp_path + "/" + model_name)

    # Evaluation
    evaluate_model = EvaluateModel(rel_path, isTraining)
    with tf.device('/device:GPU:0'):
        # Train
        #qry_tf[:, valid_idx] = (qry_tf[:, valid_idx] - mean[valid_idx]) / stdv[valid_idx]
        rel_mdl = model.predict(qry_tf)
        #rel_mdl = rel_mdl * stdv[-1] + mean[-1]
        qry_docs_ranking = cosineFast(rel_mdl, qry_IDs, doc, doc_IDs)
        mAP = evaluate_model.mAP(qry_docs_ranking)
    print(mAP)
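# Invocation sketch (hypothetical: the original command-line wiring is not shown in this
# excerpt; the argparse setup and defaults below are assumptions, and only the three dict
# keys read by main() come from the code above).
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Rank documents with a trained relevance model and report mAP.")
    parser.add_argument("--exp_path", required=True, help="experiment directory containing the saved model")
    parser.add_argument("--model_name", required=True, help="file name of the saved Keras model")
    parser.add_argument("--isTraining", action="store_true", help="evaluate on the training split instead of the test split")
    main(vars(parser.parse_args()))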
def __init__(self, qry_path=None, rel_path=None, isTraining=True, doc_path=None):
    # default to the training-set paths when none are given
    if qry_path is None:
        qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if doc_path is None:
        doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
    if rel_path is None:
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    self.vocab_size = 51253

    # relevance set
    self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
    self.evaluate_model = EvaluateModel(rel_path, isTraining)

    # read documents
    doc = ProcDoc.readFile(doc_path)
    self.doc = ProcDoc.docPreproc(doc)
    self.doc_len = Statistical.compLenAcc(self.doc)

    # read queries
    qry = ProcDoc.readFile(qry_path)
    self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
    self.qry_len = Statistical.compLenAcc(self.qry_tf)
    [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc, self.qry_len, self.doc_len)

    # dict to numpy
    self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
    self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
    self.doc, self.doc_IDs = self.__dict2np(self.doc)

    # L2-normalize the document vectors (precompute document lengths/norms)
    self.doc = Statistical.l2Normalize(self.doc)
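# Usage sketch (hypothetical: the enclosing class name is not shown in this excerpt, so
# "TDT2Collection" below is only a placeholder; the attribute names come from __init__
# above and cosineFast/mAP from the evaluation script earlier in this section).
#
#   data = TDT2Collection()   # defaults load the TDT2 training queries and documents
#   ranking = cosineFast(data.qry, data.qry_IDs, data.doc, data.doc_IDs)
#   print(data.evaluate_model.mAP(ranking))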
def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


input_data_process = InputDataProcess(NUM_OF_FEATS, MAX_QRY_LENGTH, MAX_DOC_LENGTH)  # , test_path)
evaluate_model = EvaluateModel("../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain", True)

# Parameters
params = {
    'input_data_process': input_data_process,
    'dim_x': MAX_QRY_LENGTH,
    'dim_y': MAX_DOC_LENGTH,
    'dim_x1': NUM_OF_FEATS,
    'batch_size': batch_size,
    'shuffle': False
}

[partition, labels, partition_answer] = input_data_process.genTrainValidSet(percent)

# Generators
training_generator = DataGenerator(**params).generate(labels,
query_model = query_unigram
Pickle.dump(query_model, open("model/query_model.pkl", "wb"), True)

# remove template files
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)

# Embedded Query Expansion
m_list = np.linspace(4, 4, num=1)
m = 1
interpolated_aplpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model(word_emb_path)
embd = EmbeddedBased(query_wordcount, collection, word2vec)
evaluate_model = EvaluateModel(relevance_path)
EQE1 = []
EQE2 = []
print("Embedded...")

# Embedding-based query expansion with fixed hyperparameters
tmp_eqe1 = embd.embedded_query_expansion_ci(0.4, 4)
tmp_eqe2 = embd.embedded_query_expansion_qi(0.4, 4)
tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
EQE1.append([ProcDoc.dict2np(tmp_eqe1), tmp_eqe1])
EQE2.append([ProcDoc.dict2np(tmp_eqe2), tmp_eqe2])
Pickle.dump(EQE1, open("model/eqe1_10.pkl", "wb"), True)
Pickle.dump(EQE2, open("model/eqe2_10.pkl", "wb"), True)
'''
EQE1 = Pickle.load(open("model/eqe1_10.pkl", "rb"))
query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram
Pickle.dump(query_model, open("model/query_model.pkl", "wb"), True)
'''
# remove template files
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)
'''
# Embedded Query Expansion
m_list = np.linspace(1, 80, num=1)
m = 1
interpolated_aplpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model()
evaluate_model = EvaluateModel("../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt")
EQE1 = []
EQE2 = []
for m in m_list:
    [tmp_eqe1, tmp_eqe2] = Embedded_based.EmbeddedQuery(query_wordcount, collection, word2vec, 1, int(m))
    tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
    tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
    EQE1.append(ProcDoc.dict2np(tmp_eqe1))
    EQE2.append(ProcDoc.dict2np(tmp_eqe2))
Pickle.dump(EQE1, open("model/eqe1_10.pkl", "wb"), True)
Pickle.dump(EQE2, open("model/eqe2_10.pkl", "wb"), True)
'''
EQE1 = Pickle.load(open("model/eqe1_10.pkl", "rb"))
query_unigram = ProcDoc.unigram(query_wordcount)
query_model = ProcDoc.modeling(query_unigram, background_model, query_lambda)
'''
for q, w_uni in query_model.items():
    if q in HMMTraingSetDict:
        continue
    else:
        query_model.pop(q, None)
print(len(query_model.keys()))
'''
# query process
print("query ...")
evaluate_model = EvaluateModel(rel_path)
query_docs_point_fb = {}
query_model_fb = {}
mAP_list = []
for step in xrange(1, 15):
    query_docs_dict = {}
    AP = 0
    mAP = 0
    for q_key, q_word_prob in query_model.items():
        print(step - 1, end='\r')
        docs_point = {}
        for doc_key, doc_words_prob in doc_unigram.items():
            point = 0
            # calculate each query word's contribution to the document score
            for query_word, query_prob in q_word_prob.items():
TraingSet_path = "../Corpus/ResultsTrainSet/ResultsTrainSet.txt"
TraingSetDict = defaultdict(list)
with open(TraingSet_path, 'r') as file:
    # read the ranked results (query title, retrieved documents)
    title = ""
    for line in file.readlines():
        result = line.split()
        if len(result) == 0:
            continue
        if len(result) > 2:
            title = result[2]
            continue
        TraingSetDict[title].append(result[0])

eval = EvaluateModel("../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt", False)
assess = eval.getAss()
P_R_table = np.zeros(11)
for q_key, results in TraingSetDict.items():
    p_max = 0
    num_correct = 0.
    start_recall = 0
    recall_acc = 0.
    for pos, doc_name in enumerate(results):
        t_pos = pos + 1
        if doc_name in assess[q_key]:
            num_correct += 1.
            precision = num_correct / t_pos
            recall_acc = num_correct / len(assess[q_key])
            if precision > p_max:
                p_max = precision