# a list of training configurations (feature name only for each configuration #train_cfgs=[("MF-e.MF-i.bm25_scores.indri_scores",data_cfg.trn_list_fn,"train")] #train_cfgs=[("bm25_scores.indri_scores",data_cfg.trn_list_fn,"train"), # ("MF-e.MF-i", data_cfg.trn_list_fn, "train")] ''' train_cfgs=[("bm25_scores",data_cfg.all_list_fn,"train"), ("indri_scores",data_cfg.all_list_fn,"train"), ("MF-e",data_cfg.all_list_fn,"train"), ("MF-i",data_cfg.all_list_fn,"train")] ''' train_cfgs = [("embed_sim", data_cfg.trn_list_fn, "train")] model_type = "gnb" cand_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn) def load_feat_and_labels(qid_list, cfg): feat_name = cfg[0] feat_root_path = os.path.join(data_cfg.feat_root_path, feat_name) ret_feats = [] ret_labels = [] for qid in qid_list: print "Loading feature: ", qid feat_fn = os.path.join(feat_root_path, str(qid) + ".npz") if not os.path.exists(feat_fn): print "Warning: feature not exist:", qid continue gt_token = data_utils.load_quaser_gt_by_id(qid)
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import numpy as np

# English stop words, built once at import time for reuse across workers.
stop_words = set(stopwords.words('english'))
# Candidate answer strings, one per line; model predictions index into this.
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")
# Pre-trained linear-SVM classifier loaded from pickle at import time so it
# is available inside multiprocessing workers without re-loading per call.
model = gen_utils.read_dict_from_pkl(
    "models/MF-e.MF-i.bm25_scores.indri_scores/train/linear_svm/model.pkl")


def multi_process_helper(args):
    """Worker entry point: predict a candidate label for each item in a batch.

    args[0] is a list of dicts, each carrying a precomputed 'final_feat'
    feature vector and a 'question' metadata entry. For every item the
    feature is reshaped to a (1, n_features) batch, classified by the
    module-level model, and the predicted index is mapped to a string via
    candidate_list.

    NOTE(review): the loop body is truncated at this chunk boundary --
    ret_list is never appended to or returned within the visible portion;
    presumably each final_meta is collected into ret_list and returned.
    """
    q_and_context_list = args[0]
    ret_list = []
    print "In prediction: ", len(q_and_context_list), type(q_and_context_list)
    # Progress counter only ('id' shadows the builtin, but is local).
    id = 0
    for q_and_context in q_and_context_list:
        id += 1
        print "In prediction, ind ", id, "/", len(q_and_context_list)
        # Wrap the single feature vector into a batch of one, as expected by
        # scikit-learn-style predict().
        feat = np.asarray(q_and_context['final_feat'])
        feat = np.expand_dims(feat, axis=0)
        # predict() returns class indices; map the first (only) prediction
        # back to the candidate string.
        pred_label = candidate_list[model.predict(feat)[0]]
        final_meta = {}
        final_meta['pred'] = pred_label
        final_meta['q_meta'] = q_and_context['question']