# ---------------------------------------------------------------------------
# Vector-Space-Model: configuration, corpus loading, and TF matrices.
# ---------------------------------------------------------------------------

# Run-mode flags from the parsed CLI arguments.
is_training = args.is_training
is_short = args.is_short
is_spoken = args.is_spoken

# Resolve all corpus-specific file locations for this configuration.
path = CommonPath(is_training, is_short, is_spoken)
log_filename = path.getLogFilename()
qry_path = path.getQryPath()
doc_path = path.getDocPath()
rel_path = path.getRelPath()
dict_path = path.getDictPath()
bg_path = path.getBGPath()

print("Vector-Space-Model")

# Relevance judgements (answer set) used both for evaluation and to
# restrict query preprocessing.
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()

# Raw query and document text.
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Per-query / per-document term-frequency dictionaries.
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# Sparse numpy matrices (plus ID orderings) so the model math is vectorized.
qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict)
doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict)
# TF-IDF
def ID2Word(proc_dict, ID_map, score_dict):
    """Replace token IDs with words in-place and gather attention scores.

    Mutates ``proc_dict``: each ID in every value list is overwritten with
    ``ID_map[ID]``.  For each non-(-1) ID, its score from ``score_dict`` is
    repeated once per element of the mapped word and collected per key.

    Returns the mutated ``proc_dict`` and the per-key score lists.
    """
    att_dict = defaultdict(list)
    for key, id_seq in proc_dict.items():
        for pos, ID in enumerate(id_seq):
            # NOTE(review): the ID -> word substitution runs even when
            # ID == -1 (only the scoring is skipped) — confirm that
            # ID_map[-1] is a meaningful entry and not an accident.
            id_seq[pos] = ID_map[ID]
            if ID == -1:
                continue
            score = score_dict[key][ID]
            # One copy of the score per element of the mapped word.
            att_dict[key].extend([score] * len(ID_map[ID]))
    return proc_dict, att_dict


# Relevance judgements; evaluation here is always in non-training mode.
eval_mdl = Evaluate.EvaluateModel(rel_path, False)
rel_set = eval_mdl.getAset()

# Raw query and document text.
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Preprocess while preserving positional information (flags True, True).
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set, True, True)
doc_mdl_dict = ProcDoc.docPreproc(doc_file, True, True)

# Plain bag-of-words counts (no positional information).
qry_bow_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_bow_dict = ProcDoc.docPreproc(doc_file)
# unigram