print("Vector-Space-Model") # read relevant set for queries and documents eval_mdl = Evaluate.EvaluateModel(rel_path, is_training) rel_set = eval_mdl.getAset() # Preprocess for queries and documents qry_file = ProcDoc.readFile(qry_path) doc_file = ProcDoc.readFile(doc_path) # Term Frequency qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set) doc_mdl_dict = ProcDoc.docPreproc(doc_file) # Convert dictionary to numpy array (feasible to compute) qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict) doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict) # TF-IDF print("TF-IDF") [qry_mdl_np, doc_mdl_np] = Statistical.TFIDF(qry_mdl_np_, doc_mdl_np_, {"qry":[3, 3], "doc": [3, 3]}) # Cosine Similarity # L2-normalize qry_mdl_np = Statistical.l2Norm(qry_mdl_np) doc_mdl_np = Statistical.l2Norm(doc_mdl_np) def retrieval(qry_mdl, doc_mdl): print("Retrieval") ranking = -np.dot(qry_mdl, doc_mdl.T) results = np.argsort(ranking, axis=1)
rel_set = eval_mdl.getAset()

# Interpolation weights for smoothing with the collection background model.
alpha = 0.8  # background weight for document models
beta = 0.4   # background weight for query models

qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# Maximum-likelihood unigram language models.
qry_unimdl_dict = ProcDoc.unigram(qry_mdl_dict)
doc_unimdl_dict = ProcDoc.unigram(doc_mdl_dict)

# origin query model
qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict)
# refine query model (starts from the same unigram statistics; presumably
# refined further by later code outside this chunk — TODO confirm)
ref_qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict)
doc_mdl_np, doc_IDs = ProcDoc.dict2npSparse(doc_unimdl_dict)

# Neural-model prediction over the (pre-smoothing) query models.
NRM_mdl_np = nn_model.predict(nn_method, qry_mdl_np)
bg_mdl_np = ProcDoc.readBGnp(bg_path)

# Smoothing: interpolate each document model with the background model.
# Vectorized via broadcasting — replaces the original per-row Python loop;
# `[:]` keeps the update in place so existing references stay valid.
doc_mdl_np[:] = (1 - alpha) * doc_mdl_np + alpha * bg_mdl_np

# Smoothing: interpolate each query model with the background model.
qry_mdl_np[:] = (1 - beta) * qry_mdl_np + beta * bg_mdl_np