def evaluate(self, query, scaling_factor):
    """Cross-validate the ranking model on the papers retrieved for a query.

    The papers for ``query`` are loaded from an on-disk pickle cache under
    ``tmp/`` (keyed by the MD5 of the query plus ``self.num_of_papers``), or
    fetched through ``PaperProcessor`` and cached on a miss.  The papers are
    then split with a shuffled 5-fold ``KFold``; in every fold each training
    paper gets a synthetic ``"Score"`` derived from its ``"Ranking"``,
    ``self.model(train_set, test_set, query)`` is called, and the test papers
    are ordered by descending ``"Score"`` and compared with their order by
    ascending ``"Ranking"``.

    Per fold the error is ``sum(|correct_idx - predicted_idx| / n) / n`` with
    ``n`` the size of the test fold; fold errors are averaged over the folds
    and then over the trials.

    Side effects: the paper dicts are mutated in place (``"Score"`` on
    training papers, ``"Correct_Ranking"`` on test papers) and the cache file
    may be created.

    Args:
        query: Query string; concatenated with ``str(self.num_of_papers)``
            to build the cache key and passed on to ``PaperProcessor`` and
            ``self.model``.
        scaling_factor: Multiplier applied to every training score before
            the optional noise is added.

    Returns:
        A ``(average_difference, number_of_papers)`` tuple.
    """
    n_folds = 5   # used both for the split and for averaging the fold errors
    n_trials = 1

    digest = hashlib.md5(
        (query + str(self.num_of_papers)).encode()).hexdigest()
    filename = "tmp/" + digest

    if os.path.exists(filename):
        # NOTE: pickle.load executes arbitrary code from the file; the cache
        # directory must only ever contain files written by this method.
        with open(filename, 'rb') as f:
            pubmed = pickle.load(f)
    else:
        # Make sure the cache directory exists *before* the expensive fetch,
        # so a missing "tmp/" cannot throw the freshly downloaded data away.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        pubmed = PaperProcessor(query,
                                num_of_documents=self.num_of_papers,
                                postprocessing=False)
        pubmed.add_missing_info()
        with open(filename, 'wb') as f:
            pickle.dump(pubmed, f)

    # Object array of the paper records so that folds can be selected with
    # the index arrays produced by KFold.
    papers = np.array(list(pubmed.papers.values()))
    n_papers = len(papers)
    logging.info("Got {} papers for query \"{}\", scaling_factor={}".format(len(papers), query, scaling_factor))

    all_time_result = 0
    for _ in range(n_trials):
        overall_diff = 0
        # NOTE(review): this is the legacy KFold(n, n_folds, ...) call style
        # where the object itself is iterated -- confirm against the
        # scikit-learn version imported by this file.
        kf = KFold(n_papers, n_folds, shuffle=True)
        for train, test in kf:
            train_set = papers[train]
            test_set = papers[test]

            for train_sample in train_set:
                # The lower the "Ranking" value, the higher the base score.
                score = n_papers - train_sample["Ranking"] + 1
                score = 1 if score == 0 else abs(score)
                # Gaussian noise, standard deviation: score / 5.
                # Drawn unconditionally (as before) so the RNG stream is
                # consumed the same way whether or not noise is enabled.
                noise = np.random.normal(0, score / 5)
                score *= scaling_factor
                if self.noise:
                    score += noise
                train_sample["Score"] = score

            # Presumably fills in "Score" on every test sample; it is read
            # below -- verify against the model implementation.
            self.model(train_set, test_set, query)

            test_set = list(test_set)
            test_size = len(test_set)

            # Ground truth: position of each test paper when ordered by its
            # real ranking.
            test_set.sort(key=lambda x: x["Ranking"])
            for idx, test_sample in enumerate(test_set):
                test_sample["Correct_Ranking"] = idx

            # Prediction: descending score.  Sort-then-reverse is kept on
            # purpose; sort(reverse=True) would order ties differently.
            test_set.sort(key=lambda x: x["Score"])
            test_set.reverse()

            total_diff = 0
            for idx, test_sample in enumerate(test_set):
                total_diff += abs(test_sample["Correct_Ranking"] - idx) / test_size
            ave_diff = total_diff / test_size
            overall_diff += ave_diff / n_folds

        all_time_result += overall_diff / n_trials

    return all_time_result, len(papers)