import csv
import re
import time

# Project-local imports assumed available from the surrounding package:
# get_papers_from_pubmed, RankingAI, NewEvaluation, PaperProcessor


def run_one_query(query, num, models, ai=True, noise=False, rank=RankingAI.rank):
    papers = get_papers_from_pubmed(query, num)
    # papers = list(PubMedFetcher(query, num_of_documents=10).papers.values())
    PaperProcessor.add_journal_if(papers)

    # Produce the reference ranking: either the AI ranker or a passthrough baseline.
    if ai:
        rank(papers, query)
    else:
        RankingAI.passthrough(papers)
    # CSVHandler.write_list_to_csv("test.csv", papers, additional_fields=["ReferenceRank"])

    # With noise enabled, sweep several scaling factors; otherwise evaluate once.
    scaling_factors = [1, 0.1, 0.01, 0.001] if noise else [1]

    results = [[] for _ in models]
    times = [0 for _ in models]
    for idx, model in enumerate(models):
        start = time.time()
        eva = NewEvaluation(papers, query, model=model,
                            scaling_factors=scaling_factors, noise=noise)
        times[idx] = time.time() - start
        results[idx].extend(eva.result)

    # One CSV per (query, corpus size): model number, docstring, then the results.
    with open('tmp/{}{}.csv'.format(query, len(papers)), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Model No.', 'Description', 'Result'])
        for idx, result in enumerate(results):
            model_no = re.search(r'(\d+)$', models[idx].__name__).group(0)
            writer.writerow([model_no, models[idx].__doc__] + result)
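# Usage sketch (hedged): everything below is illustrative, not part of the
# module. `dummy_model_1` is a hypothetical stand-in showing the interface
# run_one_query expects from each entry in `models`: a callable taking
# (train_set, test_set, query) -- the signature NewEvaluation invokes -- whose
# __name__ ends in digits (parsed into the "Model No." CSV column) and whose
# __doc__ fills the "Description" column.
def dummy_model_1(train_set, test_set, query):
    """Baseline: assign every test paper the mean training score."""
    mean_score = sum(p["Score"] for p in train_set) / max(len(train_set), 1)
    for paper in test_set:
        paper["Score"] = mean_score

# run_one_query("breast cancer", 100, [dummy_model_1], ai=True, noise=False)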
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import logging

import pubmed
from papers_csv_handler import CSVHandler
from paper_processor import PaperProcessor

logging.getLogger().setLevel(logging.DEBUG)

# Fetch the full result set for the query, attach journal impact factors,
# and dump everything to CSV.
papers = list(pubmed.PubMedFetcher("homo sapiens", num_of_documents=2116).papers.values())
PaperProcessor.add_journal_if(papers)
CSVHandler.write_list_to_csv("test.csv", papers)
import hashlib
import logging
import os
import pickle

import numpy as np
from sklearn.model_selection import KFold

from paper_processor import PaperProcessor


def evaluate(self, query, scaling_factor):
    # Cache the fetched corpus on disk, keyed by MD5 of query + corpus size.
    m = hashlib.md5(str.encode(query + str(self.num_of_papers)))
    filename = "tmp/" + m.hexdigest()
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            pubmed = pickle.load(f)
    else:
        pubmed = PaperProcessor(query, num_of_documents=self.num_of_papers,
                                postprocessing=False)
        pubmed.add_missing_info()
        with open(filename, 'wb') as f:
            pickle.dump(pubmed, f)

    papers = np.array(list(pubmed.papers.values()))
    logging.info('Got {} papers for query "{}", scaling_factor={}'.format(
        len(papers), query, scaling_factor))

    all_time_result = 0
    times = 1
    for i in range(1, times + 1):
        overall_diff = 0
        # 5-fold cross-validation (modern sklearn API; the original used the
        # pre-0.18 KFold(n, n_folds) form, iterated directly).
        kf = KFold(n_splits=5, shuffle=True)
        for train, test in kf.split(papers):
            train_set = papers[train]
            test_set = papers[test]

            # Turn the reference ranking into a score (best-ranked paper gets
            # the highest score), optionally perturbed by Gaussian noise with
            # standard deviation score / 5, then scaled.
            for train_sample in train_set:
                score = len(papers) - train_sample["Ranking"] + 1
                score = 1 if score == 0 else abs(score)
                noise = np.random.normal(0, score / 5)
                score *= scaling_factor
                if self.noise:
                    score += noise
                train_sample["Score"] = score

            self.model(train_set, test_set, query)

            # Ground truth: each test paper's position under the reference ranking.
            test_set = list(test_set)
            test_size = len(test_set)
            test_set.sort(key=lambda x: x["Ranking"])
            for idx, test_sample in enumerate(test_set):
                test_sample["Correct_Ranking"] = idx

            # Predicted order: sort by model score, descending, and accumulate
            # the normalized rank displacement.
            total_diff = 0
            test_set.sort(key=lambda x: x["Score"], reverse=True)
            for idx, test_sample in enumerate(test_set):
                total_diff += abs(test_sample["Correct_Ranking"] - idx) / test_size
            ave_diff = total_diff / test_size
            overall_diff += ave_diff / 5  # average over the 5 folds

        all_time_result += overall_diff / times  # average over repeated trials

    return all_time_result, len(papers)
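# The inner loop above computes a normalized mean rank displacement: each test
# fold is sorted by predicted Score (descending), each paper's position is
# compared with its position under the reference Ranking, and the sum of
# absolute displacements is divided by the fold size twice. A minimal
# self-contained sketch of that metric (an illustration, not part of the
# module), assuming papers are dicts with "Ranking" and "Score" keys as in
# evaluate():
def mean_rank_displacement(fold):
    by_truth = sorted(fold, key=lambda x: x["Ranking"])
    correct = {id(p): idx for idx, p in enumerate(by_truth)}
    by_score = sorted(fold, key=lambda x: x["Score"], reverse=True)
    n = len(fold)
    return sum(abs(correct[id(p)] - idx) for idx, p in enumerate(by_score)) / n ** 2

# A fold predicted exactly backwards gives the worst case, e.g.:
# mean_rank_displacement([{"Ranking": i, "Score": i} for i in range(4)])  # -> 0.5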