Example #1
import csv
import re
import time

# RankingAI, PaperProcessor, NewEvaluation and get_papers_from_pubmed are
# provided elsewhere in the project.
def run_one_query(query, num, models, ai=True, noise=False, rank=RankingAI.rank):
    papers = get_papers_from_pubmed(query, num)
    # papers = list(PubMedFetcher(query, num_of_documents=10).papers.values())
    PaperProcessor.add_journal_if(papers)
    if ai:
        rank(papers, query)
    else:
        RankingAI.passthrough(papers)
    # CSVHandler.write_list_to_csv("test.csv", papers, additional_fields=["ReferenceRank"])
    if noise:
        scaling_factors = [1, 0.1, 0.01, 0.001]
    else:
        scaling_factors = [1]

    # One result list and one timing entry per model.
    results = [[] for _ in range(len(models))]
    times = [0 for _ in range(len(models))]
    for idx, model in enumerate(models):
        t = time.time()
        eva = NewEvaluation(papers, query, model=model, scaling_factors=scaling_factors, noise=noise)
        time_passed = time.time() - t
        times[idx] = time_passed
        results[idx].extend(eva.result)
    
    with open('tmp/{}{}.csv'.format(query, len(papers)), 'w', newline='') as csvfile:
        fieldnames = ['Model No.', 'Description', 'Result']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        for idx, result in enumerate(results):
            # Model number = trailing digits of the model's __name__, description = its docstring.
            row = [re.search(r'(\d+)$', models[idx].__name__).group(0), models[idx].__doc__] + result
            writer.writerow(row)
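
A minimal usage sketch, assuming the project modules above are importable and that each model is a callable(train_set, test_set, query) that writes a "Score" onto the test papers (the interface implied by Example #3); model_1 and the query string are hypothetical:

import os

os.makedirs("tmp", exist_ok=True)  # run_one_query writes its CSV into tmp/

def model_1(train_set, test_set, query):
    """Toy baseline 1: give every test paper the mean training score."""
    mean_score = sum(p["Score"] for p in train_set) / len(train_set)
    for p in test_set:
        p["Score"] = mean_score

# Fetch and rank 100 PubMed results for the query, then evaluate the single toy model.
run_one_query("crispr cas9", 100, [model_1])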
Example #2
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pubmed
from papers_csv_handler import CSVHandler
from paper_processor import PaperProcessor
import logging
from IPython import embed

logging.getLogger().setLevel(logging.DEBUG)
papers = list(pubmed.PubMedFetcher("homo sapiens", num_of_documents=2116).papers.values())
PaperProcessor.add_journal_if(papers)
CSVHandler.write_list_to_csv("test.csv", papers)
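
Once test.csv has been written, it can be inspected with the standard library alone; a quick sketch, assuming CSVHandler writes a header row (the exact columns are defined by the project):

import csv

with open("test.csv", newline="") as f:
    reader = csv.DictReader(f)
    print("Columns:", reader.fieldnames)
    for i, row in enumerate(reader):
        if i >= 3:  # peek at the first few rows only
            break
        print(row)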
Example #3
    def evaluate(self, query, scaling_factor):
        """Cross-validate self.model on the papers fetched for `query` and
        return (average normalized ranking error, number of papers)."""
        # Cache the fetched papers on disk, keyed by an MD5 of query + paper count.
        m = hashlib.md5((query + str(self.num_of_papers)).encode())
        digest = m.hexdigest()
        filename = "tmp/" + digest
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                pubmed = pickle.load(f)
        else:
            pubmed = PaperProcessor(query, num_of_documents=self.num_of_papers, postprocessing=False)
            pubmed.add_missing_info()
            with open(filename, 'wb') as f:
                pickle.dump(pubmed, f)

        # pubmed.add_missing_info()

        # c = 0
        # for value in pubmed.papers.values():
        #     if value['Journal_IF'] == 0:
        #         c += 1
        #
        # print(c)

        papers = np.array(list(pubmed.papers.values()))
        logging.info("Got {} papers for query \"{}\", scaling_factor={}".format(len(papers), query, scaling_factor))

        all_time_result = 0

        times = 1
        for i in range(1, times+1):
            overall_diff = 0
            # 5-fold cross-validation over the papers.
            kf = KFold(n_splits=5, shuffle=True)
            for train, test in kf.split(papers):
                train_set = papers[train]
                test_set = papers[test]

                for train_sample in train_set:
                    # Higher-ranked (lower "Ranking") papers get a higher target score.
                    score = len(papers) - train_sample["Ranking"] + 1
                    score = 1 if score == 0 else abs(score)
                    # Gaussian noise with standard deviation score/5.
                    noise = np.random.normal(0, score / 5)

                    score *= scaling_factor
                    if self.noise:
                        score += noise

                    train_sample["Score"] = score

                # for test_sample in test_set:
                #     test_sample["Score"] = np.random.rand()

                # The model is expected to assign a "Score" to every test sample.
                self.model(train_set, test_set, query)

                # Ground-truth positions: sort by the reference ranking.
                test_set = list(test_set)
                test_size = len(test_set)
                test_set.sort(key=lambda x: x["Ranking"])
                for idx, test_sample in enumerate(test_set):
                    test_sample["Correct_Ranking"] = idx

                # Predicted positions: sort by the model's score, best first,
                # and accumulate the normalized rank displacement.
                total_diff = 0
                test_set.sort(key=lambda x: x["Score"])
                test_set.reverse()
                # logging.info("Test size: {}".format(test_size))
                for idx, test_sample in enumerate(test_set):
                    total_diff += abs(test_sample["Correct_Ranking"] - idx) / test_size
                ave_diff = total_diff / test_size
                overall_diff += ave_diff / 5  # average over the 5 folds

            all_time_result += overall_diff / times

            # logging.info("{}th trial, Average Difference: {}".format(i, overall_diff))

        # logging.info("For all trials, Average Difference: {}".format(all_time_result))
        return all_time_result, len(papers)
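
The per-fold error above is the mean normalized displacement between a paper's predicted position and its correct position. As a standalone illustration (not part of the original class), the same arithmetic can be written as a small helper:

import numpy as np

def mean_rank_displacement(correct_ranking, scores):
    """Per-fold error as in evaluate(): sum over papers of
    |predicted position - correct position| / n, divided again by n.

    correct_ranking: reference ranking values (lower = better), one per paper.
    scores: predicted scores (higher = better), one per paper.
    """
    n = len(scores)
    # Position of each paper when sorted by the reference ranking.
    correct_pos = np.argsort(np.argsort(correct_ranking))
    # Position of each paper when sorted by predicted score, best first.
    predicted_pos = np.argsort(np.argsort(-np.asarray(scores, dtype=float)))
    return float(np.abs(correct_pos - predicted_pos).sum()) / n ** 2

print(mean_rank_displacement([1, 2, 3, 4], [4.0, 3.0, 2.0, 1.0]))  # 0.0 (perfect order)
print(mean_rank_displacement([1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0]))  # 0.5 (fully reversed)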