Code example #1
File: evaluation_dep.py  Project: 764664/BioSE
    def evaluate(self, query, scaling_factor):
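        """Evaluate the ranking model on one query with 5-fold cross-validation.

        PubMed results are cached on disk; the method returns a tuple of
        (average ranking difference, number of papers retrieved). It assumes
        module-level imports of hashlib, os, pickle, logging, numpy as np,
        KFold, and the project's PaperProcessor.
        """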
        # Cache the PubMed result on disk, keyed by an MD5 hash of the query and paper count.
        m = hashlib.md5((query + str(self.num_of_papers)).encode())
        digest = m.hexdigest()
        filename = "tmp/" + digest
        os.makedirs("tmp", exist_ok=True)  # make sure the cache directory exists
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                pubmed = pickle.load(f)
        else:
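            # Cache miss: query PubMed via PaperProcessor, fill in missing metadata, and cache.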
            pubmed = PaperProcessor(query, num_of_documents=self.num_of_papers, postprocessing=False)
            pubmed.add_missing_info()
            with open(filename, 'wb') as f:
                pickle.dump(pubmed, f)

        # Collect the paper dicts into an object array so k-fold index arrays can slice it.
        papers = np.array(list(pubmed.papers.values()))
        logging.info("Got {} papers for query \"{}\", scaling_factor={}".format(len(papers), query, scaling_factor))

        all_time_result = 0

        times = 1  # number of repeated evaluation trials to average over
        for i in range(1, times+1):
            overall_diff = 0
            # Legacy sklearn.cross_validation.KFold signature: KFold(n, n_folds, shuffle);
            # with modern scikit-learn this would be KFold(n_splits=5, shuffle=True)
            # iterated via kf.split(papers).
            kf = KFold(len(papers), 5, shuffle=True)
            for train, test in kf:
                train_set = papers[train]
                test_set = papers[test]

                # Turn the original ranking into a relevance score (better rank -> higher score).
                for train_sample in train_set:
                    score = len(papers) - train_sample["Ranking"] + 1
                    # Optional Gaussian noise with standard deviation score / 5
                    score = 1 if score == 0 else abs(score)
                    noise = np.random.normal(0, score / 5)

                    score *= scaling_factor
                    if self.noise:
                        score += noise

                    train_sample["Score"] = score

                # self.model is expected to assign a "Score" to each test sample.
                self.model(train_set, test_set, query)

                # Ground truth: order the test papers by their original ranking.
                test_set = list(test_set)
                test_size = len(test_set)
                test_set.sort(key=lambda x: x["Ranking"])
                for idx, test_sample in enumerate(test_set):
                    test_sample["Correct_Ranking"] = idx

                # Compare the model's ordering (descending score) against the ground truth.
                total_diff = 0
                test_set.sort(key=lambda x: x["Score"], reverse=True)
                for idx, test_sample in enumerate(test_set):
                    total_diff += abs(test_sample["Correct_Ranking"] - idx) / test_size
                ave_diff = total_diff / test_size
                overall_diff += ave_diff / 5  # average over the 5 folds

            all_time_result += overall_diff / times  # average across the repeated trials

        return all_time_result, len(papers)
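
A minimal driver sketch for context, assuming a hypothetical enclosing class named Evaluator whose constructor sets the num_of_papers, noise, and model attributes used above (only the attribute names come from the snippet; the class name, constructor, model, and query shown here are illustrative):

# Hypothetical wiring; the actual class and model in BioSE may differ.
evaluator = Evaluator(num_of_papers=100, noise=True, model=rank_model)
avg_diff, n_papers = evaluator.evaluate("tp53 lung cancer", scaling_factor=1.0)
print("Average ranking difference: {} over {} papers".format(avg_diff, n_papers))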