Example #1
    def test_makePredictions(self):
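        # Runs the full pipeline on one target author: _compareAuthors on the
        # built pairs, _consolidateResults on those comparisons, then
        # _makePredictions on the consolidated results.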
        print("INFO: Running makePredictions tests")
        log_path = self.log_path + 'make_predictions.log'
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True,
            model_path=os.getcwd(),
            model_name="SoftVoting")

        test_target = ["D17-1207", "yang-liu-ict"]
        test = [["C10-2059", "yajuan-lu"], ["P16-1159", "yong-cheng"],
                ["P09-2066", "yang-liu-icsi"], ["D14-1076", "yang-liu-icsi"],
                ["D15-1210", "yang-liu-ict"], ["P16-1159", "yang-liu-ict"]]
        info_dict = {
            test_target[0] + " " + test_target[1]:
            getAuthorInfo([self.test_papers[test_target[0]],
                           test_target[1]])[1]
        }
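        # Build comparison pairs of the form
        # [pair key, target author info, candidate author info].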
        pairs = []
        for p, n in test:
            info_dict[p + " " + n] = getAuthorInfo([self.test_papers[p], n])[1]
            pairs.append([
                " ".join([*test_target, p, n]),
                info_dict[" ".join(test_target)], info_dict[p + " " + n]
            ])
        comparator = CompareAuthors(**self.compare_authors_args)
        key, res = author_processor._compareAuthors(
            [comparator, " ".join(test_target), pairs])
        test_compare_results = {key: res}
        consolidated = author_processor._consolidateResults(
            test_compare_results)
        predictions, probabilities = author_processor._makePredictions(
            consolidated)
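        # yang-liu-icsi and yang-liu-ict each appear in two test pairs, so they
        # should get two predictions apiece; every other candidate gets one.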
        for k, info in predictions.items():
            self.assertTrue(k in probabilities)
            for a, predict in info.items():
                # self.assertTrue( a in probabilities[k])
                if a == "yang-liu-icsi" or a == "yang-liu-ict":
                    self.assertEqual(2, len(predict))
                    # self.assertEqual(2, len(probabilities[k][a]))
                else:
                    self.assertEqual(1, len(predict))
Example #2
    def test__getAuthorInfo(self):
        log_path = self.log_path + 'get_author_info.log'
        with open(log_path, 'w'):
            pass
        pair_creator = CreateTrainingData(self.short_papers,
                                          self.incomplete,
                                          exclude=["yang-feng"],
                                          **self.default_args,
                                          log_path=log_path)
        total, tasks, ignored, excluded = pair_creator._populateConstants()
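        # Every non-excluded author affiliation on a complete paper should
        # produce exactly one task.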
        expected_total = 0
        for k, info in self.short_papers.items():
            if k in self.incomplete:
                continue
            for a in info.affiliations.keys():
                if a == "yang-feng":
                    continue
                expected_total += 1
        self.checkIgnored(ignored)
        self.assertEqual(len(excluded), 1)
        self.assertEqual(excluded, [("P09-1065", "yang-feng")])
        results = []
        self.assertEqual(len(tasks), expected_total)
        for i in tasks:
            pair_key, res = getAuthorInfo(i)
            if pair_key in self.test_auth_info:
                self.compareInfoDict(res, self.test_auth_info[pair_key])
            results.append((pair_key, res))
        self.assertEqual(len(results), expected_total)
Example #3
    def _makeAmbiguousPairs(self, ambiguous_papers, check_authors, authors_to_get):
        printLogToConsole(self.console_log_level, "Creating pairs for ambiguous authors", logging.INFO)
        self.logger.info("Creating pairs for ambiguous authors")

        known_author_info, error_authors, error_papers = self._getAuthorInfos(authors_to_get)
        if error_authors > 0:
            self.logger.warning("{} errors getting known author infos".format(error_authors))
        if error_papers > 0:
            self.logger.warning("{} errors getting known author papers".format(error_papers))

        self.logger.debug("{} known papers".format(len(known_author_info)))
        self.logger.debug("{} ambiguous author ids".format(len(check_authors)))
        results = defaultdict(list)
        excluded = defaultdict(list)
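        # Pair each of the ambiguous author's papers with every known author
        # it should be checked against.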
        for a in ambiguous_papers.keys():
            printLogToConsole(self.console_log_level, "Creating pairs for {}".format(a), logging.INFO)
            self.logger.info("Creating pairs for {}".format(a))
            self.logger.debug("{} has {} papers".format(a, len(ambiguous_papers[a])))
            self.logger.debug("{} has {} to check against".format(a, len(check_authors[a])))
            self.logger.debug("{} has {} total possible pairs".format(a, len(ambiguous_papers) * len(check_authors[a])))

            known_to_use = [[" ".join(x), known_author_info[" ".join(x)]] for x in check_authors[a]]
            for p in ambiguous_papers[a]:
                ambiguous_paper_info = getAuthorInfo([self.papers[p], a])
                pairs_to_use, pairs_excluded = self._makePairs(ambiguous_paper_info, known_to_use)
                self.logger.debug("{} {} has {} pairs".format(p, a, len(pairs_to_use)))
                self.logger.debug("{} {} has {} excluded".format(p, a, len(pairs_excluded)))
                results[" ".join([p, a])] = pairs_to_use

                excluded[" ".join([p, a])] = [x[0] for x in pairs_excluded]

        return results, excluded
Example #4
    def _getAuthorInfos(self, authors) -> (dict, int, int):
        out = {}
        printLogToConsole(self.console_log_level, "Getting author info for specified authors", logging.INFO)
        self.logger.info("Getting author info for specified authors")
        self.logger.debug("authors={}".format(authors))
        error_authors = 0
        error_papers = 0
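        # Collect info for every (paper, author) pair, counting authors missing
        # from self.author_papers and papers missing from self.papers as errors.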
        pbar = tqdm(total=len(authors), file=sys.stdout)
        for a in authors:
            if a not in self.author_papers:
                pbar.update()
                self.logger.warning("{} is not in self.author_papers".format(a))
                error_authors += 1
                continue
            for p in self.author_papers[a]:
                if p not in self.papers:
                    self.logger.debug("{} not in self.papers".format(p))
                    error_papers += 1

                    continue
                auth_key, auth_info = getAuthorInfo([self.papers[p], a])
                out[auth_key] = auth_info
            pbar.update()
        pbar.close()
        self.logger.debug("len(out)={}".format(len(out)))
        self.logger.debug("error_authors={}".format(error_authors))
        self.logger.debug("error_papers={}".format(error_papers))
        return out, error_authors, error_papers
Example #5
    def test_compareAuthors(self):
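        # Compares the target author against three candidate (paper id,
        # author id) pairs and checks the raw comparison vectors.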
        test_target = ["D17-1207", "yang-liu-ict"]
        test = [["C10-2059", "yajuan-lu"], ["P16-1159", "yong-cheng"],
                ["P09-2066", "yang-liu-icsi"]]
        info_dict = {
            test_target[0] + " " + test_target[1]:
            getAuthorInfo([self.test_papers[test_target[0]],
                           test_target[1]])[1]
        }
        pairs = []
        for p, n in test:
            info_dict[p + " " + n] = getAuthorInfo([self.test_papers[p], n])[1]
            pairs.append([
                " ".join([*test_target, p, n]),
                info_dict[" ".join(test_target)], info_dict[p + " " + n]
            ])

        print("INFO: Running compareAuthors tests")
        log_path = self.log_path + 'compare_authors.log'
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            allow_authors_not_in_override=False)
        comparator = CompareAuthors(**self.compare_authors_args)
        key, res = author_processor._compareAuthors(
            [comparator, " ".join(test_target), pairs])
        self.assertEqual(" ".join(test_target), key)
        self.assertNotEqual(0, len(res))
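        # Each candidate author id should have exactly one comparison vector,
        # identical to running the comparator on that pair directly.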
        for k, info in info_dict.items():
            if k == " ".join(test_target):
                continue
            k_id = k.split()[1]
            self.assertTrue(k_id in res)
            self.assertEqual(1, len(res[k_id]))
            expected = comparator([
                " ".join([*test_target, k]), 0,
                info_dict[" ".join(test_target)], info
            ])[-1]
            np.testing.assert_array_equal(expected, res[k_id][0])
Example #6
    def test__getAuthorInfos(self):
        print("INFO: Running _getAuthorInfos tests")
        log_path = self.log_path + 'get_author_info.log'
        with open(log_path, 'w'):
            pass
        test_auths = ["yang-liu-ict", "luyang-liu", "bob-newman", "yang-liu"]
        test_papers = {
            "D17-1207": self.test_papers["D17-1207"],
            "C18-1172": self.test_papers["C18-1172"]
        }
        test_author_papers = {
            "yang-liu-ict": ["D17-1207"],
            "luyang-liu": ["C18-1172"],
            "bob-newman": ["A0-0000"]
        }
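        # "yang-liu" has no entry in test_author_papers and "A0-0000" is not in
        # test_papers, so one author error and one paper error are expected.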
        author_processor = AuthorDisambiguation(
            papers=test_papers,
            author_papers=test_author_papers,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            file_log_level=logging.WARNING)
        res, error_auth, error_paper = author_processor._getAuthorInfos(
            test_auths)
        self.assertEqual(1, error_auth)
        self.assertEqual(1, error_paper)
        for i, v in res.items():
            if i == "D17-1207 yang-liu-ict":
                self.compareInfoDict(
                    v,
                    getAuthorInfo(
                        [self.test_papers["D17-1207"], "yang-liu-ict"])[1])
            elif i == "C18-1172 luyang-liu":
                self.compareInfoDict(
                    v,
                    getAuthorInfo([self.test_papers["C18-1172"],
                                   "luyang-liu"])[1])