def test_makePredictions(self):
    """End-to-end check of _makePredictions on a small hand-built comparison set.

    Builds author-info pairs between a target author mention and several
    candidate mentions, runs them through _compareAuthors and
    _consolidateResults, then asserts that the two ambiguous ids
    ("yang-liu-icsi" / "yang-liu-ict") each get two predictions and every
    other candidate gets exactly one.
    """
    print("INFO: Running makePredictions tests")
    log_path = self.log_path + 'make_predictions.log'
    # Truncate the log file so each run starts from an empty log.
    with open(log_path, 'w'):
        pass
    author_processor = AuthorDisambiguation(
        papers=self.test_papers,
        id_to_name=self.id_to_name,
        compare_args=self.compare_authors_args,
        log_path=log_path,
        name_similarity_cutoff=.95,
        sim_overrides=True,
        model_path=os.getcwd(),
        model_name="SoftVoting",
    )
    target = ["D17-1207", "yang-liu-ict"]
    target_key = " ".join(target)
    candidates = [
        ["C10-2059", "yajuan-lu"],
        ["P16-1159", "yong-cheng"],
        ["P09-2066", "yang-liu-icsi"],
        ["D14-1076", "yang-liu-icsi"],
        ["D15-1210", "yang-liu-ict"],
        ["P16-1159", "yang-liu-ict"],
    ]
    author_infos = {
        target_key: getAuthorInfo([self.test_papers[target[0]], target[1]])[1],
    }
    pair_tasks = []
    for paper_id, author_id in candidates:
        candidate_key = paper_id + " " + author_id
        author_infos[candidate_key] = getAuthorInfo(
            [self.test_papers[paper_id], author_id])[1]
        pair_tasks.append([
            " ".join([*target, paper_id, author_id]),
            author_infos[target_key],
            author_infos[candidate_key],
        ])
    comparator = CompareAuthors(**self.compare_authors_args)
    key, res = author_processor._compareAuthors(
        [comparator, target_key, pair_tasks])
    consolidated = author_processor._consolidateResults({key: res})
    predictions, probabilities = author_processor._makePredictions(consolidated)
    for pair_key, by_author in predictions.items():
        # Every predicted key must also have probabilities recorded.
        self.assertTrue(pair_key in probabilities)
        for author_id, predict in by_author.items():
            # The two ambiguous ids have two candidate papers each; the
            # rest appear exactly once.
            if author_id in ("yang-liu-icsi", "yang-liu-ict"):
                self.assertEqual(2, len(predict))
            else:
                self.assertEqual(1, len(predict))
def test__getAuthorInfo(self):
    """Check _populateConstants task generation and getAuthorInfo on each task.

    Verifies the excluded author ("yang-feng") is reported, the ignored set
    matches expectations, the task count equals the number of non-incomplete
    paper/author combinations, and each produced info dict matches the
    stored fixtures where available.
    """
    log_path = self.log_path + 'get_author_info.log'
    # Start from an empty log file for this run.
    with open(log_path, 'w'):
        pass
    pair_creator = CreateTrainingData(
        self.short_papers,
        self.incomplete,
        exclude=["yang-feng"],
        **self.default_args,
        log_path=log_path,
    )
    total, tasks, ignored, excluded = pair_creator._populateConstants()
    # Expected task count: every author of every complete paper, minus the
    # explicitly excluded author id.
    expected_total = sum(
        1
        for paper_id, paper in self.short_papers.items()
        if paper_id not in self.incomplete
        for author_id in paper.affiliations
        if author_id != "yang-feng"
    )
    self.checkIgnored(ignored)
    self.assertEqual(len(excluded), 1)
    self.assertEqual(excluded, [("P09-1065", "yang-feng")])
    self.assertEqual(len(tasks), expected_total)
    results = []
    for task in tasks:
        pair_key, info = getAuthorInfo(task)
        # Only compare against fixtures we actually have stored.
        if pair_key in self.test_auth_info:
            self.compareInfoDict(info, self.test_auth_info[pair_key])
        results.append((pair_key, info))
    self.assertEqual(len(results), expected_total)
def _makeAmbiguousPairs(self, ambiguous_papers, check_authors, authors_to_get):
    """Create comparison pairs between ambiguous author mentions and known authors.

    Args:
        ambiguous_papers: dict mapping an ambiguous author id -> list of its
            paper ids.
        check_authors: dict mapping an ambiguous author id -> list of
            (paper, author) items to compare against; each item is joined
            with " " to form the known-info lookup key.
        authors_to_get: author ids whose info should be fetched up front via
            _getAuthorInfos.

    Returns:
        (results, excluded): both keyed by "<paper> <author>"; results maps
        to the usable pairs from _makePairs, excluded maps to the keys of
        the pairs _makePairs rejected.
    """
    printLogToConsole(self.console_log_level, "Creating pairs for ambiguous authors", logging.INFO)
    self.logger.info("Creating pairs for ambiguous authors")
    known_author_info, error_authors, error_papers = self._getAuthorInfos(authors_to_get)
    if error_authors > 0:
        self.logger.warning("{} errors getting known author infos".format(error_authors))
    if error_papers > 0:
        self.logger.warning("{} errors getting known author papers".format(error_papers))
    self.logger.debug("{} known papers".format(len(known_author_info)))
    self.logger.debug("{} ambiguous author ids".format(len(check_authors)))
    results = defaultdict(list)
    excluded = defaultdict(list)
    for a in ambiguous_papers.keys():
        printLogToConsole(self.console_log_level, "Creating pairs for {}".format(a), logging.INFO)
        self.logger.info("Creating pairs for {}".format(a))
        self.logger.debug("{} has {} papers".format(a, len(ambiguous_papers[a])))
        self.logger.debug("{} has {} to check against".format(a, len(check_authors[a])))
        # BUG FIX: the pair count for author `a` is papers-of-a times
        # check-targets-of-a; the original multiplied by the number of
        # ambiguous author ids (len(ambiguous_papers)) instead.
        self.logger.debug("{} has {} total possible pairs".format(
            a, len(ambiguous_papers[a]) * len(check_authors[a])))
        known_to_use = [
            [" ".join(x), known_author_info[" ".join(x)]]
            for x in check_authors[a]
        ]
        for p in ambiguous_papers[a]:
            # getAuthorInfo returns (key, info); _makePairs receives the tuple.
            ambiguous_paper_info = getAuthorInfo([self.papers[p], a])
            pairs_to_use, pairs_excluded = self._makePairs(ambiguous_paper_info, known_to_use)
            self.logger.debug("{} {} has {} pairs".format(p, a, len(pairs_to_use)))
            self.logger.debug("{} {} has {} excluded".format(p, a, len(pairs_excluded)))
            results[" ".join([p, a])] = pairs_to_use
            # Only the pair keys are kept for excluded entries.
            excluded[" ".join([p, a])] = [x[0] for x in pairs_excluded]
    return results, excluded
def _getAuthorInfos(self, authors) -> (dict, int, int):
    """Fetch author info for every (paper, author) combination of the given authors.

    Skips authors missing from self.author_papers (counted in the second
    return value) and papers missing from self.papers (counted in the
    third). Returns a dict keyed by the key getAuthorInfo produces, plus
    the two error counts.
    """
    printLogToConsole(self.console_log_level, "Getting author info for specified authors", logging.INFO)
    self.logger.info("Getting author info for specified authors")
    self.logger.debug("authors={}".format(authors))
    collected = {}
    unknown_authors = 0
    missing_papers = 0
    progress = tqdm(total=len(authors), file=sys.stdout)
    for author_id in authors:
        if author_id not in self.author_papers:
            # Still advance the bar so it reaches len(authors).
            progress.update()
            self.logger.warning("{} is not in self.author_papers".format(author_id))
            unknown_authors += 1
            continue
        for paper_id in self.author_papers[author_id]:
            if paper_id not in self.papers:
                self.logger.debug("{} not in self.papers".format(paper_id))
                missing_papers += 1
                continue
            info_key, info = getAuthorInfo([self.papers[paper_id], author_id])
            collected[info_key] = info
        progress.update()
    progress.close()
    self.logger.debug("len(out)={}".format(len(collected)))
    self.logger.debug("error_authors={}".format(unknown_authors))
    self.logger.debug("error_papers={}".format(missing_papers))
    return collected, unknown_authors, missing_papers
def test_compareAuthors(self):
    """Check _compareAuthors against direct comparator output for each pair.

    Builds pairs between one target mention and three candidate mentions,
    runs _compareAuthors, and asserts each candidate id maps to exactly
    one result identical to calling the comparator directly.
    """
    target = ["D17-1207", "yang-liu-ict"]
    target_key = " ".join(target)
    candidates = [
        ["C10-2059", "yajuan-lu"],
        ["P16-1159", "yong-cheng"],
        ["P09-2066", "yang-liu-icsi"],
    ]
    author_infos = {
        target_key: getAuthorInfo([self.test_papers[target[0]], target[1]])[1],
    }
    pair_tasks = []
    for paper_id, author_id in candidates:
        candidate_key = paper_id + " " + author_id
        author_infos[candidate_key] = getAuthorInfo(
            [self.test_papers[paper_id], author_id])[1]
        pair_tasks.append([
            " ".join([*target, paper_id, author_id]),
            author_infos[target_key],
            author_infos[candidate_key],
        ])
    print("INFO: Running compareAuthors tests")
    log_path = self.log_path + 'compare_authors.log'
    # Truncate the log file before the run.
    with open(log_path, 'w'):
        pass
    author_processor = AuthorDisambiguation(
        papers=self.test_papers,
        id_to_name=self.id_to_name,
        compare_args=self.compare_authors_args,
        log_path=log_path,
        name_similarity_cutoff=.95,
        allow_authors_not_in_override=False,
    )
    comparator = CompareAuthors(**self.compare_authors_args)
    key, res = author_processor._compareAuthors(
        [comparator, target_key, pair_tasks])
    self.assertEqual(target_key, key)
    self.assertNotEqual(0, len(res))
    for candidate_key, info in author_infos.items():
        if candidate_key == target_key:
            continue
        author_id = candidate_key.split()[1]
        self.assertTrue(author_id in res)
        self.assertEqual(1, len(res[author_id]))
        # The stored result must equal running the comparator directly.
        expected = comparator([
            " ".join([*target, candidate_key]),
            0,
            author_infos[target_key],
            info,
        ])[-1]
        np.testing.assert_array_equal(expected, res[author_id][0])
def test__getAuthorInfos(self):
    """Check _getAuthorInfos error counting and per-author info contents.

    Uses a fixture where one author is absent from author_papers and one
    paper id is absent from papers, expecting exactly one error of each
    kind, and compares the returned infos against getAuthorInfo output.
    """
    print("INFO: Running _getAuthorInfos tests")
    log_path = self.log_path + 'get_author_info.log'
    # Reset the log file for this run.
    with open(log_path, 'w'):
        pass
    test_auths = ["yang-liu-ict", "luyang-liu", "bob-newman", "yang-liu"]
    test_papers = {
        "D17-1207": self.test_papers["D17-1207"],
        "C18-1172": self.test_papers["C18-1172"],
    }
    test_author_papers = {
        "yang-liu-ict": ["D17-1207"],
        "luyang-liu": ["C18-1172"],
        "bob-newman": ["A0-0000"],  # paper id not in test_papers -> paper error
    }
    author_processor = AuthorDisambiguation(
        papers=test_papers,
        author_papers=test_author_papers,
        compare_args=self.compare_authors_args,
        log_path=log_path,
        file_log_level=logging.WARNING,
    )
    res, error_auth, error_paper = author_processor._getAuthorInfos(test_auths)
    # "yang-liu" is not in test_author_papers; "A0-0000" is not in test_papers.
    self.assertEqual(1, error_auth)
    self.assertEqual(1, error_paper)
    # Expected (paper, author) sources keyed by the returned pair key.
    expected_sources = {
        "D17-1207 yang-liu-ict": ("D17-1207", "yang-liu-ict"),
        "C18-1172 luyang-liu": ("C18-1172", "luyang-liu"),
    }
    for pair_key, info in res.items():
        if pair_key in expected_sources:
            paper_id, author_id = expected_sources[pair_key]
            self.compareInfoDict(
                info,
                getAuthorInfo([self.test_papers[paper_id], author_id])[1])