def test_when_pencil_write_at_is_passed_a_string_and_an_index_greater_than_paper_text_length_it_adds_text_to_end(self):
    """write_at with an index past the end of the text simply appends."""
    sheet = Paper()
    sheet.text = "An apple a day keeps the doctor away"
    self.pencil.write_at(sheet, ", don't you know.", 40)
    expected = "An apple a day keeps the doctor away, don't you know."
    self.assertEqual(sheet.text, expected)
def test_when_pencil_write_at_is_passed_a_string_and_an_index_that_is_in_bounds_but_string_len_plus_index_is_greater_than_paper_text_length_it_overwrites_and_addes_on(self):
    """Overlapping occupied characters become '@'; the overflow is appended."""
    sheet = Paper()
    sheet.text = "An apple a day keeps the doctor away"
    self.pencil.write_at(sheet, "from coming around.", 32)
    expected = "An apple a day keeps the doctor @@@@ coming around."
    self.assertEqual(sheet.text, expected)
def test_when_pencil_write_at_is_passed_a_string_and_an_index_it_will_write_the_string_on_the_paper_at_that_index_and_overwrite_filled_spaces_with_symbol(self):
    """Characters that collide with existing text are replaced by '@'."""
    sheet = Paper()
    sheet.text = "An a day keeps the doctor away"
    self.pencil.write_at(sheet, "artichoke", 3)
    expected = "An artich@k@ay keeps the doctor away"
    self.assertEqual(sheet.text, expected)
def test_when_pencil_write_at_is_passed_a_string_and_an_index_it_will_write_the_string_on_the_paper_at_that_index_on_white_space(self):
    """Writing over whitespace fills the gap without any '@' collisions."""
    sheet = Paper()
    sheet.text = "An a day keeps the doctor away"
    self.pencil.write_at(sheet, "onion", 3)
    expected = "An onion a day keeps the doctor away"
    self.assertEqual(sheet.text, expected)
def __init__(self, papers, id_to_name, author_papers, treat_id_different_people=False,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG, log_format=None,
             log_path=None, raise_error=False, skip_error_papers=False,
             one_target_per_paper=False, save_data=False, ext_directory=False,
             save_path=None, cores=4, remove_all_papers=False):
    """Initialize the disambiguator's logger, input copies, and result containers.

    Args:
        papers: Mapping of paper id -> Paper or paper-dict. If the first value
            inspected is already a Paper, the mapping is adopted as-is;
            otherwise each dict is converted to a Paper.
        id_to_name: Mapping of author id -> name info (deep-copied).
        author_papers: Mapping of author id -> list of paper ids (deep-copied).
        treat_id_different_people: Flag stored for downstream use.
        console_log_level / file_log_level / log_format / log_path: Logger config.
        raise_error, skip_error_papers, one_target_per_paper, save_data,
        ext_directory, save_path, cores, remove_all_papers: Behavior flags
            stored on the instance.
    """
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        # Default log location is relative to the current working directory.
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("disambiguator", log_path, log_format, console_log_level,
                               file_log_level)
    self.console_log_level = console_log_level
    self.treat_id_different_people = treat_id_different_people
    self.papers = {}
    for k, p in papers.items():
        if isinstance(p, Paper):
            # Assumes a homogeneous mapping: one Paper value means all values
            # are Papers, so adopt the caller's dict directly and stop.
            self.papers = papers
            break
        self.papers[k] = Paper(**p)
    # Deep copies so mutations here never leak back to the caller's data.
    self.id_to_name = deepcopy(id_to_name)
    self.author_papers = deepcopy(author_papers)
    # Counts suffixes handed out per author id when minting new ids.
    self.author_id_suffix = Counter()
    self.raise_error = raise_error
    self.error_papers = set()
    # Containers for results produced during target creation.
    self.new_papers = {}
    self.new_author_papers = defaultdict(list)
    self.new_id_to_name = {}
    self.old_ids = set()
    self.skip_errors = skip_error_papers
    self.one_per_paper = one_target_per_paper
    self.save_data = save_data
    self.ext_directory = ext_directory
    self.save_path = save_path
    self.cores = cores
    self.remove_all_papers = remove_all_papers
def _process_results(self, url: str) -> List[Dict]:
    """Download *url*, parse each ``<entry>`` element, and return paper dicts.

    Args:
        url: Fully-formed query URL to fetch.

    Returns:
        A list with one ``Paper.get_json()`` dict per ``<entry>`` found in
        the response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # requests has no default timeout; without one, a stalled server would
    # hang this call forever.
    res = requests.get(url, timeout=30).content
    soup = BeautifulSoup(res, 'html.parser')
    papers = [Paper(entry=entry) for entry in soup.findAll('entry')]
    return [p.get_json() for p in papers]
def setUp(self) -> None:
    """Load config and test fixtures, then derive author/paper lookup tables."""
    self.config_raw = json.load(open("config.json"))
    # Override config entries so tests log locally and never raise mid-run.
    self.config_raw["log path"] = "/tests/targetCreatorTests/logs/"
    self.config_raw["raise error"] = False
    self.config_raw["treat id different people"] = True
    self.config_raw["skip error papers"] = True
    self.log_path = self.config_raw["log path"]
    # Known author ids / paper ids exercised by the individual tests.
    self.test_authors = [
        "hua-wu", "yun-chen", "victor-ok-li", "linfeng-song", "peng-li",
        "tatsuya-izuha", "yun-huang", "xuan-jing-huang", "qiang-wang"
    ]
    self.test_papers = [
        "W18-5212", "C18-1314", "P16-1159", "P17-1176", "W11-1911",
        "C14-1179", "P07-1089"
    ]
    self.test_multiple_auth = ["P17-1776", "C14-1179"]
    self.test_non_parsed = ["S19-2016"]
    # config = ConfigHandler(self.config_raw,"setup_test_target_creator")
    # data = loadData([ "id_to_name", "author_papers"],config.logger,config)
    self.parsed_raw = json.load(
        open(os.getcwd() + "/tests/authorDisambiguationTests/test_papers.json"))
    self.papers = {x: Paper(**v) for x, v in self.parsed_raw.items()}
    # Rebuild author -> papers and author id -> name maps from the fixture
    # papers instead of loading them from disk.
    self.author_papers = {}
    self.id_to_name = {}
    for p, v in self.papers.items():
        for a in v.affiliations.keys():
            if a not in self.author_papers:
                self.author_papers[a] = []
            self.author_papers[a].append(p)
        for a, n in v.authors.items():
            self.id_to_name[a] = n
def test_when_pencil_write_is_passed_a_paper_instance_and_a_string_to_write_it_will_degrade_the_pencil_durability_and_write_the_resulting_string_on_the_paper(self):
    """Writing puts the text on the paper and spends tip durability (1000 -> 974)."""
    sheet = Paper()
    message = "This is a string to be written."
    self.assertEqual(1000, self.pencil.current_tip_durability)
    self.pencil.write(sheet, message)
    self.assertEqual(message, sheet.text)
    self.assertEqual(974, self.pencil.current_tip_durability)
def setUp(self):
    """Load TEI XML fixtures, alias/name data, and build parser argument dicts.

    NOTE(review): every open() here leaks its file handle (no close/with),
    and config is read from a hardcoded home directory — portable only on
    the original author's machine; consider with-blocks and a relative path.
    """
    self.log_path = os.getcwd() + '/tests/pdfParserTests/logs/'
    self.config = json.load(
        open("/home/gabe/Desktop/research-main/config.json"))
    test_paper_path = os.getcwd() + "/tests/pdfParserTests/"
    data_path = os.getcwd() + "/data"
    # Each fixture paper is kept both as a parsed element tree and raw bytes.
    self.test_paper1_root = etree.XML(
        open(test_paper_path + "test_1.tei.xml", "rb").read())
    self.test_paper1_xml = open(test_paper_path + "test_1.tei.xml", "rb").read()
    self.test1_key = "Q13-1004"
    self.test_paper2_root = etree.XML(
        open(test_paper_path + "test_2.tei.xml", "rb").read())
    self.test_paper2_xml = open(test_paper_path + "test_2.tei.xml", "rb").read()
    self.test2_key = "W19-4450"
    self.test_paper3_root = etree.XML(
        open(test_paper_path + "test_3.tei.xml", "rb").read())
    self.test_paper4_root = etree.XML(
        open(test_paper_path + "test_4.tei.xml", "rb").read())
    self.aliases = json.load(open(data_path + "/json/aliases.json"))
    papers_tmp = json.load(open(data_path + "/json/acl_papers.json"))
    self.papers = {x: Paper(**v) for x, v in papers_tmp.items()}
    self.id_to_name = json.load(open(data_path + "/json/id_to_name.json"))
    self.same_names = [
        x.strip()
        for x in open(data_path + "/txt/same_names.txt").readlines()
    ]
    # Keyword-argument bundles reused by the parser / wrapper tests.
    self.parser_args = {
        "aliases": self.aliases,
        "id_to_name": self.id_to_name,
        "same_names": self.same_names,
        "sim_cutoff": .75
    }
    self.wrapper_args = {
        "aliases": self.aliases,
        "papers": self.papers,
        "id_to_name": self.id_to_name,
        "same_names": self.same_names
    }
    self.data_path = os.getcwd() + "/data/"
def weekly_papers(event, context):
    """Push a carousel of five randomly selected papers to the LINE user."""
    print(event)
    print(context)
    linebot = LineBotApi(CHANNEL_ACCESS_TOKEN)
    hits = es.random_search(size=5)
    cards = [Paper(json=hit).get_flex_contents() for hit in hits]
    contents = {'type': 'carousel', 'contents': cards}
    try:
        linebot.push_message(USER_ID, FlexSendMessage(
            alt_text='Weekly Papers',
            contents=contents
        ))
    except LineBotApiError as e:
        # Best-effort push: log the API error instead of failing the handler.
        print(e)
def setUp(self) -> None:
    """Load fixture papers, config, corpora, and comparison arguments.

    Files are opened with context managers so handles are closed, and the
    previously duplicated read of incomplete_papers.txt (whose first,
    blank-line-filtered result was immediately overwritten) is done once.
    """
    cwd = os.getcwd()
    data_path = cwd + "/data"
    with open(cwd + "/tests/authorDisambiguationTests/test_papers.json") as fh:
        test_papers = json.load(fh)
    self.test_papers = {k: Paper(**p) for k, p in test_papers.items()}
    with open(cwd + "/config.json") as fh:
        self.config = json.load(fh)
    # Intentionally left empty; full parsed_papers loading is disabled.
    self.papers = {}
    with open(data_path + "/json/author_papers.json") as fh:
        self.author_papers = json.load(fh)
    self.log_path = cwd + '/tests/authorDisambiguationTests/logs/'
    # Stemmed token lists used as corpora for affiliation comparison.
    with open(data_path + "/txt/org_corpus.txt") as fh:
        org_corpus = [[stemmer.stem(w) for w in x.strip().split()]
                      for x in fh.readlines()]
    with open(data_path + "/txt/department_corpus.txt") as fh:
        department_corpus = [[stemmer.stem(w) for w in x.strip().split()]
                             for x in fh.readlines()]
    # Single (unfiltered) read — matches the value the original ended up with.
    with open(data_path + "/txt/incomplete_papers.txt") as fh:
        self.incomplete = [x.strip() for x in fh.readlines()]
    self.compare_authors_args = {
        "company_corpus": org_corpus,
        "department_corpus": department_corpus,
        "threshold": .4,
        "str_algorithm": ["jaro", "similarity"]
    }
    with open(data_path + "/json/id_to_name.json") as fh:
        self.id_to_name = json.load(fh)
def webhook(event, context):
    """LINE webhook handler: search papers for each incoming message and reply.

    Replies with a flex carousel of matches, or a plain "not found" message.
    Always returns a 200 response so LINE does not retry.
    """
    linebot = LineBotApi(CHANNEL_ACCESS_TOKEN)
    # handler = WebhookHandler(CHANNEL_SECRET)
    # msg = json.loads(event['body'])
    # {"events":[
    #   {"type":"message","replyToken":"a5d6dadb84a346428bc53ea9ce656cea", "message":{"type":"text","id":"13044610237128","text":"yo"}}
    # ]}
    for incoming in json.loads(event['body'])['events']:
        reply_token = incoming['replyToken']
        text = incoming['message']['text'].strip()
        hits = es.search(text, ['title', 'abstract'])
        matched = [Paper(json=hit) for hit in hits]
        if matched:
            contents = {'type': 'carousel', 'contents': [
                p.get_flex_contents() for p in matched]}
            linebot.reply_message(reply_token, FlexSendMessage(
                alt_text=f'papers for {text}',
                contents=contents
            ))
        else:
            linebot.reply_message(reply_token,
                                  TextSendMessage(text='Results Not Found'))
    return {
        'statusCode': 200,
        'body': json.dumps({'message': 'ok'})
    }
def test_when_print_text_is_called_it_prints_text_variable_to_stdout(
        self, mock_output):
    """print_text should emit the paper's text followed by a newline."""
    sheet = Paper()
    sheet.write("Hello World!")
    sheet.print_text()
    self.assertEqual("Hello World!\n", mock_output.getvalue())
def test_when_write_is_passed_string_it_adds_string_to_text_variable_in_the_paper_instance_with_existing_text(
        self):
    """write appends to whatever text the paper already holds."""
    sheet = Paper()
    sheet.text = "Hello "
    sheet.write("World!")
    self.assertEqual("Hello World!", sheet.text)
def test_updatePapers(self):
    """Exercise TargetCreator._updatePapers across valid, error, and no-op cases.

    Each sub-case asserts the cumulative sizes of new_papers /
    new_author_papers, so the calls are order-dependent.
    """
    print("INFO: Testing updatePapers")
    # Truncate the log file for this run.
    with open(os.getcwd() + self.log_path + "update_papers.log", "w") as f:
        pass
    config = ConfigHandler(self.config_raw, "update_papers")
    # Work on copies so the shared fixtures stay pristine.
    author_papers_copy = deepcopy(self.author_papers)
    papers_copy = {x: Paper(**v.asDict()) for x, v in self.papers.items()}
    # Each entry: [existing author id, new author id, papers to update].
    tests = [
        ["qiang-wang", "qiang-wang1", None],  # No papers passed
        ['hua-wu', "hua-wu1", ['P16-1159']],  # Error papers
        ['yun-chen', "yun-chen1", ['P16-1159']],  # Not in paper
        ['yun-chen', "yun-chen1", ['P17-1176']],
        ['victor-ok-li', "victor-ok-li1", ['P17-1176']],  # Paper already done
        ["xuan-jing-huang", "fail-test", ["P19-1642"]],
        ['fail-test', "yun-huang1", ['S19-2016']],
    ]
    target_creator = TargetCreator(papers_copy, self.id_to_name,
                                   author_papers_copy,
                                   **config["TargetCreator"])
    target_creator.one_per_paper = False
    # Pre-mark P16-1159 as an error paper so cases 2-3 are skipped.
    target_creator.error_papers = {"P16-1159"}
    # Case 1: no papers passed — updates the author's own paper (W19-4416).
    a = tests[0]
    target_creator._updatePapers(*a)
    self.assertEqual(1, len(target_creator.new_papers))
    self.assertEqual(1, len(target_creator.new_author_papers))
    self.assertTrue("qiang-wang1" in target_creator.new_author_papers)
    self.assertTrue("W19-4416" in target_creator.new_papers)
    self.assertTrue(
        "qiang-wang1" in target_creator.new_papers["W19-4416"].authors)
    self.assertTrue("qiang-wang1"
                    in target_creator.new_papers["W19-4416"].affiliations)
    # Case 2: paper is in error_papers — nothing changes.
    b = tests[1]
    target_creator._updatePapers(*b)
    self.assertEqual(1, len(target_creator.new_papers))
    self.assertEqual(1, len(target_creator.new_author_papers))
    # Case 3: author not on the paper — nothing changes.
    c = tests[2]
    target_creator._updatePapers(*c)
    self.assertEqual(1, len(target_creator.new_papers))
    self.assertEqual(1, len(target_creator.new_author_papers))
    # Case 4: valid update — P17-1176 gains yun-chen1.
    d = tests[3]
    target_creator._updatePapers(*d)
    self.assertEqual(2, len(target_creator.new_papers))
    self.assertEqual(2, len(target_creator.new_author_papers))
    self.assertTrue(
        "qiang-wang1" in target_creator.new_papers["W19-4416"].authors)
    self.assertTrue("qiang-wang1"
                    in target_creator.new_papers["W19-4416"].affiliations)
    self.assertTrue(
        "yun-chen1"
        in target_creator.new_papers["P17-1176"].authors)
    self.assertTrue(
        "yun-chen1" in target_creator.new_papers["P17-1176"].affiliations)
    # Case 5: second author on an already-updated paper — paper count stays.
    e = tests[4]
    target_creator._updatePapers(*e)
    self.assertEqual(2, len(target_creator.new_papers))
    self.assertEqual(3, len(target_creator.new_author_papers))
    self.assertTrue(
        "yun-chen1" in target_creator.new_papers["P17-1176"].authors)
    self.assertTrue(
        "yun-chen1" in target_creator.new_papers["P17-1176"].affiliations)
    self.assertTrue(
        "victor-ok-li1" in target_creator.new_papers["P17-1176"].authors)
    self.assertTrue("victor-ok-li1"
                    in target_creator.new_papers["P17-1176"].affiliations)
    # Cases 6-7: invalid ids / non-parsed papers — no changes expected.
    f = tests[5]
    target_creator._updatePapers(*f)
    self.assertEqual(2, len(target_creator.new_papers))
    self.assertEqual(3, len(target_creator.new_author_papers))
    g = tests[6]
    target_creator._updatePapers(*g)
    self.assertEqual(2, len(target_creator.new_papers))
    self.assertEqual(3, len(target_creator.new_author_papers))
def test_when_pencil_write_at_is_passed_a_string_and_an_index_less_than_zero_raises_index_error(self):
    """A negative index is rejected with IndexError."""
    sheet = Paper()
    sheet.text = "An apple a day keeps the doctor away"
    with self.assertRaises(IndexError):
        self.pencil.write_at(sheet, "Remember, a", -10)
def setUp(self):
    """Provide a blank paper, a factory No. 2 HB pencil, and its starting durability."""
    self.pencil = pencil_factory.get_no2_hb()
    self.initial_point_durability = self.pencil.point_durability
    self.paper = Paper()
def test_paper_should_initialize_last_erased_field(self):
    """A fresh Paper starts with last_erased at the sentinel value -1."""
    self.assertEqual(Paper().last_erased, -1)
def setUp(self):
    """Provide a blank paper and an eraser with 1000 durability."""
    self.eraser = Eraser(durability=1000)
    self.paper = Paper()
def setUp(self):
    """Provide a blank paper plus an eraser, remembering its initial durability."""
    durability = 1000
    self.initial_eraser_durability = durability
    self.paper = Paper()
    self.eraser = Eraser(durability=durability)
def test_paper_text_should_be_set_with_text_property(self):
    """Assigning to the text property stores and returns the same value."""
    sheet = Paper()
    sheet.text = 'abc'
    self.assertEqual('abc', sheet.text)
def test_can_initialize_paper_with_text(self):
    """The initial_text constructor argument seeds the text property."""
    sheet = Paper(initial_text='Hello Fellow')
    self.assertEqual('Hello Fellow', sheet.text)
def setUp(self):
    """Load parsed papers plus a hand-built author map, and derive test keys.

    NOTE(review): config and data are read from a hardcoded home directory,
    and the open() handles are never closed.
    """
    self.config = json.load(
        open("/home/gabe/Desktop/research-main/config.json"))
    data_path = "/home/gabe/Desktop/research-main/data"
    papers_dict = json.load(open(data_path + "/json/parsed_papers.json"))
    self.test_auth_info = json.load(
        open(
            "/home/gabe/Desktop/research-main/tests/createPairTests/test_papers.json"
        ))
    self.incomplete = [
        x.strip()
        for x in open(data_path + "/txt/incomplete_papers.txt").readlines()
    ]
    # paper id -> {author id: display name} for the papers under test.
    self.test_papers = {
        "N12-1057": {
            "owen-rambow": "Owen Rambow",
            "mona-diab": "Mona Diab",
            "vinodkumar-prabhakaran": "Vinodkumar Prabhakaran"
        },
        "N19-1050": {
            "shima-asaadi": "Shima Asaadi",
            "saif-mohammad": "Saif Mohammad",
            "svetlana-kiritchenko": "Svetlana Kiritchenko"
        },
        "C16-1050": {
            "elaheh-shafieibavani": "Elaheh ShafieiBavani",
            "mohammad-ebrahimi": "Mohammad Ebrahimi",
            "raymond-wong": "Raymond Wong",
            "fang-chen": "Fang Chen"
        },
        "S19-2016": {
            "tobias-putz": "Tobias P\u00fctz",
            "kevin-glocker": "Kevin Glocker"
        },
        "P19-1642": {
            "iacer-calixto": "Iacer Calixto",
            "miguel-rios": "Miguel Rios",
            "wilker-aziz": "Wilker Aziz"
        },
        "W19-4022": {
            "jungyeul-park": "Jungyeul Park",
            "francis-tyers": "Francis Tyers"
        },
        "Q19-1001": {
            "dan-roth": "Dan Roth",
            "alla-rozovskaya": "Alla Rozovskaya"
        },
        "P15-1150": {
            "christopher-d-manning": "Christopher D. Manning",
            "kai-sheng-tai": "Kai Sheng Tai",
            "richard-socher": "Richard Socher"
        },
        'P17-1139': {
            'yang-liu-ict': 'Yang Liu',
            'maosong-sun': 'Maosong Sun',
            'jiacheng-zhang': 'Jiacheng Zhang',
            'huanbo-luan': 'Huanbo Luan',
            'jingfang-xu': 'Jingfang Xu'
        },
        'C10-2136': {
            'yang-liu-ict': 'Yang Liu',
            'yajuan-lu': 'Yajuan Lv',
            'qun-liu': 'Qun Liu',
            'jinsong-su': 'Jinsong Su',
            'haitao-mi': 'Haitao Mi',
            'hongmei-zhao': 'Hongmei Zhao'
        },
        'D18-1041': {
            'yang-liu-ict': 'Yang Liu',
            'jinsong-su': 'Jinsong Su',
            'jiali-zeng': 'Jiali Zeng',
            'huating-wen': 'Huating Wen',
            'jun-xie': 'Jun Xie',
            'yongjing-yin': 'Yongjing Yin',
            'jianqiang-zhao': 'Jianqiang Zhao'
        },
        'Q18-1029': {
            'yang-liu-ict': 'Yang Liu',
            'zhaopeng-tu': 'Zhaopeng Tu',
            'shuming-shi': 'Shuming Shi',
            'tong-zhang': 'Tong Zhang'
        },
        'P17-1176': {
            'yang-liu-ict': 'Yang Liu',
            'victor-ok-li': 'Victor O.K. Li',
            'yun-chen': 'Yun Chen',
            'yong-cheng': 'Yong Cheng'
        },
        'P09-1065': {
            'yang-liu-ict': 'Yang Liu',
            'qun-liu': 'Qun Liu',
            'haitao-mi': 'Haitao Mi',
            'yang-feng': 'Yang Feng'
        },
        'P13-1084': {
            'yang-liu-ict': 'Yang Liu',
            'jun-zhao': 'Jun Zhao',
            'guangyou-zhou': 'Guangyou Zhou',
            'shizhu-he': 'Shizhu He',
            'fang-liu': 'Fang Liu'
        }
    }
    # "paper-id author-id" keys for every pair except the excluded yang-feng.
    self.test_keys = []
    for k, info in self.test_papers.items():
        for a in info.keys():
            if a == "yang-feng":
                continue
            self.test_keys.append(k + " " + a)
    self.papers = {}
    self.short_papers = {}
    for k, info in papers_dict.items():
        # NOTE(review): nesting reconstructed from a collapsed line — assumes
        # short_papers is the test/incomplete subset while papers holds all
        # parsed papers; confirm against the original file.
        if k in self.test_papers or k in self.incomplete:
            self.short_papers[k] = Paper(**papers_dict[k])
        self.papers[k] = Paper(**papers_dict[k])
        for a, aff_info in info["affiliations"].items():
            if a == "yang-feng":
                continue
            # Debug dump for papers with unexpected affiliation "type" data.
            if "type" in aff_info and len(aff_info["type"]) > 2:
                print(k)
                print(info)
                break
    self.default_args = dict(author_cutoff=0, drop_null_authors=False)
    self.log_path = os.getcwd() + '/createPairTests/logs/'
def test_should_instantiate_paper_with_empty_text_field(self):
    """Default construction yields a Paper whose text is the empty string."""
    sheet = Paper()
    self.assertIsInstance(sheet, Paper)
    self.assertEqual('', sheet.text)
# Top-level driver: load config + all disambiguation data, then seed the
# InputHandler with a fixed list of target author ids.
# (This fragment is truncated mid-list; the remainder continues elsewhere.)
print("INFO: Starting Create Data")
gc.collect()
config_raw = json.load(open("config.json"))
config = ConfigHandler(config_raw, "disambiguate", raise_error_unknown=True)
data = loadData([
    "department_corpus", "incomplete_papers", "org_corpus", "conflicts",
    "parsed_papers", "same_names", "test_special_keys", "author_papers",
    "id_to_name"
], config.logger, config)
author_papers = data["author_papers"]
id_to_name = data["id_to_name"]
same_names = data["same_names"]
parsed = data["parsed_papers"]
# Convert raw paper dicts into Paper objects.
parsed = {x: Paper(**info) for x, info in parsed.items()}
org_corpus = data["org_corpus"]
department_corpus = data["department_corpus"]
incomplete = data["incomplete_papers"]
special_keys = data["test_special_keys"]
input_handler = InputHandler(parsed, author_papers, id_to_name,
                             **config["InputHandler"])
# input_handler.handleUserInput()
# Hard-coded targets used instead of interactive input.
input_handler.targets = [
    "francisco-m-couto1",
    "qin-lu1",
    "manuel-carlos-diaz-galiano1",
    "luis-nieto-pina1",
    "yang-liu",
    "luciano-del-corro1",
    "izzeddin-gur1",
def setUp(self):
    """Provide a blank paper and a factory No. 2 HB pencil."""
    self.paper = Paper()
    self.pencil = pencil_factory.get_no2_hb()
def __init__(self, papers=None, author_papers=None, compare_args=None, id_to_name=None,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG,
             log_format=None, log_path=None, save_data=False, ext_directory=False,
             save_path=None, threshold=.2, name_similarity_cutoff=.92,
             str_algorithm="jaro-similarity", model=None, model_name="VC1",
             model_path=None, create_new_author=False, compare_cutoff=3,
             tie_breaker="max", cores=4, DEBUG_MODE=False, sim_overrides=False,
             allow_authors_not_in_override=True, same_paper_diff_people=True,
             use_probabilities=False):
    """Initialize author disambiguation: logger, model, validated data inputs.

    Missing papers / author_papers / id_to_name are located on disk via
    self._findData. In DEBUG_MODE all validation is skipped and missing
    inputs default to empty containers.

    Raises:
        TypeError: If a passed mapping argument is not a dict.
        ValueError: If an argument is absent and no file fallback is found,
            or the found/passed data is empty.
    """
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("author_disambiguation", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.model = model
    self.model_name = model_name
    if self.model is None:
        if not model_path:
            model_path = os.getcwd()
        # NOTE(review): file handle is never closed, and unpickling an
        # on-disk model executes arbitrary code if the file is untrusted.
        self.model = pickle.load(open("{}/models/{}/model.pickle".format(model_path, model_name), "rb"))
    try:
        # Probabilities are incompatible with hard-voting ensembles.
        if self.model.voting == "hard" and use_probabilities:
            self.logger.warning("hard voting does not support probabilities")
            self.use_probabilities = False
        else:
            self.use_probabilities = use_probabilities
    except Exception as e:
        # Model without a .voting attribute: fall back to no probabilities.
        self.logger.debug("model does not have voting")
        self.use_probabilities = False
    if not DEBUG_MODE:
        # Argument validation
        if compare_args and not isinstance(compare_args, dict):
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(TypeError("compare_args is not a dict"))
            raise TypeError("compare_args is not a dict")
        elif not compare_args:
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(ValueError("compare_args is None"))
            raise ValueError("compare_args is None")
        else:
            self.compare_args = compare_args
        if author_papers and (not isinstance(author_papers, dict)
                              and not isinstance(author_papers, defaultdict)):
            self.logger.error("passed author_papers is not valid")
            self.logger.error("type is {}".format(type(author_papers)))
            self.logger.exception(TypeError("author_papers is not a dict"))
            raise TypeError("author_papers is not a dict")
        elif not author_papers:
            # Fall back to the on-disk author_papers.json.
            author_papers, status, error_msg = self._findData("author_papers.json")
            if status != 0:
                self.logger.error(
                    "passed author_papers is not valid and could not find the file author_papers.json")
                self.logger.error("self._findData(\"author_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid author_papers found"))
                raise ValueError("No valid author_papers found")
            else:
                self.author_papers = deepcopy(author_papers)
        else:
            self.author_papers = deepcopy(author_papers)
        if papers and not isinstance(papers, dict):
            self.logger.error("passed papers is not valid")
            self.logger.exception(TypeError("papers is not a dict"))
            raise TypeError("papers is not a dict")
        elif not papers:
            # Fall back to the on-disk parsed_papers.json.
            papers, status, error_msg = self._findData("parsed_papers.json")
            if status != 0:
                self.logger.error("passed papers is not valid and could not find the file parsed_papers.json")
                self.logger.error("self._findData(\"parsed_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid parsed_papers found"))
                raise ValueError("No valid parsed_papers found")
            else:
                if len(papers) == 0:
                    self.logger.exception(ValueError("Found papers is empty"))
                    raise ValueError("Found papers is empty")
                self.logger.debug("Converting papers from dict to Paper object")
                self.papers = {}
                for k, info in papers.items():
                    self.papers[k] = Paper(**info)
        else:
            if len(papers) == 0:
                self.logger.exception(ValueError("Passed papers is empty"))
                raise ValueError("Passed papers is empty")
            test_key = list(papers.keys())[0]
            # NOTE(review): this inspects a *key* for being a dict; keys come
            # from papers.keys(), so the conversion branch looks unreachable —
            # likely papers[test_key] was intended. Confirm before changing.
            if isinstance(test_key, dict):
                self.papers = {}
                for k, info in papers.items():
                    try:
                        self.papers[k] = Paper(**info)
                    except Exception as e:
                        self.logger.error("Exception raised when converting paper dicts to Paper")
                        self.logger.error("k={}".format(k))
                        self.logger.error("info={}".format(info))
                        self.logger.exception(e)
                        raise e
            else:
                self.papers = papers
        if id_to_name and not isinstance(id_to_name, dict):
            self.logger.error("passed id_to_name is not valid")
            self.logger.exception(TypeError("id_to_name is not a dict"))
            raise TypeError("id_to_name is not a dict")
        elif not id_to_name:
            # Fall back to the on-disk id_to_name.json.
            id_to_name, status, error_msg = self._findData("id_to_name.json")
            if status != 0:
                # NOTE(review): message says parsed_papers.json but this
                # branch handles id_to_name.json — copy-paste slip.
                self.logger.error("passed id_to_name is not valid and could not find the file parsed_papers.json")
                self.logger.error("self._findData(\"id_to_name.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid id_to_name found"))
                raise ValueError("No valid id_to_name found")
            else:
                if len(id_to_name) == 0:
                    self.logger.exception(ValueError("Found id_to_name is empty"))
                    raise ValueError("Found id_to_name is empty")
                self.id_to_name = id_to_name
        else:
            if len(id_to_name) == 0:
                self.logger.exception(ValueError("Passed id_to_name is empty"))
                raise ValueError("Passed id_to_name is empty")
            self.id_to_name = id_to_name
    else:
        # DEBUG_MODE: accept whatever was passed, defaulting to empties.
        printLogToConsole(self.console_log_level, "RUNNING IN DEBUG_MODE!",
                          logging.WARNING)
        self.logger.warning("Running in DEBUG_MODE")
        self.id_to_name = id_to_name if id_to_name else {}
        self.papers = papers if papers else {}
        self.compare_args = compare_args if compare_args else {}
        self.author_papers = author_papers if author_papers else {}
    self.compare_terms = len(CompareAuthors.compare_terms)
    self.save_data = save_data
    self.save_dir = save_path
    self.ext_directory = ext_directory
    self.threshold = threshold
    self.name_similarity_cutoff = name_similarity_cutoff
    # str_algorithm is "name-measure", e.g. "jaro-similarity".
    algo_name, measure = str_algorithm.split("-")
    self.author_name = {x: nameFromDict(self.id_to_name[x]) for x in self.id_to_name.keys()}
    self.cores = cores
    self.str_algorithm = getAlgo(algo_name, measure)
    self.create_new_author = create_new_author
    self.compare_cutoff = compare_cutoff
    self.tie_breaker = tie_breaker
    self.sim_overrides = sim_overrides
    self.allow_authors_not_in_override = allow_authors_not_in_override
    self.same_paper_diff_people = same_paper_diff_people
    self.logger.debug("AuthorDisambiguation initialized with arguments:")
    self.logger.debug("\tcompare_args={}".format(list(self.compare_args.keys())))
    self.logger.debug("\talgorithm={}".format(algo_name))
    self.logger.debug("\tmeasure={}".format(measure))
    self.logger.debug("\tthreshold={}".format(threshold))
    self.logger.debug("\tname_similarity_cutoff={}".format(name_similarity_cutoff))
    self.logger.debug("\tunique authors={}".format(len(self.author_papers)))
    self.logger.debug("\tcompare_cutoff={}".format(self.compare_cutoff))
    self.logger.debug("\ttie_breaker={}".format(self.tie_breaker))
    self.logger.debug("\tsim_overrides={}".format(self.sim_overrides))
    self.logger.debug("\tsame_paper_diff_people={}".format(self.same_paper_diff_people))
    self.logger.debug("\tuse_probabilities={}".format(self.use_probabilities))
    if self.compare_cutoff != 3:
        self.logger.warning("Non-default value for compare_cutoff, currently this is not implemented")