def _getAuthorInfo(self, a):
    """Print the id, cleaned display name, papers, and same-name count for author ``a``.

    Args:
        a: Author id; must be a key of ``self.id_to_name`` and
            ``self.author_papers``.
    """
    self.logger.debug("Getting info for {}".format(a))
    # NOTE(review): replace(" ", " ") is a no-op; likely intended to collapse
    # double spaces ("  " -> " ") left by the regex sub — confirm against the
    # original source. Kept byte-identical to match __init__'s name cleaning.
    name = cleanName(
        remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a])).replace(
            " ", " ")).replace(" ", " ")
    printLogToConsole(self.console_log_level, "id={}".format(a),
                      logging.INFO, logger=self.logger)
    printLogToConsole(self.console_log_level, "name={}".format(name),
                      logging.INFO, logger=self.logger)
    printLogToConsole(self.console_log_level, "Papers for {}:".format(a),
                      logging.INFO, logger=self.logger)
    for p in self.author_papers[a]:
        if p not in self.papers:
            continue
        # Papers may be stored either as objects (attribute access) or as
        # plain dicts (key access).
        # BUG FIX: was a bare ``except:`` that swallowed every exception,
        # including KeyboardInterrupt; only the attribute miss is expected.
        try:
            title = self.papers[p].title
        except AttributeError:
            title = self.papers[p]["title"]
        printLogToConsole(self.console_log_level,
                          "\t{}\t{}".format(p, title), logging.INFO,
                          logger=self.logger)
    printLogToConsole(self.console_log_level,
                      "{} Author(s) have this name".format(
                          len(self.names[name])), logging.INFO,
                      logger=self.logger)
def _genAuthorOverride(self, a):
    """Populate ``self.override_authors[a]`` with every other id sharing a's name.

    Does nothing if ``a`` already has override authors, is not a target, or is
    the only author with its name.

    Args:
        a: Target author id.
    """
    # BUG FIX: the original message read "Received clear override command"
    # (wrong verb — this command generates overrides) and called .format(a)
    # with no placeholder, silently dropping the id.
    self.logger.debug("Received generate override command for {}".format(a))
    if a in self.override_authors:
        printLogToConsole(
            self.console_log_level,
            "{} already has authors to compare with".format(a),
            logging.INFO, logger=self.logger)
        return
    elif a not in self.targets:
        printLogToConsole(self.console_log_level,
                          "{} is not a target".format(a), logging.INFO,
                          logger=self.logger)
        return
    # NOTE(review): replace(" ", " ") is a no-op; likely intended "  " -> " "
    # — confirm. Kept byte-identical so the key matches self.names as built
    # by __init__.
    name = cleanName(
        remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a])).replace(
            " ", " ")).replace(" ", " ")
    print("INFO: Other authors with the same name:")
    for other_a in self.names[name]:
        if other_a != a:
            print("INFO: {}".format(other_a))
    if len(self.names[name]) == 1:
        printLogToConsole(
            self.console_log_level,
            "{} only has {}, will not add authors to compare with".format(
                name, a), logging.INFO, logger=self.logger)
    else:
        self.override_authors[a] = [x for x in self.names[name] if x != a]
        # BUG FIX: report the count actually added; ``a`` itself is excluded,
        # so len(self.names[name]) over-counted by one.
        self.logger.debug("{} authors added to override_authors".format(
            len(self.override_authors[a])))
def compareInfoDict(self, actual, expected):
    """Assert that a generated author-info dict matches the expected fixture.

    Scalar name/title/affiliation fields are compared after ``cleanName``
    normalization (mirroring production); list-valued fields are checked by
    membership rather than order. On mismatch, the offending values are
    printed before the assertion error propagates, to ease debugging.

    Args:
        actual: The dict produced by the code under test.
        expected: The fixture dict with raw (uncleaned) values.
    """
    # assertIn gives better failure messages than assertTrue(x in y).
    self.assertIn("name", actual)
    self.assertEqual(actual["name"], cleanName(expected["name"]))
    self.assertIn("co_authors_name", actual)
    expected_names = [cleanName(x) for x in expected["co_authors_name"]]
    for n in actual["co_authors_name"]:
        self.assertIn(n, expected_names)
    self.assertIn("aff_name", actual)
    if not expected["aff_name"]:
        # Falsy expected value (None/"") is compared as-is; cleanName is only
        # applied to real names.
        self.assertEqual(actual["aff_name"], expected["aff_name"])
    else:
        self.assertEqual(actual["aff_name"], cleanName(expected["aff_name"]))
    self.assertIn("address", actual)
    try:
        self.assertDictEqual(actual["address"], expected["address"])
    except Exception:
        print(actual["address"])
        print(expected["address"])
        # BUG FIX: bare ``raise`` preserves the original traceback;
        # ``raise e`` rewrote it at this line.
        raise
    self.assertIn("title", actual)
    self.assertEqual(actual["title"], cleanName(expected["title"]))
    list_keys = [
        "title_tokenized", "co_authors_id", "department", "co_authors_email",
        "co_authors_aff", "co_authors_aff_type", "citations",
        "citations_tokenized", "sections", "sections_tokenized"
    ]
    str_keys = ["aff_type", "email_user", "email_domain"]
    for k in list_keys:
        try:
            self.assertIn(k, actual)
        except Exception:
            print(k)
            raise
        for i in actual[k]:
            try:
                self.assertIn(i, expected[k])
            except Exception:
                print(actual["name"])
                print(expected["name"])
                print(i)
                print(expected[k])
                raise
    for k in str_keys:
        self.assertIn(k, actual)
        self.assertEqual(actual[k], expected[k])
def __init__(self, papers, author_papers, id_to_name,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG,
             log_format=None, log_path=None, target_path=None,
             save_data=False, ext_directory=False, save_path=None, cores=4):
    """Interactive handler for selecting disambiguation targets.

    Args:
        papers: Mapping of paper id -> paper data.
        author_papers: Mapping of author id -> list of paper ids.
        id_to_name: Mapping of author id -> name dict.
        console_log_level: Level for console logging.
        file_log_level: Level for file logging.
        log_format: logging format string; a default is used if falsy.
        log_path: Log file path; defaults to ``<cwd>/logs/disambiguation.log``.
        target_path: Optional ``.json`` (dict of target -> overrides) or
            ``.txt`` (one target id per line) file of initial targets.
        save_data: Whether to save data.
        ext_directory: Whether to split saved output by extension.
        save_path: Where to save output.
        cores: Worker-process count used by parallel steps.

    Raises:
        ValueError: If ``target_path`` has an unsupported extension.
    """
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("input_handler", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.papers = papers
    self.author_papers = author_papers
    self.id_to_name = id_to_name
    self.names = defaultdict(list)
    for k, name in id_to_name.items():
        # NOTE(review): replace(" ", " ") is a no-op; likely intended
        # "  " -> " " — confirm against the original source.
        name_cleaned = cleanName(
            remove_weird_notes.sub(" ", nameFromDict(name)).replace(
                " ", " ")).replace(" ", " ")
        self.names[name_cleaned].append(k)
    self.save_data = save_data
    self.save_path = save_path
    self.ext_directory = ext_directory
    # BUG FIX: ``cores`` was accepted but never stored.
    self.cores = cores
    self.override_authors = {}
    # BUG FIX: self.targets was only initialized in the no-target_path branch,
    # so loading a .json target file crashed on self.targets.append(k).
    self.targets = []
    if not target_path:
        self.logger.debug("No path was passed for target_path")
    else:
        self.logger.debug("Opening {}".format(target_path))
        ext = target_path.split(".")[-1]
        if ext == "json":
            self.logger.debug("Parsing json...")
            try:
                with open(target_path) as f:
                    targets_dict = json.load(f)
            except FileNotFoundError:
                self.logger.debug(
                    "File path was not found, trying to open with adding os.getcwd()"
                )
                # BUG FIX: the original condition was inverted — it prefixed
                # "/" only when the path already started with "/".
                if not target_path.startswith("/"):
                    target_path = "/" + target_path
                with open(os.getcwd() + target_path) as f:
                    targets_dict = json.load(f)
            for k, v in targets_dict.items():
                self.logger.debug("Found target {}".format(k))
                self.targets.append(k)
                self.override_authors[k] = v
        elif ext == "txt":
            self.logger.debug("Parsing txt file...")
            try:
                with open(target_path) as f:
                    self.targets = [x.strip() for x in f.readlines()]
            except FileNotFoundError:
                self.logger.debug(
                    "File path was not found, trying to open with adding os.getcwd()"
                )
                # BUG FIX: same inverted "/"-prefix condition as above.
                if not target_path.startswith("/"):
                    target_path = "/" + target_path
                with open(os.getcwd() + target_path) as f:
                    self.targets = [x.strip() for x in f.readlines()]
        else:
            self.logger.error("File type {} is not supported".format(ext))
            raise ValueError("File type {} is not supported".format(ext))
        self.logger.debug("Found {} targets".format(len(self.targets)))
        self.logger.debug("Found {} overrides".format(
            len(self.override_authors)))
    # Main-menu commands: key -> required argument, description, and handler.
    self.valid_main_commands = {
        "t": {
            "required": "target-id",
            "desc": "Specify target",
            "action": self._addTarget
        },
        "d": {
            "required": None,
            "desc": "Display a target or all targets",
            "action": self._displayTargets
        },
        "r": {
            "required": None,
            "desc": "Remove a target",
            "action": self._removeTarget
        },
        "g": {
            "required": "target-id",
            "desc": "Generate authors to compare with based on their name",
            "action": self._genAuthorOverride
        },
        "c": {
            "required": "target-id",
            "desc": "Clear target's authors to compare with",
            "action": self._clearAuthorOverride
        },
        "o": {
            "required": None,
            "desc": "Display override authors",
            "action": self._displayOverride
        },
        "h": {
            "required": None,
            "desc": "Help",
            "action": self._printHelp
        },
        "s": {
            "required": None,
            "desc": "Save targets",
            "action": self._save
        },
        "e": {
            "required": None,
            "desc": "Finish and continue",
            "action": None
        },
    }
    # Per-target submenu commands (no handlers; dispatched elsewhere).
    self.valid_target_commands = {
        "a": {
            "required": "target-id",
            "optional": None,
            "desc": "Specify author to compare target to"
        },
        "g": {
            "required": "Author Name",
            "optional": None,
            "desc": "Generate a list of authors to compare to by their name"
        },
        "d": {
            "required": None,
            "optional": None,
            "desc": "Display list of authors"
        },
        "r": {
            "required": "author-id",
            "optional": None,
            "desc": "Remove an author-id"
        },
        "e": {
            "required": None,
            "optional": None,
            "desc": "Finish editing target"
        }
    }
def _makeAmbiguousAuthors(self, has_authors, needs_authors, override_authors):
    """Build the bookkeeping structures for ambiguous (target) authors.

    Targets are popped out of ``self.author_papers`` / ``self.author_name``;
    authors to compare against come either from ``override_authors`` (for
    ``has_authors``) or from a parallel name-similarity search (for
    ``needs_authors``).

    Args:
        has_authors: Target ids that already have override authors.
        needs_authors: Target ids whose comparison authors must be found.
        override_authors: Mapping of target id -> list of author ids.

    Returns:
        Tuple of (ambiguous_author_papers, ambiguous_author_names,
        check_author_keys, authors_get_info, excluded).

    Raises:
        ValueError: If a ``needs_authors`` target already has overrides.
    """
    ambiguous_author_papers = defaultdict(list)
    ambiguous_author_names = dict()
    authors_get_info = list()
    check_author_keys = defaultdict(list)
    excluded = []
    for i in [*has_authors, *needs_authors]:
        # Remove the ambiguous author from the live mappings while keeping
        # its papers/name locally.
        ambiguous_author_papers[i] = self.author_papers.pop(i)
        try:
            ambiguous_author_names[i] = cleanName(nameFromDict(self.id_to_name[i])).lower()
            del self.author_name[i]
        except KeyError:
            self.logger.warning("{} is not in id_to_name".format(i))
            excluded.append(i)
    for a in has_authors:
        if a in excluded:
            self.logger.debug("Skipping {} because it is in excluded".format(a))
            continue
        authors_get_info.extend(override_authors[a])
        check_author_keys[a] = self._makeCheckAuthors(override_authors[a])
    args = []
    for a in needs_authors:
        if a in excluded:
            self.logger.debug("Skipping {} because it is in excluded".format(a))
            continue
        args.append([a, ambiguous_author_names[a], self.author_name,
                     self.str_algorithm, self.name_similarity_cutoff,
                     self.sim_overrides])
    printLogToConsole(self.console_log_level,
                      "Getting similar authors in parallel with {} cores".format(self.cores),
                      logging.INFO)
    self.logger.info("Getting similar authors in parallel with {} cores".format(self.cores))
    sim_authors = []
    # BUG FIX: the pool variable was named ``Pool``, shadowing mp.Pool.
    with mp.Pool(self.cores) as pool:
        imap_results = list(tqdm(pool.imap_unordered(self._getSimilarAuthors, args),
                                 total=len(args), file=sys.stdout))
        for target, auth, warnings, debug in imap_results:
            self.logger.debug("Adding authors from {}".format(target))
            self.logger.debug("len(auth)={}".format(len(auth)))
            sim_authors.append([target, auth])
            # Workers cannot log directly; replay their messages here.
            for i in warnings:
                self.logger.warning(i)
            for i in debug:
                self.logger.debug(i)
    self.logger.debug("len(sim_authors)={}".format(len(sim_authors)))
    pbar = tqdm(total=len(sim_authors), file=sys.stdout)
    for a, auths in sim_authors:
        if a in override_authors:
            # BUG FIX: the message was built twice and passed through
            # logger.exception outside an except block (which logs no useful
            # traceback there); build once, log as error, then raise.
            msg = "{} is in need authors, but is already in override_authors".format(a)
            self.logger.error(msg)
            raise ValueError(msg)
        pbar.write("INFO: Checking similar authors to {}".format(a))
        self.logger.info("Checking similar authors to {}".format(a))
        if len(auths) == 0:
            self.logger.warning("{} has no similar authors".format(a))
            excluded.append(a)
        else:
            authors_get_info.extend(auths)
            check_author_keys[a] = self._makeCheckAuthors(auths)
            if len(check_author_keys[a]) == 0:
                self.logger.debug("{} had at least 1 similar author, but nothing in check author keys".format(a))
        pbar.update()
    pbar.close()
    # De-duplicate while discarding order.
    authors_get_info = list(set(authors_get_info))
    return ambiguous_author_papers, ambiguous_author_names, check_author_keys, authors_get_info, excluded
def _getSimilarAuthors(args):
    """Find author ids whose names look similar to a target author's name.

    Designed to run in a worker process: all state arrives through ``args``
    and log lines are returned instead of emitted.

    Args:
        args: Sequence of [target_id, target_author (display name),
            author_name (id -> name mapping), str_algorithm (similarity
            function returning 0..1), name_similarity_cutoff (0..1),
            sim_overrides (bool: let a high id-similarity score override
            initials checks)].

    Returns:
        Tuple of (original target id, similar author ids, warning messages,
        debug messages).
    """
    target_id, target_author, author_name, str_algorithm, name_similarity_cutoff, sim_overrides = args
    out = []
    # Initials are taken before lowercasing, as in the original flow;
    # callers pass an already-lowercased name.
    target_initials = [w[0] for w in target_author.split()]
    target_author = target_author.lower()
    # BUG FIX: strings are immutable, so the deepcopy here was pointless;
    # a plain reference preserves the original id.
    old_target_id = target_id
    target_id = remove_numbers.sub("", target_id)
    warnings = []
    debug = []
    authors_use = []
    # First-letter prefilter keeps the expensive similarity checks cheap.
    # BUG FIX: removed a ``try: ... except Exception as e: raise e`` wrapper
    # around the first-letter read — it re-raised unconditionally, adding
    # nothing but a rewritten traceback.
    for _id, name in author_name.items():
        first_letter = name[0].lower()
        name = name.lower()
        if first_letter == target_author[0].lower():
            authors_use.append([_id, name])
    debug.append("{} authors with the same first letter as {}".format(len(authors_use), target_id))
    for _id, name in authors_use:
        cleaned_name = cleanName(name).lower()
        # Truncate the candidate id to the same number of parts as the target
        # has initials before comparing ids.
        tmp_id = "-".join(_id.split("-")[:len(target_initials)])
        pass_sim_test = False
        if str_algorithm(target_id, tmp_id) * 100 >= name_similarity_cutoff * 100:
            pass_sim_test = True
        override_with_sim = sim_overrides and pass_sim_test
        # Do not override the first name check b/c the first name check prevents authors with the targets name in
        # their name from being used.
        # For example:
        # target is yang-liu
        # the author it is looking at is luyang-liu.
        # It would pass the similarity test, but we know it is not the same because the first name is
        if str_algorithm(cleaned_name.split()[0], target_author.split()[0]) * 100 < name_similarity_cutoff * 100:
            if pass_sim_test:
                warnings.append(
                    "{} passed the similarity test, but does not have the same first name".format(_id))
                warnings.append("author name ={}".format(name))
            continue
        # For the initials, override does have an affect due to some people having weird notes in their name.
        # For example:
        # yang-liu-georgetown's name is Yang (Janet) Liu
        # For the time being, clean name does not remove the (Janet) from the name (might change later)
        # So yang-liu-georgetown's initials are [y,j,l]. But we WANT to compare this to the target of yang-liu,
        # so we override it
        cleaned_initials = [w[0] for w in cleaned_name.split()]
        same_initials = True
        if len(cleaned_initials) != len(target_initials) and not override_with_sim:
            if pass_sim_test:
                warnings.append(
                    "{} passed the similarity test, but does not have the same number of initials".format(_id))
                warnings.append("target name ={}".format(target_id))
                warnings.append("author name ={}".format(name))
            continue
        for i in range(min(len(target_initials), len(cleaned_initials))):
            if target_initials[i] != cleaned_initials[i]:
                same_initials = False
                break
        if not same_initials and not override_with_sim:
            if pass_sim_test:
                warnings.append("{} passed the similarity test, but does not have the same initials".format(_id))
                warnings.append("target name ={}".format(target_id))
                warnings.append("author name ={}".format(name))
            continue
        if pass_sim_test:
            if override_with_sim and not same_initials:
                debug.append("{} was added due to overriding with sim score".format(_id))
            debug.append("{} is similar to {}".format(_id, target_id))
            out.append(_id)
    debug.append("Found {} similar authors".format(len(out)))
    return old_target_id, out, warnings, debug
def getAuthorInfo(args):
    """Extract the disambiguation feature dict for one (paper, author) pair.

    Args:
        args: Tuple of (paper, author) where ``paper`` is a Paper-like object
            exposing pid/authors/affiliations/title/... and ``author`` is the
            id of the author of interest (must appear in
            ``paper.affiliations``).

    Returns:
        Tuple of (pair_key, out) where ``pair_key`` is ``"<pid> <author>"``
        and ``out`` maps feature names to values; the ``co_authors_*`` lists
        are index-aligned per co-author.
    """
    paper, author = args
    pair_key = paper.pid + " " + author
    out = {
        "pid": paper.pid,
        "name": cleanName(paper.authors[author]),
        "co_authors_id": [],
        "co_authors_name": [],
        "co_authors_email": [],
        "co_authors_aff_type": [],
        "co_authors_aff": []
    }
    for a in paper.authors.keys():
        if a == author:
            continue
        out["co_authors_id"].append(a)
        out["co_authors_name"].append(cleanName(paper.authors[a]))
        if a in paper.affiliations:
            auth_aff = paper.affiliations[a]
            if auth_aff["email"]:
                out["co_authors_email"].append(auth_aff["email"].split("@"))
            else:
                out["co_authors_email"].append([None, None])
            # BUG FIX: was a bare ``except:``; the affiliation type can only
            # legitimately fail with a missing key/empty list (or a non-dict).
            try:
                auth_aff_type = auth_aff["affiliation"]["type"][0]
            except (IndexError, KeyError, TypeError):
                auth_aff_type = None
            if auth_aff_type:
                out["co_authors_aff"].append(
                    cleanName(
                        auth_aff["affiliation"]["info"][auth_aff_type][0]))
            else:
                out["co_authors_aff"].append(None)
            out["co_authors_aff_type"].append(auth_aff_type)
        else:
            # BUG FIX: the original appended to co_authors_aff TWICE in this
            # branch, misaligning it with the other co_authors_* lists.
            out["co_authors_aff"].append(None)
            out["co_authors_email"].append([None, None])
            out["co_authors_aff_type"].append(None)
    aff_info = paper.affiliations[author]["affiliation"]
    email = paper.affiliations[author]["email"]
    if email:
        email = email.split("@")
        out["email_user"] = email[0]
        # A malformed address (no single "@") yields no domain.
        out["email_domain"] = email[1] if len(email) == 2 else None
    else:
        out["email_user"] = None
        out["email_domain"] = None
    try:
        out["aff_type"] = aff_info["type"][0]
    except (IndexError, KeyError):
        out["aff_type"] = None
    if out["aff_type"]:
        out["aff_name"] = cleanName(aff_info["info"][out["aff_type"]][0])
        out["department"] = aff_info["info"]["department"]
    else:
        out["aff_name"] = None
        out["department"] = []
    try:
        # Note: this mutates the address dict held by the paper in place.
        out["address"] = aff_info["address"]
        if out["address"]["settlement"]:
            out["address"]["settlement"] = cleanName(
                out["address"]["settlement"])
        if out["address"]["country"]:
            out["address"]["country"] = cleanName(out["address"]["country"])
    except (IndexError, KeyError):
        out["address"] = {}
    out["title"] = cleanName(paper.title)
    out["title_tokenized"] = paper.title_tokenized
    out["citations"] = paper.citations
    out["citations_tokenized"] = paper.citations_tokenized
    out["sections"] = paper.sections
    out["sections_tokenized"] = paper.sections_tokenized
    return pair_key, out