def __init__(self, papers, id_to_name, author_papers, treat_id_different_people=False,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG,
             log_format=None, log_path=None, raise_error=False, skip_error_papers=False,
             one_target_per_paper=False, save_data=False, ext_directory=False,
             save_path=None, cores=4, remove_all_papers=False):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("disambiguator", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.treat_id_different_people = treat_id_different_people

    # If the values are already Paper objects, use the passed dict as-is;
    # otherwise convert each paper dict into a Paper object.
    self.papers = {}
    for k, p in papers.items():
        if isinstance(p, Paper):
            self.papers = papers
            break
        self.papers[k] = Paper(**p)

    self.id_to_name = deepcopy(id_to_name)
    self.author_papers = deepcopy(author_papers)
    self.author_id_suffix = Counter()
    self.raise_error = raise_error
    self.error_papers = set()
    self.new_papers = {}
    self.new_author_papers = defaultdict(list)
    self.new_id_to_name = {}
    self.old_ids = set()
    self.skip_errors = skip_error_papers
    self.one_per_paper = one_target_per_paper
    self.save_data = save_data
    self.ext_directory = ext_directory
    self.save_path = save_path
    self.cores = cores
    self.remove_all_papers = remove_all_papers
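# Usage sketch (illustrative, not from the original source): the class name
# `Disambiguator` is an assumption inferred from the logger name; the file
# names match the defaults used by _findData() elsewhere in this module.
#
#     import json
#     papers = json.load(open("parsed_papers.json"))         # {paper_id: paper fields}
#     id_to_name = json.load(open("id_to_name.json"))        # {author_id: name dict}
#     author_papers = json.load(open("author_papers.json"))  # {author_id: [paper_id, ...]}
#     d = Disambiguator(papers, id_to_name, author_papers,
#                       save_data=True, save_path="data", cores=8)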
def __init__(self, config_dict, log_file, file_log_level=logging.DEBUG,
             console_log_level=logging.WARNING, log_format=None,
             raise_error_unknown=False):
    self.config_dict = {}
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    self.config_dict["log_format"] = log_format
    if "log path" not in config_dict:
        log_path = os.getcwd() + "/logs/{}.log".format(log_file)
        self.config_dict["log path"] = log_path
    else:
        log_path = config_dict["log path"]
        if "\\" in log_path:
            print("ERROR: log path={}".format(log_path))
            raise ValueError("\\ in the log path, currently file paths with '\\' are not supported")
        if log_path[0] != "/":
            log_path = "/" + log_path
        if log_path[-1] != "/":
            log_path = log_path + "/"
        log_path = os.getcwd() + log_path + "{}.log".format(log_file)
        del config_dict["log path"]
        self.config_dict["log path"] = log_path
    self.raise_error_unknown = raise_error_unknown
    self.logger = createLogger("config_handler", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.logger.debug("Parsing config.json")
    self.config_dict = {**self.config_dict, **config_dict}
    self.configs = {
        "shared": {},
        "pdf_parser": {},
        "acl_parser": {},
        "create_training": {},
        "vote_classifier": {},
        "author_disambiguation": {},
        "target_creator": {},
        "input_handler": {},
        "paths": {},
    }
    self.configs["shared"]["log_path"] = self.config_dict["log path"]
    for k, v in config_dict.items():
        if k == "log path":
            continue
        tmp_k = k.replace(" ", "_")
        if tmp_k in self.excluded_keys:
            self.logger.debug("{} is in excluded, skipping it".format(tmp_k))
        else:
            self.addArgument(tmp_k, v)
    if "save_path" not in self.configs["shared"]:
        raise KeyError("save_path is not in shared config")
    self._createExtraPaths()
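# Usage sketch (illustrative, not from the original source): the class name
# `ConfigHandler` is an assumption inferred from the logger name
# "config_handler". Config keys may contain spaces, which are converted to
# underscores during parsing; it is assumed addArgument() routes "save path"
# into the shared config, since a missing shared save_path raises a KeyError.
#
#     config = {"log path": "logs/", "save path": "data/", "cores": 4}
#     handler = ConfigHandler(config, "run", raise_error_unknown=True)
#     shared = handler.configs["shared"]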
def __init__(self, papers, author_papers, id_to_name, console_log_level=logging.ERROR,
             file_log_level=logging.DEBUG, log_format=None, log_path=None,
             target_path=None, save_data=False, ext_directory=False, save_path=None,
             cores=4):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("input_handler", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.papers = papers
    self.author_papers = author_papers
    self.id_to_name = id_to_name
    self.names = defaultdict(list)
    for k, name in id_to_name.items():
        # Collapse the double spaces left behind when notes are stripped from names
        name_cleaned = cleanName(
            remove_weird_notes.sub(" ", nameFromDict(name)).replace("  ", " ")
        ).replace("  ", " ")
        self.names[name_cleaned].append(k)
    self.save_data = save_data
    self.save_path = save_path
    self.ext_directory = ext_directory
    self.override_authors = {}
    self.targets = []
    if not target_path:
        self.logger.debug("No path was passed for target_path")
    else:
        self.logger.debug("Opening {}".format(target_path))
        if target_path.split(".")[-1] == "json":
            self.logger.debug("Parsing json...")
            try:
                targets_dict = json.load(open(target_path))
            except FileNotFoundError:
                self.logger.debug("File path was not found, trying to open with adding os.getcwd()")
                # Ensure a leading "/" so the path joins cleanly onto os.getcwd()
                target_path = "/" + target_path if target_path[0] != "/" else target_path
                targets_dict = json.load(open(os.getcwd() + target_path))
            for k, v in targets_dict.items():
                self.logger.debug("Found target {}".format(k))
                self.targets.append(k)
                self.override_authors[k] = v
        elif target_path.split(".")[-1] == "txt":
            self.logger.debug("Parsing txt file...")
            try:
                self.targets = [x.strip() for x in open(target_path).readlines()]
            except FileNotFoundError:
                self.logger.debug("File path was not found, trying to open with adding os.getcwd()")
                target_path = "/" + target_path if target_path[0] != "/" else target_path
                self.targets = [x.strip() for x in open(os.getcwd() + target_path).readlines()]
        else:
            self.logger.error("File type {} is not supported".format(target_path.split(".")[-1]))
            raise ValueError("File type {} is not supported".format(target_path.split(".")[-1]))
        self.logger.debug("Found {} targets".format(len(self.targets)))
        self.logger.debug("Found {} overrides".format(len(self.override_authors)))
    self.valid_main_commands = {
        "t": {"required": "target-id", "desc": "Specify target", "action": self._addTarget},
        "d": {"required": None, "desc": "Display a target or all targets", "action": self._displayTargets},
        "r": {"required": None, "desc": "Remove a target", "action": self._removeTarget},
        "g": {"required": "target-id", "desc": "Generate authors to compare with based on their name", "action": self._genAuthorOverride},
        "c": {"required": "target-id", "desc": "Clear target's authors to compare with", "action": self._clearAuthorOverride},
        "o": {"required": None, "desc": "Display override authors", "action": self._displayOverride},
        "h": {"required": None, "desc": "Help", "action": self._printHelp},
        "s": {"required": None, "desc": "Save targets", "action": self._save},
        "e": {"required": None, "desc": "Finish and continue", "action": None},
    }
    self.valid_target_commands = {
        "a": {"required": "target-id", "optional": None, "desc": "Specify author to compare target to"},
        "g": {"required": "Author Name", "optional": None, "desc": "Generate a list of authors to compare to by their name"},
        "d": {"required": None, "optional": None, "desc": "Display list of authors"},
        "r": {"required": "author-id", "optional": None, "desc": "Remove an author-id"},
        "e": {"required": None, "optional": None, "desc": "Finish editing target"},
    }
def __init__(self, data, classifiers, classifier_weights=None, test_fraction=8,
             save_data=False, ext_directory=False, save_path=None,
             model_save_path='/models/', model_name=None, special_cases=None,
             rand_seed=None, cutoff=None, special_only=False,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG,
             log_format=None, log_path=None, diff_same_ratio=1,
             train_all_estimators=False, voting="hard"):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/dnn.log"
    if not save_path:
        if ext_directory:
            self.save_path = os.getcwd() + "/data/pickle"
        else:
            self.save_path = os.getcwd() + "/data"
    else:
        if ext_directory and "/pickle" not in save_path:
            save_path = save_path + "/pickle"
        self.save_path = save_path
    self.logger = createLogger("DNN", log_path, log_format, console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.test_fraction = test_fraction
    if special_cases is None:
        special_cases = []
    self.special_cases = special_cases
    self.rand_seed = rand_seed
    if not self.rand_seed:
        self.rand_seed = random.randint(0, 9999)
    random.seed(self.rand_seed)
    self.logger.debug("seed is {}".format(self.rand_seed))
    self.cutoff = cutoff
    self.dif_same_ratio = diff_same_ratio
    self.save_data = save_data
    self.special_only = special_only
    self.model_save_path = model_save_path
    self.model_name = model_name
    if not model_name:
        self.model_name = "VC1"
    if not classifier_weights:
        # Default every classifier to an equal weight of 1
        classifier_weights = {}
        for n, _ in classifiers:
            classifier_weights[n] = 1
    if len(classifier_weights) != len(classifiers):
        self.logger.error("len(classifier_weights)={}".format(len(classifier_weights)))
        self.logger.error("len(classifier)={}".format(len(classifiers)))
        raise ValueError("Classifier weights must be the same length as the number of classifiers passed")
    self.classifier_weights = classifier_weights
    self.classifiers = classifiers
    self.estimators = []
    self.model = None
    self.train_all_estimators = train_all_estimators
    self.classifier_params = {}
    self.voting = voting
    self.train, self.test, self.special_test, self.special_train = self._createTrainTest(data)
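# Usage sketch (illustrative, not from the original source): the length check
# above implies `classifiers` is a list of (name, estimator) pairs and
# `classifier_weights` maps each name to its voting weight. The class name
# `VoteClassifier` is an assumption based on the "vote_classifier" config key.
#
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.linear_model import LogisticRegression
#     clfs = [("rf", RandomForestClassifier()), ("lr", LogisticRegression())]
#     vc = VoteClassifier(data, clfs, classifier_weights={"rf": 2, "lr": 1},
#                         voting="soft", rand_seed=42)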
def __init__(self, epochs=3, layers=4, dropout=.1, activation=tf.keras.activations.relu,
             test_fraction=8, funnel=2, optimizer='rmsprop', loss='binary_crossentropy',
             metrics=None, save_path=None, model_save_path='/models/', model_name=None,
             load_model=None, special_cases=None, rand_seed=None, start_neurons=16,
             batch_size=10000, cutoff=None, normalize=True, save_pairs=False,
             special_only=False, console_log_level=logging.ERROR,
             file_log_level=logging.DEBUG, log_format=None, log_path=None,
             diff_same_ratio=1):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/dnn.log"
    if not save_path:
        self.save_path = os.getcwd() + "/data/pickle"
    else:
        self.save_path = save_path
    self.logger = createLogger("DNN", log_path, log_format, console_log_level, file_log_level)
    self.console_log_level = console_log_level
    if special_cases is None:
        special_cases = []
    if metrics is None:
        metrics = [BinaryAccuracy]
    self.special_cases = special_cases
    self.epochs = epochs
    self.funnel = funnel
    self.layers = layers
    self.dropout = dropout
    self.loss = loss
    self.optimizer = optimizer
    self.metrics = metrics
    self.activation = activation
    self.test_fraction = test_fraction
    if rand_seed:
        random.seed(rand_seed)
    self.start_neurons = start_neurons
    self.batch_size = batch_size
    self.cutoff = cutoff
    self.normalize = normalize
    self.dif_same_ratio = diff_same_ratio
    self.save_pairs = save_pairs
    self.special_only = special_only
    self.model = tf.keras.models.Sequential()
    self.train = None
    self.test = None
    self.special_train = None
    self.special_test = None
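# Architecture sketch (an assumption, not the original model-building code):
# `funnel` plausibly shrinks the layer width per layer, so with the defaults
# (start_neurons=16, layers=4, funnel=2) the dense stack would be
# 16 -> 8 -> 4 -> 2 units, each followed by dropout, ending in one sigmoid
# unit for the binary same/different-author decision:
#
#     neurons = self.start_neurons
#     for _ in range(self.layers):
#         self.model.add(tf.keras.layers.Dense(neurons, activation=self.activation))
#         self.model.add(tf.keras.layers.Dropout(self.dropout))
#         neurons = max(1, neurons // self.funnel)
#     self.model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
#     self.model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)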
def __init__(self, papers=None, author_papers=None, compare_args=None, id_to_name=None,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG,
             log_format=None, log_path=None, save_data=False, ext_directory=False,
             save_path=None, threshold=.2, name_similarity_cutoff=.92,
             str_algorithm="jaro-similarity", model=None, model_name="VC1",
             model_path=None, create_new_author=False, compare_cutoff=3,
             tie_breaker="max", cores=4, DEBUG_MODE=False, sim_overrides=False,
             allow_authors_not_in_override=True, same_paper_diff_people=True,
             use_probabilities=False):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("author_disambiguation", log_path, log_format,
                               console_log_level, file_log_level)
    self.console_log_level = console_log_level
    self.model = model
    self.model_name = model_name
    if self.model is None:
        if not model_path:
            model_path = os.getcwd()
        self.model = pickle.load(open("{}/models/{}/model.pickle".format(model_path, model_name), "rb"))
    try:
        if self.model.voting == "hard" and use_probabilities:
            self.logger.warning("hard voting does not support probabilities")
            self.use_probabilities = False
        else:
            self.use_probabilities = use_probabilities
    except Exception:
        self.logger.debug("model does not have voting")
        self.use_probabilities = False
    if not DEBUG_MODE:
        # Argument validation
        if compare_args and not isinstance(compare_args, dict):
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(TypeError("compare_args is not a dict"))
            raise TypeError("compare_args is not a dict")
        elif not compare_args:
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(ValueError("compare_args is None"))
            raise ValueError("compare_args is None")
        else:
            self.compare_args = compare_args

        # defaultdict is a dict subclass, so a single isinstance check suffices
        if author_papers and not isinstance(author_papers, dict):
            self.logger.error("passed author_papers is not valid")
            self.logger.error("type is {}".format(type(author_papers)))
            self.logger.exception(TypeError("author_papers is not a dict"))
            raise TypeError("author_papers is not a dict")
        elif not author_papers:
            author_papers, status, error_msg = self._findData("author_papers.json")
            if status != 0:
                self.logger.error("passed author_papers is not valid and could not find the file author_papers.json")
                self.logger.error("self._findData(\"author_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid author_papers found"))
                raise ValueError("No valid author_papers found")
            else:
                self.author_papers = deepcopy(author_papers)
        else:
            self.author_papers = deepcopy(author_papers)

        if papers and not isinstance(papers, dict):
            self.logger.error("passed papers is not valid")
            self.logger.exception(TypeError("papers is not a dict"))
            raise TypeError("papers is not a dict")
        elif not papers:
            papers, status, error_msg = self._findData("parsed_papers.json")
            if status != 0:
                self.logger.error("passed papers is not valid and could not find the file parsed_papers.json")
                self.logger.error("self._findData(\"parsed_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid parsed_papers found"))
                raise ValueError("No valid parsed_papers found")
            else:
                if len(papers) == 0:
                    self.logger.exception(ValueError("Found papers is empty"))
                    raise ValueError("Found papers is empty")
                self.logger.debug("Converting papers from dict to Paper object")
                self.papers = {}
                for k, info in papers.items():
                    self.papers[k] = Paper(**info)
        else:
            if len(papers) == 0:
                self.logger.exception(ValueError("Passed papers is empty"))
                raise ValueError("Passed papers is empty")
            test_key = list(papers.keys())[0]
            # Check the first value (not the key) to see whether the papers
            # still need to be converted from dicts to Paper objects
            if isinstance(papers[test_key], dict):
                self.papers = {}
                for k, info in papers.items():
                    try:
                        self.papers[k] = Paper(**info)
                    except Exception as e:
                        self.logger.error("Exception raised when converting paper dicts to Paper")
                        self.logger.error("k={}".format(k))
                        self.logger.error("info={}".format(info))
                        self.logger.exception(e)
                        raise e
            else:
                self.papers = papers

        if id_to_name and not isinstance(id_to_name, dict):
            self.logger.error("passed id_to_name is not valid")
            self.logger.exception(TypeError("id_to_name is not a dict"))
            raise TypeError("id_to_name is not a dict")
        elif not id_to_name:
            id_to_name, status, error_msg = self._findData("id_to_name.json")
            if status != 0:
                self.logger.error("passed id_to_name is not valid and could not find the file id_to_name.json")
                self.logger.error("self._findData(\"id_to_name.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid id_to_name found"))
                raise ValueError("No valid id_to_name found")
            else:
                if len(id_to_name) == 0:
                    self.logger.exception(ValueError("Found id_to_name is empty"))
                    raise ValueError("Found id_to_name is empty")
                self.id_to_name = id_to_name
        else:
            if len(id_to_name) == 0:
                self.logger.exception(ValueError("Passed id_to_name is empty"))
                raise ValueError("Passed id_to_name is empty")
            self.id_to_name = id_to_name
    else:
        printLogToConsole(self.console_log_level, "RUNNING IN DEBUG_MODE!", logging.WARNING)
        self.logger.warning("Running in DEBUG_MODE")
        self.id_to_name = id_to_name if id_to_name else {}
        self.papers = papers if papers else {}
        self.compare_args = compare_args if compare_args else {}
        self.author_papers = author_papers if author_papers else {}

    self.compare_terms = len(CompareAuthors.compare_terms)
    self.save_data = save_data
    self.save_dir = save_path
    self.ext_directory = ext_directory
    self.threshold = threshold
    self.name_similarity_cutoff = name_similarity_cutoff
    algo_name, measure = str_algorithm.split("-")
    self.author_name = {x: nameFromDict(self.id_to_name[x]) for x in self.id_to_name.keys()}
    self.cores = cores
    self.str_algorithm = getAlgo(algo_name, measure)
    self.create_new_author = create_new_author
    self.compare_cutoff = compare_cutoff
    self.tie_breaker = tie_breaker
    self.sim_overrides = sim_overrides
    self.allow_authors_not_in_override = allow_authors_not_in_override
    self.same_paper_diff_people = same_paper_diff_people
    self.logger.debug("AuthorDisambiguation initialized with arguments:")
    self.logger.debug("\tcompare_args={}".format(list(self.compare_args.keys())))
    self.logger.debug("\talgorithm={}".format(algo_name))
    self.logger.debug("\tmeasure={}".format(measure))
    self.logger.debug("\tthreshold={}".format(threshold))
    self.logger.debug("\tname_similarity_cutoff={}".format(name_similarity_cutoff))
    self.logger.debug("\tunique authors={}".format(len(self.author_papers)))
    self.logger.debug("\tcompare_cutoff={}".format(self.compare_cutoff))
    self.logger.debug("\ttie_breaker={}".format(self.tie_breaker))
    self.logger.debug("\tsim_overrides={}".format(self.sim_overrides))
    self.logger.debug("\tsame_paper_diff_people={}".format(self.same_paper_diff_people))
    self.logger.debug("\tuse_probabilities={}".format(self.use_probabilities))
    if self.compare_cutoff != 3:
        self.logger.warning("Non-default value for compare_cutoff, currently this is not implemented")
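# Usage sketch (illustrative, not from the original source): when papers,
# author_papers, or id_to_name are omitted, the initializer falls back to
# _findData() and looks for parsed_papers.json, author_papers.json, and
# id_to_name.json. compare_args must be a non-empty dict (see the validation
# above); its keys are whatever compareAuthors expects.
#
#     ad = AuthorDisambiguation(compare_args=compare_args, model_name="VC1",
#                               model_path=os.getcwd(),
#                               str_algorithm="jaro-similarity", cores=4)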
def __init__(self, papers, incomplete_papers, special_keys=None, save_data=False,
             ext_directory=False, save_path=None, diff_same_ratio=1.0, author_cutoff=0,
             name_similarity_cutoff=.95, pair_distribution="random", separate_chars=1,
             separate_words=1, algorithm="jaro-similarity", exclude=None, rand_seed=None,
             cores=4, batch_size=25000, allow_exact_special=True, min_batch_len=100000,
             file_log_level=logging.DEBUG, console_log_level=logging.WARNING,
             log_format=None, log_path=None, DEBUG_MODE=False, drop_null_authors=True,
             print_compare_stats=False, compare_args=None, compare_batch_size=1000,
             remove_single_author=False, require_exact_match=False):
    """
    Initialize the class
    :param papers: The parsed papers you want to use (dict of Paper objects)
    :param incomplete_papers: Papers you want to exclude (list of paper ids)
    :param special_keys: Any special keys you want to guarantee are in the training data (list of ids, defaults to an empty list)
    :param save_data: Save data for later use (bool, default is False)
    :param ext_directory: Save data into directories based on their file type (bool, default is False)
    :param save_path: Directory to save data (str, default is None)
    :param diff_same_ratio: Ratio of same pairs to different pairs, and vice versa (float, default is 1.0)
    :param author_cutoff: Exclude authors with fewer papers than this count (int, default is 0)
    :param name_similarity_cutoff: Exclude pairs whose names are not similar enough (float, default is .95)
    :param pair_distribution: How to distribute pairs to meet the ratio. Options are 'random' and 'sim distribution'; 'sim distribution' tries to get an even number of pairs based on how similar their names are (default is 'random')
    :param separate_chars: Number of characters to use in the pair dict (int, default is 1)
    :param separate_words: Number of words to use in the pair dict (int, default is 1)
    :param algorithm: String similarity algorithm, in the format 'algorithm name-measure' (str, default is 'jaro-similarity')
    :param exclude: Authors to exclude (list of ids, default is an empty list)
    :param rand_seed: Random seed
    :param cores: Cores to use (int, default is 4)
    :param batch_size: Size of batches (int, default is 25000)
    :param allow_exact_special: Allow ids that are exactly equal to special cases (bool, default is True)
    :param min_batch_len: Minimum number of combinations needed to use batches (int, default is 100000)
    :param file_log_level: Logging level for the log file (logging level, default is logging.DEBUG)
    :param console_log_level: Logging level for the console (logging level, default is logging.WARNING)
    :param log_format: Format of log messages (str)
    :param log_path: Path to the log file (str, default is '/logs/create_training_data.log')
    :param DEBUG_MODE: Debugging mode (bool, default is False)
    :param drop_null_authors: Drop authors with no email or no affiliation (bool, default is True)
    :param print_compare_stats: Print in-depth stats of comparisons; this will slow the program down considerably (bool, default is False)
    :param compare_args: Dict of arguments to pass to compareAuthors
    :param compare_batch_size: Size of batches for comparing authors; only has an effect when cores > 1 (int, default is 1000)
    :param remove_single_author: Remove papers with only one author (bool, default is False)
    :param require_exact_match: Whether special cases must be an exact match (bool, default is False)
    """
    if compare_args is None:
        compare_args = {}
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/create_training_data.log"
    self.papers = papers
    self.incomplete_papers = incomplete_papers
    self.special_keys = special_keys if special_keys else []
    self.cores = cores
    self.save_data = save_data
    self.ext_directory = ext_directory
    self.save_dir = save_path
    self.dif_same_ratio = diff_same_ratio
    self.author_cutoff = author_cutoff
    self.name_similarity_cutoff = name_similarity_cutoff
    self.all_author_papers = defaultdict(list)
    self.valid_author_papers = defaultdict(list)
    self.pair_distribution = pair_distribution
    if rand_seed:
        random.seed(rand_seed)
    self.separate_chars = separate_chars
    self.separate_words = separate_words
    self.exclude = exclude if exclude else []
    self.batch_size = batch_size
    self.allow_exact_special = allow_exact_special
    self.algorithm = algorithm.split("-")
    self.min_batch_len = min_batch_len
    self.json_path = self.save_dir
    self.csv_path = self.save_dir
    self.txt_path = self.save_dir
    self.pickle_path = self.save_dir
    self.debug_mode = DEBUG_MODE
    self.console_log_level = console_log_level
    self.drop_null_authors = drop_null_authors
    self.compare_args = compare_args
    self.compare_args["str_algorithm"] = algorithm.split("-")
    self.compare_batch_size = compare_batch_size
    if self.ext_directory:
        self.json_path = self.json_path + "/json"
        self.csv_path = self.csv_path + "/csv"
        self.txt_path = self.txt_path + "/txt"
        self.pickle_path = self.pickle_path + "/pickle"
        for path in [self.json_path, self.csv_path, self.txt_path, self.pickle_path]:
            if not os.path.exists(path):
                os.mkdir(path)
    self.logger = createLogger("create_training_data", log_path, log_format,
                               console_log_level, file_log_level)
    self.print_compare_stats = print_compare_stats
    self.remove_single_author = remove_single_author
    self.require_exact_match = require_exact_match
    self.logger.debug("{} authors in self.exclude".format(len(self.exclude)))
    gc.collect()
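# Usage sketch (illustrative, not from the original source): the class name
# `CreateTrainingData` is an assumption inferred from the logger name, and
# the ids are placeholders. `algorithm` follows the 'name-measure' format
# noted in the docstring.
#
#     ctd = CreateTrainingData(papers, incomplete_papers=bad_paper_ids,
#                              special_keys=["some-author-id"],
#                              algorithm="jaro-similarity",
#                              save_data=True, ext_directory=True,
#                              save_path="data", rand_seed=42)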