Example #1
0
 def __init__(self,
              papers,
              id_to_name,
              author_papers,
              treat_id_different_people=False,
              console_log_level=logging.ERROR,
              file_log_level=logging.DEBUG,
              log_format=None,
              log_path=None,
              raise_error=False,
              skip_error_papers=False,
              one_target_per_paper=False,
              save_data=False,
              ext_directory=False,
              save_path=None,
              cores=4,
              remove_all_papers=False):
     """
     Set up logging, normalise the incoming papers and reset all bookkeeping state.

     :param papers: mapping of key -> Paper instance, or key -> dict of keyword
     arguments accepted by Paper
     :param id_to_name: mapping that is deep-copied onto the instance
     :param author_papers: mapping that is deep-copied onto the instance
     :param treat_id_different_people: stored as self.treat_id_different_people
     :param console_log_level: console level handed to createLogger
     :param file_log_level: file level handed to createLogger
     :param log_format: log format string, a default format is used when falsy
     :param log_path: log file path, defaults to <cwd>/logs/disambiguation.log
     :param raise_error: stored as self.raise_error
     :param skip_error_papers: stored as self.skip_errors
     :param one_target_per_paper: stored as self.one_per_paper
     :param save_data: stored as self.save_data
     :param ext_directory: stored as self.ext_directory
     :param save_path: stored as self.save_path
     :param cores: stored as self.cores
     :param remove_all_papers: stored as self.remove_all_papers
     """
     log_format = log_format or '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
     log_path = log_path or os.getcwd() + "/logs/disambiguation.log"
     self.logger = createLogger("disambiguator", log_path, log_format, console_log_level, file_log_level)
     self.console_log_level = console_log_level
     self.treat_id_different_people = treat_id_different_people

     # Entries are converted to Paper one at a time. As soon as an entry that
     # already is a Paper shows up, conversion stops and the passed mapping is
     # used as-is.
     self.papers = {}
     for paper_key, entry in papers.items():
         if not isinstance(entry, Paper):
             self.papers[paper_key] = Paper(**entry)
             continue
         self.papers = papers
         break

     # Private copies of the lookup tables
     self.id_to_name = deepcopy(id_to_name)
     self.author_papers = deepcopy(author_papers)

     # Bookkeeping containers, all empty at construction time
     self.author_id_suffix = Counter()
     self.error_papers = set()
     self.old_ids = set()
     self.new_papers = {}
     self.new_id_to_name = {}
     self.new_author_papers = defaultdict(list)

     # Behaviour flags and output settings
     self.raise_error = raise_error
     self.skip_errors = skip_error_papers
     self.one_per_paper = one_target_per_paper
     self.remove_all_papers = remove_all_papers
     self.save_data = save_data
     self.ext_directory = ext_directory
     self.save_path = save_path
     self.cores = cores
    def __init__(self,
                 config_dict,
                 log_file,
                 file_log_level=logging.DEBUG,
                 console_log_level=logging.WARNING,
                 log_format=None,
                 raise_error_unknown=False):
        """
        Resolve the log path, create the logger and sort every config entry into self.configs.

        :param config_dict: mapping of config key -> value. The optional key "log path" is a
        directory relative to os.getcwd(); it is read but the passed dict itself is not modified
        :param log_file: base name of the log file, ".log" is appended
        :param file_log_level: file level handed to createLogger
        :param console_log_level: console level handed to createLogger
        :param log_format: log format string, a default format is used (and recorded in
        self.config_dict) when falsy
        :param raise_error_unknown: stored as self.raise_error_unknown
        :raises ValueError: if the configured log path contains a backslash
        :raises KeyError: if no "save_path" ended up in self.configs["shared"]
        """
        # Work on a shallow copy so removing "log path" below never leaks
        # into the dict owned by the caller.
        config_dict = dict(config_dict)

        self.config_dict = {}
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
            self.config_dict["log_format"] = log_format
        if "log path" not in config_dict:
            log_path = os.getcwd() + "/logs/{}.log".format(log_file)
        else:
            log_path = config_dict["log path"]
            if "\\" in log_path:
                print("ERROR: log path={}".format(log_path))
                raise ValueError(
                    "\\ in the log path, currently file paths with '\\' are not supported"
                )
            # startswith/endswith also cope with an empty string, where
            # indexing [0] / [-1] would raise IndexError.
            if not log_path.startswith("/"):
                log_path = "/" + log_path
            if not log_path.endswith("/"):
                log_path = log_path + "/"
            log_path = os.getcwd() + log_path + "{}.log".format(log_file)
            # Drop the raw value so the merge below keeps the resolved path
            del config_dict["log path"]
        self.config_dict["log path"] = log_path

        self.raise_error_unknown = raise_error_unknown
        self.logger = createLogger("config_handler", log_path, log_format,
                                   console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.logger.debug("Parsing config.json")
        self.config_dict = {**self.config_dict, **config_dict}
        self.configs = {
            "shared": {},
            "pdf_parser": {},
            "acl_parser": {},
            "create_training": {},
            "vote_classifier": {},
            "author_disambiguation": {},
            "target_creator": {},
            "input_handler": {},
            "paths": {},
        }
        self.configs["shared"]["log_path"] = self.config_dict["log path"]
        for k, v in config_dict.items():
            # Keys are normalised from "some key" to "some_key" before lookup
            tmp_k = k.replace(" ", "_")
            if tmp_k in self.excluded_keys:
                self.logger.debug(
                    "{} is in excluded, skipping it".format(tmp_k))
            else:
                self.addArgument(tmp_k, v)
        if "save_path" not in self.configs["shared"]:
            raise KeyError("save_path is not in shared config")
        self._createExtraPaths()
    def __init__(self,
                 papers,
                 author_papers,
                 id_to_name,
                 console_log_level=logging.ERROR,
                 file_log_level=logging.DEBUG,
                 log_format=None,
                 log_path=None,
                 target_path=None,
                 save_data=False,
                 ext_directory=False,
                 save_path=None,
                 cores=4):
        """
        Create the logger, index authors by cleaned name, load targets and build the command tables.

        :param papers: stored as self.papers
        :param author_papers: stored as self.author_papers
        :param id_to_name: mapping of author id -> name record; every name is cleaned and
        indexed in self.names (cleaned name -> list of ids)
        :param console_log_level: console level handed to createLogger
        :param file_log_level: file level handed to createLogger
        :param log_format: log format string, a default format is used when falsy
        :param log_path: log file path, defaults to <cwd>/logs/disambiguation.log
        :param target_path: optional path to a ".json" file (target id -> override authors) or a
        ".txt" file (one target id per line). If the path cannot be opened it is retried
        relative to os.getcwd()
        :param save_data: stored as self.save_data
        :param ext_directory: stored as self.ext_directory
        :param save_path: stored as self.save_path
        :param cores: accepted but not used in this method
        :raises ValueError: if target_path has an extension other than json or txt
        """
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/disambiguation.log"
        self.logger = createLogger("input_handler", log_path, log_format,
                                   console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.papers = papers
        self.author_papers = author_papers
        self.id_to_name = id_to_name
        self.names = defaultdict(list)
        for k, name in id_to_name.items():
            name_cleaned = cleanName(
                remove_weird_notes.sub(" ", nameFromDict(name)).replace(
                    "  ", " ")).replace("  ", " ")
            self.names[name_cleaned].append(k)
        self.save_data = save_data
        self.save_path = save_path
        self.ext_directory = ext_directory
        self.override_authors = {}
        # Always start from an empty list; the json branch appends to it and
        # previously failed because the attribute did not exist yet.
        self.targets = []

        def _read_target_file(parse):
            """Open target_path (retrying under os.getcwd()) and return parse(file)."""
            try:
                with open(target_path) as target_file:
                    return parse(target_file)
            except FileNotFoundError:
                self.logger.debug(
                    "File path was not found, trying to open with adding os.getcwd()"
                )
                # os.getcwd() has no trailing separator, so make sure exactly
                # one "/" sits between it and the relative path.
                relative = target_path if target_path.startswith(
                    "/") else "/" + target_path
                with open(os.getcwd() + relative) as target_file:
                    return parse(target_file)

        if not target_path:
            self.logger.debug("No path was passed for target_path")
        else:
            self.logger.debug("Opening {}".format(target_path))
            file_type = target_path.split(".")[-1]
            if file_type == "json":
                self.logger.debug("Parsing json...")
                targets_dict = _read_target_file(json.load)
                for k, v in targets_dict.items():
                    self.logger.debug("Found target {}".format(k))
                    self.targets.append(k)
                    self.override_authors[k] = v
            elif file_type == "txt":
                self.logger.debug("Parsing txt file...")
                self.targets = _read_target_file(
                    lambda target_file: [x.strip() for x in target_file])
            else:
                self.logger.error(
                    "File type {} is not supported".format(file_type))
                raise ValueError(
                    "File type {} is not supported".format(file_type))

            self.logger.debug("Found {} targets".format(len(self.targets)))
            self.logger.debug("Found {} overrides".format(
                len(self.override_authors)))

        # Top-level menu: key -> required argument, description and handler
        self.valid_main_commands = {
            "t": {
                "required": "target-id",
                "desc": "Specify target",
                "action": self._addTarget
            },
            "d": {
                "required": None,
                "desc": "Display a target or all targets",
                "action": self._displayTargets
            },
            "r": {
                "required": None,
                "desc": "Remove a target",
                "action": self._removeTarget
            },
            "g": {
                "required": "target-id",
                "desc": "Generate authors to compare with based on their name",
                "action": self._genAuthorOverride
            },
            "c": {
                "required": "target-id",
                "desc": "Clear target's authors to compare with",
                "action": self._clearAuthorOverride
            },
            "o": {
                "required": None,
                "desc": "Display override authors",
                "action": self._displayOverride
            },
            "h": {
                "required": None,
                "desc": "Help",
                "action": self._printHelp
            },
            "s": {
                "required": None,
                "desc": "Save targets",
                "action": self._save
            },
            "e": {
                "required": None,
                "desc": "Finish and continue",
                "action": None
            },
        }
        # Per-target menu: key -> required/optional argument and description
        self.valid_target_commands = {
            "a": {
                "required": "target-id",
                "optional": None,
                "desc": "Specify author to compare target to"
            },
            "g": {
                "required": "Author Name",
                "optional": None,
                "desc":
                "Generate a list of authors to compare to by their name"
            },
            "d": {
                "required": None,
                "optional": None,
                "desc": "Display list of authors"
            },
            "r": {
                "required": "author-id",
                "optional": None,
                "desc": "Remove an author-id"
            },
            "e": {
                "required": None,
                "optional": None,
                "desc": "Finish editing target"
            }
        }
Example #4
0
    def __init__(self,
                 data,
                 classifiers,
                 classifier_weights=None,
                 test_fraction=8,
                 save_data=False,
                 ext_directory=False,
                 save_path=None,
                 model_save_path='/models/',
                 model_name=None,
                 special_cases=None,
                 rand_seed=None,
                 cutoff=None,
                 special_only=False,
                 console_log_level=logging.ERROR,
                 file_log_level=logging.DEBUG,
                 log_format=None,
                 log_path=None,
                 diff_same_ratio=1,
                 train_all_estimators=False,
                 voting="hard"):
        """
        Configure the voting classifier and split data into train/test sets.

        :param data: passed unchanged to self._createTrainTest
        :param classifiers: iterable of (name, model) pairs
        :param classifier_weights: mapping of classifier name -> weight; when falsy every
        classifier gets weight 1
        :param test_fraction: stored as self.test_fraction
        :param save_data: stored as self.save_data
        :param ext_directory: when true, "/pickle" is appended to the save path
        :param save_path: directory to save to, defaults to <cwd>/data
        :param model_save_path: stored as self.model_save_path
        :param model_name: stored as self.model_name, defaults to "VC1"
        :param special_cases: stored as self.special_cases, defaults to an empty list
        :param rand_seed: seed for the random module; a seed in [0, 9999] is drawn when falsy
        :param cutoff: stored as self.cutoff
        :param special_only: stored as self.special_only
        :param console_log_level: console level handed to createLogger
        :param file_log_level: file level handed to createLogger
        :param log_format: log format string, a default format is used when falsy
        :param log_path: log file path, defaults to <cwd>/logs/dnn.log
        :param diff_same_ratio: stored as self.dif_same_ratio
        :param train_all_estimators: stored as self.train_all_estimators
        :param voting: stored as self.voting
        :raises ValueError: if the number of weights differs from the number of classifiers
        """

        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/dnn.log"
        if not save_path:
            if ext_directory:
                self.save_path = os.getcwd() + "/data/pickle"
            else:
                self.save_path = os.getcwd() + "/data"
        else:
            if ext_directory and "/pickle" not in save_path:
                save_path = save_path + "/pickle"
            self.save_path = save_path
        self.logger = createLogger("DNN", log_path, log_format,
                                   console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.test_fraction = test_fraction
        if special_cases is None:
            special_cases = []
        self.special_cases = special_cases
        self.rand_seed = rand_seed
        if not self.rand_seed:
            self.rand_seed = random.randint(0, 9999)
        # Seed unconditionally: a caller-supplied seed has to take effect too,
        # not only the one drawn above.
        random.seed(self.rand_seed)

        self.logger.debug("seed is {}".format(self.rand_seed))
        self.cutoff = cutoff
        self.dif_same_ratio = diff_same_ratio
        self.save_data = save_data
        self.special_only = special_only
        self.model_save_path = model_save_path
        self.model_name = model_name
        if not model_name:
            self.model_name = "VC1"
        if not classifier_weights:
            # Default: equal weight for every classifier. The weights go into
            # classifier_weights, the classifiers container is left untouched.
            classifier_weights = {name: 1 for name, _ in classifiers}
        if len(classifier_weights) != len(classifiers):
            self.logger.error("len(classifier_weights)={}".format(
                len(classifier_weights)))
            self.logger.error("len(classifier)={}".format(len(classifiers)))
            raise ValueError(
                "Classifier weights must be the same length as the number of classifiers passed"
            )
        self.classifier_weights = classifier_weights
        self.classifiers = classifiers
        self.estimators = []
        self.model = None
        self.train_all_estimators = train_all_estimators
        self.classifier_params = {}
        self.voting = voting
        self.train, self.test, self.special_test, self.special_train = self._createTrainTest(
            data)
    def __init__(self,
                 epochs=3,
                 layers=4,
                 dropout=.1,
                 activation=tf.keras.activations.relu,
                 test_fraction=8,
                 funnel=2,
                 optimizer='rmsprop',
                 loss='binary_crossentropy',
                 metrics=None,
                 save_path=None,
                 model_save_path='/models/',
                 model_name=None,
                 load_model=None,
                 special_cases=None,
                 rand_seed=None,
                 start_neurons=16,
                 batch_size=10000,
                 cutoff=None,
                 normalize=True,
                 save_pairs=False,
                 special_only=False,
                 console_log_level=logging.ERROR,
                 file_log_level=logging.DEBUG,
                 log_format=None,
                 log_path=None,
                 diff_same_ratio=1):
        """
        Store the network hyper-parameters, create the logger and an empty Sequential model.

        :param epochs: stored as self.epochs
        :param layers: stored as self.layers
        :param dropout: stored as self.dropout
        :param activation: stored as self.activation
        :param test_fraction: stored as self.test_fraction
        :param funnel: stored as self.funnel
        :param optimizer: stored as self.optimizer
        :param loss: stored as self.loss
        :param metrics: stored as self.metrics, defaults to [BinaryAccuracy]
        :param save_path: stored as self.save_path, defaults to <cwd>/data/pickle
        :param model_save_path: accepted but not used in this method
        :param model_name: accepted but not used in this method
        :param load_model: accepted but not used in this method
        :param special_cases: stored as self.special_cases, defaults to an empty list
        :param rand_seed: when truthy, passed to random.seed
        :param start_neurons: stored as self.start_neurons
        :param batch_size: stored as self.batch_size
        :param cutoff: stored as self.cutoff
        :param normalize: stored as self.normalize
        :param save_pairs: stored as self.save_pairs
        :param special_only: stored as self.special_only
        :param console_log_level: console level handed to createLogger
        :param file_log_level: file level handed to createLogger
        :param log_format: log format string, a default format is used when falsy
        :param log_path: log file path, defaults to <cwd>/logs/dnn.log
        :param diff_same_ratio: stored as self.dif_same_ratio
        """
        # Fall back to the defaults for anything that was not supplied
        log_format = log_format or '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        log_path = log_path or os.getcwd() + "/logs/dnn.log"
        self.save_path = save_path if save_path else os.getcwd() + "/data/pickle"

        self.logger = createLogger("DNN", log_path, log_format,
                                   console_log_level, file_log_level)
        self.console_log_level = console_log_level

        self.special_cases = [] if special_cases is None else special_cases
        self.metrics = [BinaryAccuracy] if metrics is None else metrics

        # Architecture
        self.layers = layers
        self.funnel = funnel
        self.start_neurons = start_neurons
        self.dropout = dropout
        self.activation = activation

        # Training setup
        self.epochs = epochs
        self.batch_size = batch_size
        self.loss = loss
        self.optimizer = optimizer
        self.test_fraction = test_fraction
        if rand_seed:
            random.seed(rand_seed)

        # Data handling
        self.cutoff = cutoff
        self.normalize = normalize
        self.dif_same_ratio = diff_same_ratio
        self.save_pairs = save_pairs
        self.special_only = special_only

        # The model starts empty and the data splits are unset
        self.model = tf.keras.models.Sequential()
        self.train = self.test = None
        self.special_train = self.special_test = None
    def __init__(self, papers=None, author_papers=None, compare_args=None, id_to_name=None,
                 console_log_level=logging.ERROR, file_log_level=logging.DEBUG, log_format=None, log_path=None,
                 save_data=False, ext_directory=False, save_path=None, threshold=.2, name_similarity_cutoff=.92,
                 str_algorithm="jaro-similarity", model=None, model_name="VC1", model_path=None,
                 create_new_author=False, compare_cutoff=3, tie_breaker="max", cores=4, DEBUG_MODE=False,
                 sim_overrides=False, allow_authors_not_in_override=True, same_paper_diff_people=True, use_probabilities=False):
        """
        Load the model, validate (or locate) the input data and store the disambiguation settings.

        Unless DEBUG_MODE is set, each of author_papers, papers and id_to_name must either be
        passed as a dict or be loadable through self._findData, which is expected to return a
        (data, status, error_msg) triple with status 0 on success.

        :param papers: mapping of paper id -> Paper, or paper id -> dict of Paper keyword arguments
        :param author_papers: dict that is deep-copied onto the instance
        :param compare_args: non-empty dict, stored as self.compare_args
        :param id_to_name: dict of author id -> name record
        :param console_log_level: console level handed to createLogger
        :param file_log_level: file level handed to createLogger
        :param log_format: log format string, a default format is used when falsy
        :param log_path: log file path, defaults to <cwd>/logs/disambiguation.log
        :param save_data: stored as self.save_data
        :param ext_directory: stored as self.ext_directory
        :param save_path: stored as self.save_dir
        :param threshold: stored as self.threshold
        :param name_similarity_cutoff: stored as self.name_similarity_cutoff
        :param str_algorithm: string in the form 'algorithm name-measure', resolved through getAlgo
        :param model: model object; when None it is unpickled from
        <model_path>/models/<model_name>/model.pickle
        :param model_name: stored as self.model_name and used to locate the pickled model
        :param model_path: base directory of the pickled model, defaults to os.getcwd()
        :param create_new_author: stored as self.create_new_author
        :param compare_cutoff: stored as self.compare_cutoff; a warning is logged for values other than 3
        :param tie_breaker: stored as self.tie_breaker
        :param cores: stored as self.cores
        :param DEBUG_MODE: skip all argument validation and fall back to empty dicts
        :param sim_overrides: stored as self.sim_overrides
        :param allow_authors_not_in_override: stored as self.allow_authors_not_in_override
        :param same_paper_diff_people: stored as self.same_paper_diff_people
        :param use_probabilities: stored as self.use_probabilities, forced to False when the model
        votes "hard" or has no usable voting attribute
        :raises TypeError: if a passed argument is truthy but not a dict
        :raises ValueError: if a required argument is missing and cannot be found, or is empty
        """
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/disambiguation.log"
        self.logger = createLogger("author_disambiguation", log_path, log_format, console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.model = model
        self.model_name = model_name
        if self.model is None:
            if not model_path:
                model_path = os.getcwd()
            # NOTE(security): pickle.load can execute arbitrary code stored in
            # the file, only point model_path at trusted locations.
            with open("{}/models/{}/model.pickle".format(model_path, model_name), "rb") as model_file:
                self.model = pickle.load(model_file)
        try:
            if self.model.voting == "hard" and use_probabilities:
                self.logger.warning("hard voting does not support probabilities")
                self.use_probabilities = False
            else:
                self.use_probabilities = use_probabilities
        except Exception:
            # Deliberately best-effort: models without a voting attribute
            # simply run without probabilities.
            self.logger.debug("model does not have voting")
            self.use_probabilities = False

        if not DEBUG_MODE:
            # Argument validation
            if compare_args and not isinstance(compare_args, dict):
                self.logger.error("passed compare_args is not valid")
                self.logger.exception(TypeError("compare_args is not a dict"))
                raise TypeError("compare_args is not a dict")
            elif not compare_args:
                self.logger.error("passed compare_args is not valid")
                self.logger.exception(ValueError("compare_args is None"))
                raise ValueError("compare_args is None")
            else:
                self.compare_args = compare_args

            # defaultdict is a dict subclass, so one isinstance check covers both
            if author_papers and not isinstance(author_papers, dict):
                self.logger.error("passed author_papers is not valid")
                self.logger.error("type is {}".format(type(author_papers)))
                self.logger.exception(TypeError("author_papers is not a dict"))
                raise TypeError("author_papers is not a dict")
            elif not author_papers:
                author_papers, status, error_msg = self._findData("author_papers.json")
                if status != 0:
                    self.logger.error(
                        "passed author_papers is not valid and could not find the file author_papers.json")
                    self.logger.error("self._findData(\"author_papers.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid author_papers found"))
                    raise ValueError("No valid author_papers found")
                else:
                    self.author_papers = deepcopy(author_papers)
            else:
                self.author_papers = deepcopy(author_papers)

            if papers and not isinstance(papers, dict):
                self.logger.error("passed papers is not valid")
                self.logger.exception(TypeError("papers is not a dict"))
                raise TypeError("papers is not a dict")
            elif not papers:
                papers, status, error_msg = self._findData("parsed_papers.json")
                if status != 0:
                    self.logger.error("passed papers is not valid and could not find the file parsed_papers.json")
                    self.logger.error("self._findData(\"parsed_papers.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid parsed_papers found"))
                    raise ValueError("No valid parsed_papers found")
                else:
                    if len(papers) == 0:
                        self.logger.exception(ValueError("Found papers is empty"))
                        raise ValueError("Found papers is empty")
                    self.logger.debug("Converting papers from dict to Paper object")
                    self.papers = {}
                    for k, info in papers.items():
                        self.papers[k] = Paper(**info)

            else:
                if len(papers) == 0:
                    self.logger.exception(ValueError("Passed papers is empty"))
                    raise ValueError("Passed papers is empty")
                # Inspect the first VALUE: dict keys can never be dicts
                # themselves, so checking a key would never trigger conversion.
                first_paper = next(iter(papers.values()))
                if isinstance(first_paper, dict):
                    self.papers = {}
                    for k, info in papers.items():
                        try:
                            self.papers[k] = Paper(**info)
                        except Exception as e:
                            self.logger.error("Exception raised when converting paper dicts to Paper")
                            self.logger.error("k={}".format(k))
                            self.logger.error("info={}".format(info))
                            self.logger.exception(e)
                            raise e
                else:
                    self.papers = papers

            if id_to_name and not isinstance(id_to_name, dict):
                self.logger.error("passed id_to_name is not valid")
                self.logger.exception(TypeError("id_to_name is not a dict"))
                raise TypeError("id_to_name is not a dict")
            elif not id_to_name:
                id_to_name, status, error_msg = self._findData("id_to_name.json")
                if status != 0:
                    self.logger.error("passed id_to_name is not valid and could not find the file id_to_name.json")
                    self.logger.error("self._findData(\"id_to_name.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid id_to_name found"))
                    raise ValueError("No valid id_to_name found")
                else:
                    if len(id_to_name) == 0:
                        self.logger.exception(ValueError("Found id_to_name is empty"))
                        raise ValueError("Found id_to_name is empty")
                    self.id_to_name = id_to_name

            else:
                if len(id_to_name) == 0:
                    self.logger.exception(ValueError("Passed id_to_name is empty"))
                    raise ValueError("Passed id_to_name is empty")
                self.id_to_name = id_to_name
        else:
            printLogToConsole(self.console_log_level, "RUNNING IN DEBUG_MODE!", logging.WARNING)
            self.logger.warning("Running in DEBUG_MODE")
            self.id_to_name = id_to_name if id_to_name else {}
            self.papers = papers if papers else {}
            self.compare_args = compare_args if compare_args else {}
            self.author_papers = author_papers if author_papers else {}
        self.compare_terms = len(CompareAuthors.compare_terms)
        self.save_data = save_data
        self.save_dir = save_path
        self.ext_directory = ext_directory
        self.threshold = threshold
        self.name_similarity_cutoff = name_similarity_cutoff
        algo_name, measure = str_algorithm.split("-")
        # Display name for every known author id
        self.author_name = {author_id: nameFromDict(record) for author_id, record in self.id_to_name.items()}
        self.cores = cores
        self.str_algorithm = getAlgo(algo_name, measure)
        self.create_new_author = create_new_author
        self.compare_cutoff = compare_cutoff
        self.tie_breaker = tie_breaker
        self.sim_overrides = sim_overrides
        self.allow_authors_not_in_override = allow_authors_not_in_override
        self.same_paper_diff_people = same_paper_diff_people
        self.logger.debug("AuthorDisambiguation initialized with arguments:")
        self.logger.debug("\tcompare_args={}".format(list(self.compare_args.keys())))
        self.logger.debug("\talgorithm={}".format(algo_name))
        self.logger.debug("\tmeasure={}".format(measure))
        self.logger.debug("\tthreshold={}".format(threshold))
        self.logger.debug("\tname_similarity_cutoff={}".format(name_similarity_cutoff))
        self.logger.debug("\tunique authors={}".format(len(self.author_papers)))
        self.logger.debug("\tcompare_cutoff={}".format(self.compare_cutoff))
        self.logger.debug("\ttie_breaker={}".format(self.tie_breaker))
        self.logger.debug("\tsim_overrides={}".format(self.sim_overrides))
        self.logger.debug("\tsame_paper_diff_people={}".format(self.same_paper_diff_people))
        self.logger.debug("\tuse_probabilities={}".format(self.use_probabilities))
        if self.compare_cutoff != 3:
            self.logger.warning("Non-default value for compare_cutoff, currently this is not implemented")
    def __init__(self,
                 papers,
                 incomplete_papers,
                 special_keys=None,
                 save_data=False,
                 ext_directory=False,
                 save_path=None,
                 diff_same_ratio=1.0,
                 author_cutoff=0,
                 name_similarity_cutoff=.95,
                 pair_distribution="random",
                 separate_chars=1,
                 separate_words=1,
                 algorithm="jaro-similarity",
                 exclude=None,
                 rand_seed=None,
                 cores=4,
                 batch_size=25000,
                 allow_exact_special=True,
                 min_batch_len=100000,
                 file_log_level=logging.DEBUG,
                 console_log_level=logging.WARNING,
                 log_format=None,
                 log_path=None,
                 DEBUG_MODE=False,
                 drop_null_authors=True,
                 print_compare_stats=False,
                 compare_args=None,
                 compare_batch_size=1000,
                 remove_single_author=False,
                 require_exact_match=False):
        """
        Initialize the class
        :param papers: The parsed papers you want to use (dict of Paper objects)
        :param incomplete_papers: Papers you want to exclude (list of paper ids)
        :param special_keys: Any special keys you want to guarantee are in the training data (list of ids,
        defaults to empty list)
        :param save_data: Save data for later use (Bool, default is false)
        :param ext_directory: Save data into directories based on their file type (Bool, defaults to false).
        When true, save_path must be a string because the sub-directory names are appended to it
        :param save_path: Directory to save data (str, defaults to none)
        :param diff_same_ratio: Ratio of same pairs to different pairs, and vice-versa. (float, default is 1.0)
        :param author_cutoff: Cutoff authors based on paper count (int, default is 0)
        :param name_similarity_cutoff: Exclude pairs if their names aren't similar enough (float, default is .95)
        :param pair_distribution: how to distribute pairs to meet ratio Options are 'random' and 'sim distribution'.
        sim distribution tries to get an even number of pairs based on how similar their names are (default is random)
        :param separate_chars: How many chars you want to use in the pair dict (int, default is 1)
        :param separate_words: # of words to use in the pair dict (int, default is 1)
        :param algorithm: string similarity algorithm(str in format of 'algorithm name-measure' default is
        'jaro-similarity')
        :param exclude: authors to exclude(list of ids, default is empty list)
        :param rand_seed: Random seed, only applied when truthy
        :param cores: Cores to use (int, default is 4)
        :param allow_exact_special: allow ids that are exactly equal to special cases (bool, default is True)
        :param batch_size: Size of batches (int, default is 25000)
        :param min_batch_len: Minimum number of combinations to use batches (int, default is 100,000)
        :param file_log_level: logging level to file (logging.levels, default is debug)
        :param console_log_level: logging level to console (logging.levels, default is warning)
        :param log_format: format of log messages (str)
        :param log_path: path to log files(str, default is '<cwd>/logs/create_training_data.log')
        :param DEBUG_MODE: debugging mode (bool, default is false)
        :param drop_null_authors: drop authors with either no email or no affiliation (bool, default is True)
        :param print_compare_stats: print the indepth stats of comparisons. This WILL slow
        down the program by a lot(bool, default is False)
        :param compare_args: dict of arguments to pass to compareAuthors. The key "str_algorithm" is
        written into the passed dict itself
        :param compare_batch_size: size of batches for comparing authors, only has an effect when cores > 1
        :param remove_single_author: Remove papers with only 1 author
        :param require_exact_match: If special cases must be exact match
        """
        if compare_args is None:
            compare_args = {}
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/create_training_data.log"

        self.papers = papers
        self.incomplete_papers = incomplete_papers
        self.special_keys = special_keys
        if not self.special_keys:
            self.special_keys = []
        self.cores = cores
        self.save_data = save_data
        self.ext_directory = ext_directory
        self.save_dir = save_path
        self.dif_same_ratio = diff_same_ratio
        self.author_cutoff = author_cutoff
        self.name_similarity_cutoff = name_similarity_cutoff
        self.all_author_papers = defaultdict(list)
        self.valid_author_papers = defaultdict(list)
        self.pair_distribution = pair_distribution
        if rand_seed:
            random.seed(rand_seed)
        self.separate_chars = separate_chars
        self.separate_words = separate_words
        self.exclude = exclude if exclude else []
        self.batch_size = batch_size
        self.allow_exact_special = allow_exact_special
        self.algorithm = algorithm.split("-")
        self.min_batch_len = min_batch_len
        # Without ext_directory every file type is written to save_dir itself
        self.json_path = self.save_dir
        self.csv_path = self.save_dir
        self.txt_path = self.save_dir
        self.pickle_path = self.save_dir
        self.debug_mode = DEBUG_MODE
        self.console_log_level = console_log_level
        self.drop_null_authors = drop_null_authors
        self.compare_args = compare_args
        # Separate list from self.algorithm so the two can never alias
        self.compare_args["str_algorithm"] = algorithm.split("-")
        self.compare_batch_size = compare_batch_size
        if self.ext_directory:
            self.json_path = self.json_path + "/json"
            self.csv_path = self.csv_path + "/csv"
            self.txt_path = self.txt_path + "/txt"
            self.pickle_path = self.pickle_path + "/pickle"
            # makedirs also creates a missing save_dir, and exist_ok covers a
            # directory that appears between the check and the call.
            for directory in (self.json_path, self.csv_path, self.txt_path,
                              self.pickle_path):
                if not os.path.exists(directory):
                    os.makedirs(directory, exist_ok=True)

        self.logger = createLogger("create_training_data", log_path,
                                   log_format, console_log_level,
                                   file_log_level)

        self.print_compare_stats = print_compare_stats
        self.remove_single_author = remove_single_author
        self.require_exact_match = require_exact_match
        self.logger.debug("{} authors in self.excluded".format(
            len(self.exclude)))
        gc.collect()