def _compareAmbiguousPairs(self, pairs_to_use):
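     """
     Run CompareAuthors over every ambiguous pair.

     pairs_to_use is expected to map a target key to the pairs to compare for
     that target. Uses a single process when self.cores == 1, otherwise a
     multiprocessing pool. Returns a dict of target -> comparison results.
     """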
     printLogToConsole(self.console_log_level, "Comparing all ambiguous pairs", logging.INFO)
     self.logger.info("Comparing all ambiguous pairs")
     try:
         comparator = CompareAuthors(**self.compare_args)
     except Exception as e:
         self.logger.error("Error intializing comparator")
         self.logger.error("comparator_args={}".format(list(self.compare_args.keys())))
         self.logger.exception(e)
         raise e
     out = {}
     if self.cores == 1:
         self.logger.debug("Using 1 core")
         pbar = tqdm(total=len(pairs_to_use), file=sys.stdout)
         for k, pairs in pairs_to_use.items():
             target, compare_results = self._compareAuthors([comparator, k, pairs])
             out[target] = compare_results
             pbar.update()
         pbar.close()
         return out
     else:
         self.logger.debug("Using {} cores".format(self.cores))
         args = [[comparator, k, pairs] for k, pairs in pairs_to_use.items()]
         with mp.Pool(self.cores) as pool:
             imap_results = list(
                 tqdm(pool.imap_unordered(self._compareAuthors, args), total=len(args), file=sys.stdout))
         for k, res in imap_results:
             out[k] = res
         return out
    def handleUserInput(self):
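        """
        Interactive loop for selecting target ids. Reads commands from stdin,
        validates them against self.valid_main_commands, and dispatches the
        associated action. The "e" command exits after confirmation.
        """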
        printLogToConsole(self.console_log_level,
                          "Found {} targets".format(len(self.targets)),
                          logging.INFO,
                          logger=self.logger)
        print("INFO: Enter target-ids:")
        while True:
            user_input = input(">> ")
            if user_input == "\n" or len(user_input) == 0:
                self.logger.debug("Got empty user input")
                continue

            command_split = user_input.split(" ")
            command = command_split[0]
            if command not in self.valid_main_commands:
                printLogToConsole(self.console_log_level,
                                  "{} is not a valid command".format(command),
                                  logging.INFO,
                                  logger=self.logger)
            elif command == "e":
                if self.confirmAction():
                    return
            else:
                action = self.valid_main_commands[command]["action"]
                if self.valid_main_commands[command]["required"]:
                    if len(command_split) != 2:
                        print("Invalid arguments passed to {}".format(command))
                    else:
                        action(command_split[1])
                else:
                    action()
                self.logger.debug(
                    "Finished running command {}".format(command))
    def _makeAmbiguousPairs(self, ambiguous_papers, check_authors, authors_to_get):
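        """
        Build the comparison pairs for each ambiguous author.

        For every paper of an ambiguous author, pairs its author info with the
        info of each known author in check_authors. Returns two dicts keyed by
        the paper id and author id joined by a space: the pairs to use and the
        excluded pairs.
        """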
        printLogToConsole(self.console_log_level, "Creating pairs for ambiguous authors", logging.INFO)
        self.logger.info("Creating pairs for ambiguous authors")

        known_author_info, error_authors, error_papers = self._getAuthorInfos(authors_to_get)
        if error_authors > 0:
            self.logger.warning("{} errors getting known author infos".format(error_authors))
        if error_papers > 0:
            self.logger.warning("{} errors getting known author papers".format(error_papers))

        self.logger.debug("{} known papers".format(len(known_author_info)))
        self.logger.debug("{} ambiguous author ids".format(len(check_authors)))
        results = defaultdict(list)
        excluded = defaultdict(list)
        for a in ambiguous_papers.keys():
            printLogToConsole(self.console_log_level, "Creating pairs for {}".format(a), logging.INFO)
            self.logger.info("Creating pairs for {}".format(a))
            self.logger.debug("{} has {} papers".format(a, len(ambiguous_papers[a])))
            self.logger.debug("{} has {} to check against".format(a, len(check_authors[a])))
            self.logger.debug("{} has {} total possible pairs".format(a, len(ambiguous_papers) * len(check_authors[a])))

            known_to_use = [[" ".join(x), known_author_info[" ".join(x)]] for x in check_authors[a]]
            for p in ambiguous_papers[a]:
                ambiguous_paper_info = getAuthorInfo([self.papers[p], a])
                pairs_to_use, pairs_excluded = self._makePairs(ambiguous_paper_info, known_to_use)
                self.logger.debug("{} {} has {} pairs".format(p, a, len(pairs_to_use)))
                self.logger.debug("{} {} has {} excluded".format(p, a, len(pairs_excluded)))
                results[" ".join([p, a])] = pairs_to_use

                excluded[" ".join([p, a])] = [x[0] for x in pairs_excluded]

        return results, excluded
    def save(self):
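        """
        Persist the trained model and its parameters.

        Writes model.pickle and parameters.json into
        <cwd><model_save_path><model_name>, creating the directory if needed.
        Note: os.mkdir assumes the parent directories already exist.
        """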
        path = os.getcwd() + self.model_save_path + self.model_name
        if not os.path.exists(path):
            os.mkdir(path)

        # I got permission denied when using os.path.join
        with open(path + "/model.pickle", "wb") as f:
            pickle.dump(self.model, f)

        parameters_dict = {
            "classifiers": self.classifiers,
            "classifier_weights": self.classifier_weights,
            "classifier_params": self.classifier_params,
            "special_only": self.special_only,
            "test_fraction": self.test_fraction,
            "rand_seed": self.rand_seed,
            "diff_same_ratio": self.dif_same_ratio,
            "cutoff": self.cutoff
        }
        with open(path + "/parameters.json", "w") as f:
            json.dump(parameters_dict, f, indent=4)

        printLogToConsole(self.console_log_level,
                          "Saved model {} to {}".format(self.model_name,
                                                        path), logging.INFO)
        self.logger.info("Saved model {} to {}".format(self.model_name, path))
    def _getAuthorInfos(self, authors) -> (dict, int, int):
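        """
        Collect author info for every paper of each requested author.

        Skips authors missing from self.author_papers and papers missing from
        self.papers, counting them as errors. Returns (info dict keyed by the
        key from getAuthorInfo, number of author errors, number of paper errors).
        """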
        out = {}
        printLogToConsole(self.console_log_level, "Getting author info for specified authors", logging.INFO)
        self.logger.info("Getting author info for specified authors")
        self.logger.debug("authors={}".format(authors))
        error_authors = 0
        error_papers = 0
        pbar = tqdm(total=len(authors), file=sys.stdout)
        for a in authors:
            if a not in self.author_papers:
                pbar.update()
                self.logger.warning("{} is not in self.author_papers".format(a))
                error_authors += 1
                continue
            for p in self.author_papers[a]:
                if p not in self.papers:
                    self.logger.debug("{} not in self.papers".format(p))
                    error_papers += 1

                    continue
                auth_key, auth_info = getAuthorInfo([self.papers[p], a])
                out[auth_key] = auth_info
            pbar.update()
        pbar.close()
        self.logger.debug("len(out)={}".format(len(out)))
        self.logger.debug("error_authors={}".format(error_authors))
        self.logger.debug("error_papers={}".format(error_papers))
        return out, error_authors, error_papers
 def _getAuthorInfo(self, a):
     self.logger.debug("Getting info for {}".format(a))
     name = cleanName(
         remove_weird_notes.sub(" ",
                                nameFromDict(self.id_to_name[a])).replace(
                                    "  ", " ")).replace("  ", " ")
     printLogToConsole(self.console_log_level,
                       "id={}".format(a),
                       logging.INFO,
                       logger=self.logger)
     printLogToConsole(self.console_log_level,
                       "name={}".format(name),
                       logging.INFO,
                       logger=self.logger)
     printLogToConsole(self.console_log_level,
                       "Papers for {}:".format(a),
                       logging.INFO,
                       logger=self.logger)
     for p in self.author_papers[a]:
         if p not in self.papers:
             continue
         try:
             title = self.papers[p].title
         except AttributeError:
             title = self.papers[p]["title"]
         printLogToConsole(self.console_log_level,
                           "\t{}\t{}".format(p, title),
                           logging.INFO,
                           logger=self.logger)
     printLogToConsole(self.console_log_level,
                       "{} Author(s) have this name".format(
                           len(self.names[name])),
                       logging.INFO,
                       logger=self.logger)
 def _displayOverride(self):
     printLogToConsole(self.console_log_level,
                       "Override Authors: ",
                       logging.INFO,
                       logger=self.logger)
     for k in self.override_authors.keys():
         printLogToConsole(self.console_log_level,
                           "{} has {} authors to compare with".format(
                               k, len(self.override_authors[k])),
                           logging.INFO,
                           logger=self.logger)
 def _clearAuthorOverride(self, a):
     self.logger.debug("Received clear override command".format(a))
     if a not in self.override_authors:
         printLogToConsole(
             self.console_log_level,
             "{} does not have authors to compare with".format(a),
             logging.INFO,
             logger=self.logger)
         return
     if self.confirmAction():
         del self.override_authors[a]
         self.logger.debug("Removed {} from override_authors".format(a))
    def createModel(self, classifier_parameters):
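        """
        Build self.estimators from self.classifiers using the supplied
        per-classifier parameters and wrap them in a VotingClassifier with
        self.voting and the weights from self.classifier_weights. Raises
        KeyError for a classifier without parameters and ValueError for an
        unsupported classifier name.
        """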
        self.classifier_params = classifier_parameters
        printLogToConsole(self.console_log_level, "Creating model",
                          logging.INFO)
        self.logger.info("Creating model")
        self.logger.debug("{} estimators".format(len(self.classifiers)))
        weights = []
        for n, m in self.classifiers:
            self.logger.debug("n={}".format(n))
            self.logger.debug("m={}".format(m))
            weights.append(self.classifier_weights[n])
            if n not in classifier_parameters:
                self.logger.error(
                    "{} is not in classifier_parameters".format(n))
                raise KeyError("{} is not in classifier_parameters".format(n))
            if m == "GaussianNB":
                self.estimators.append(
                    (n, GaussianNB(**classifier_parameters[n])))
            elif m == "KNeighborsClassifier":
                self.estimators.append(
                    (n, KNeighborsClassifier(**classifier_parameters[n])))
            elif m == "MLPClassifier":
                self.estimators.append(
                    (n, MLPClassifier(**classifier_parameters[n])))
            elif m == "SVC":
                self.estimators.append((n, SVC(**classifier_parameters[n])))
            elif m == "RBF":
                self.estimators.append((n, RBF(**classifier_parameters[n])))
            elif m == "RandomForestClassifier":
                self.estimators.append(
                    (n, RandomForestClassifier(**classifier_parameters[n])))
            elif m == "AdaBoostClassifier":
                self.estimators.append(
                    (n, AdaBoostClassifier(**classifier_parameters[n])))
            elif m == "QuadraticDiscriminantAnalysis":
                self.estimators.append((n,
                                        QuadraticDiscriminantAnalysis(
                                            **classifier_parameters[n])))
            elif m == "DecisionTreeClassifier":
                self.estimators.append(
                    (n, DecisionTreeClassifier(**classifier_parameters[n])))
            elif m == "GaussianProcessClassifier":
                self.estimators.append(
                    (n, GaussianProcessClassifier(**classifier_parameters[n])))
            else:
                self.logger.error("Unknown classifier")
                raise ValueError("{} is not a supported classifier".format(m))

        self.model = VotingClassifier(self.estimators,
                                      voting=self.voting,
                                      weights=weights)
    def trainModel(self, voting="hard"):
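        """
        Fit the model on the special-case data when self.special_only is set,
        otherwise on the full training set. Optionally fits every individual
        estimator first when self.train_all_estimators is enabled.
        """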
        printLogToConsole(self.console_log_level, "Training model",
                          logging.INFO)
        self.logger.info("Training model")

        if self.special_only:
            self.logger.debug("Training on special data")
            self.logger.debug("Test cases: {}".format(
                len(self.special_train["X"])))
            train = self.special_train
        else:
            self.logger.debug("Training on all data")
            self.logger.debug("Test cases: {}".format(len(self.train["X"])))
            train = self.train

        X = train["X"]
        Y = train["Y"]
        if self.train_all_estimators:
            for n, m in self.estimators:
                t0 = time.time()
                m.fit(X, Y)
                t1 = time.time()
                progress_str = "Finished fitting classifier {} in {:.2f}s".format(
                    n, t1 - t0)
                printLogToConsole(self.console_log_level, progress_str,
                                  logging.INFO)
                self.logger.info(progress_str)
        printLogToConsole(self.console_log_level,
                          "Fitting the VotingClassifier Model", logging.INFO)
        self.logger.info("Fitting the VotingClassifier Model")
        self.model.fit(X, Y)
        printLogToConsole(self.console_log_level, "Finished fitting model",
                          logging.INFO)
        self.logger.info("Finished fitting model")
 def fillData(self):
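     """
     Copy the remaining papers and author ids into the new_* structures and
     return (new_papers, new_author_papers, new_id_to_name). Papers listed in
     self.error_papers are skipped.
     """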
     printLogToConsole(self.console_log_level, "Adding rest of data to new author papers", logging.INFO)
     self.logger.info("Adding rest of data to new author papers")
     skipped_papers = 0
     # auth_pbar = tqdm(total=len(self.author_papers), file=sys.stdout)
     # skipped_old_ids = 0
     # skipped_author_papers = 0
     # for a in self.author_papers.keys():
     #     if a in self.new_author_papers:
     #         if self.new_author_papers[a] != self.author_papers[a]:
     #             papers_to_add = [x for x in self.author_papers[a] if x not in self.new_author_papers[a]]
     #             self.new_author_papers[a].extend(papers_to_add)
     #             self.logger.debug("{} is in new author papers, but need to add {} papers".format(a,len(papers_to_add)))
     #
     #         else:
     #             skipped_author_papers+=1
     #         # self.logger.debug("Skipping author {}, in new_author_papers".format(a))
     #     elif a in self.old_ids:
     #         if self.remove_all_papers:
     #             skipped_old_ids+=1
     #         # self.logger.debug("Skipping author {}, in old_ids".format(a))
     #     else:
     #         if a not in self.id_to_name:
     #             self.logger.warning("{} is in author_papers but not in id_to_name".format(a))
     #         else:
     #             self.new_author_papers[a] = deepcopy(self.author_papers[a])
     #             self.new_id_to_name[a] = self.id_to_name[a]
     #     auth_pbar.update()
     # auth_pbar.close()
     # self.logger.debug("Skipped {} authors due to being in new_author_papers".format(skipped_author_papers))
     # self.logger.debug("Skipped {} authors due to being in old_ids".format(skipped_old_ids))
     printLogToConsole(self.console_log_level, "Adding papers", logging.INFO, logger=self.logger)
     paper_pbar = tqdm(total=len(self.papers), file=sys.stdout)
     for pid, paper in self.papers.items():
         if pid in self.error_papers:
             self.logger.debug("{} is in error_papers, but not in self.new_papers".format(pid))
         else:
             if pid in self.new_papers:
                 paper = self.new_papers[pid]
             else:
                 self.new_papers[pid] = paper
             for a in paper.affiliations.keys():
                 if a not in self.new_id_to_name:
                     self.new_id_to_name[a] = self.id_to_name[a]
                 if pid not in self.new_author_papers[a]:
                     self.new_author_papers[a].append(pid)
         paper_pbar.update()
     paper_pbar.close()
     return self.new_papers, self.new_author_papers, self.new_id_to_name
    def _makePredictions(self, author_arrays):
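        """
        Run the trained model over the consolidated comparison arrays.

        Returns two nested dicts keyed by target and then author: the predicted
        labels and, when the model supports predict_proba, the probabilities.
        """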
        printLogToConsole(self.console_log_level, "Predicting same authors", logging.INFO)
        self.logger.info("Predicting same authors")
        predictions = defaultdict(dict)
        probabilities = defaultdict(dict)
        pbar = tqdm(total=len(author_arrays), file=sys.stdout)
        for target, info in author_arrays.items():
            pbar.write("INFO: Predicting same authors to {}".format(target))
            self.logger.info("Predicting same authors to {}".format(target))
            for author, results in info.items():
                self.logger.debug("Making predictions for {}".format(author))
                predictions[target][author] = self.model.predict(results).tolist()
                try:
                    probabilities[target][author] = self.model.predict_proba(results).tolist()
                except Exception:
                    self.logger.warning("Could not get probabilities for {} - {} ".format(target, author))
            pbar.update()
        pbar.close()

        return predictions, probabilities
 def _genAuthorOverride(self, a):
     self.logger.debug("Received clear override command".format(a))
     if a in self.override_authors:
         printLogToConsole(
             self.console_log_level,
             "{} already has authors to compare with".format(a),
             logging.INFO,
             logger=self.logger)
         return
     elif a not in self.targets:
         printLogToConsole(self.console_log_level,
                           "{} is not a target".format(a),
                           logging.INFO,
                           logger=self.logger)
         return
     name = cleanName(
         remove_weird_notes.sub(" ",
                                nameFromDict(self.id_to_name[a])).replace(
                                    "  ", " ")).replace("  ", " ")
     print("INFO: Other authors with the same name:")
     for other_a in self.names[name]:
         if other_a != a:
             print("INFO: {}".format(other_a))
     if len(self.names[name]) == 1:
         printLogToConsole(
             self.console_log_level,
             "{} only has {}, will not add authors to compare with".format(
                 name, a),
             logging.INFO,
             logger=self.logger)
     else:
         self.override_authors[a] = [x for x in self.names[name] if x != a]
         self.logger.debug("{} authors added to override_authors".format(
             len(self.names[name])))
    def __call__(self, target_authors, override_authors=None, evaluation_mode=False):
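        """
        Disambiguate the given target authors.

        Builds the ambiguous author pairs, compares them, predicts which known
        author each target matches, writes the result to results.json, and
        returns the resulting dict. override_authors is expected to map a
        target to the author ids it should be compared against.
        """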
        if not override_authors:
            override_authors = {}
        override_authors_len = len(override_authors)
        self.logger.debug("__call__ called with arguments: ")
        self.logger.debug("\tlen(target_authors)={}".format(len(target_authors)))
        self.logger.debug("\tlen(override_authors)={}".format(len(override_authors)))
        printLogToConsole(self.console_log_level, "Starting Disambiguation", logging.INFO)
        self.logger.info("Starting Disambiguation")

        has_authors, needs_authors = self._errorCheckCallArgs(target_authors, override_authors)
        ambiguous_authors_res = self._makeAmbiguousAuthors(has_authors, needs_authors, override_authors)
        ambiguous_papers, ambiguous_names, check_authors, authors_to_get, excluded_authors = ambiguous_authors_res
        self.logger.debug("{} authors had no similar authors".format(len(excluded_authors)))

        ambiguous_papers_to_use = {x: ambiguous_papers[x] for x in ambiguous_papers if x not in excluded_authors}
        to_compare, excluded = self._makeAmbiguousPairs(ambiguous_papers_to_use, check_authors, authors_to_get)

        # Initialize here so the code below runs even when self.same_paper_diff_people is disabled
        known_different = {}
        if self.same_paper_diff_people:
            self.logger.debug("Removing excluded")
            to_compare, known_different = self._removeKnownDifferent(to_compare, excluded)

        compare_results = self._compareAmbiguousPairs(to_compare)
        compare_results = self._consolidateResults(compare_results)
        predictions, probabilities = self._makePredictions(compare_results)

        if self.use_probabilities:
            to_use = {}
            for k, info in probabilities.items():
                to_use[k] = {x: [y[1] for y in info[x]] for x in info.keys()}
        else:
            to_use = predictions

        warning_auth = []
        correct_dict = defaultdict(dict)
        printLogToConsole(self.console_log_level, "Determining the correct author", logging.INFO)
        self.logger.info("Determining the correct author")
        pbar = tqdm(total=len(predictions), file=sys.stdout)
        for k, pred in to_use.items():
            self.logger.debug("{}")
            correct, above_thres = self._determineCorrectAuthor(pred, evaluation_mode)
            correct_dict[k]["same"] = correct
            correct_dict[k]["different"] = [x for x in pred.keys() if x != correct]
            self.logger.debug("{} was determined to be the same as {}".format(k, correct))
            if evaluation_mode:
                correct_dict[k]["percent_same"] = above_thres
            if len(above_thres) != 1 and not evaluation_mode:
                self.logger.debug("Added {} to warnings".format(k))
                warning_auth.append([k, above_thres])
            correct_dict[k]["papers_affected"] = ambiguous_papers[k]
            pbar.update()
        pbar.close()
        printLogToConsole(self.console_log_level, "Writing results to results.json", logging.INFO)
        self.logger.info("Writing results to results.json")
        with open("results.json", "w") as f:
            json.dump(correct_dict, f, indent=4, sort_keys=True)

        return correct_dict
    def _consolidateResults(self, compare_results):
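        """
        Regroup the comparison results by ambiguous author id instead of
        "<paper id> <author id>" and convert each author's results into numpy
        arrays, validating that every row has self.compare_terms entries.
        """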
        printLogToConsole(self.console_log_level, "Consolidating Compare results", logging.INFO)
        self.logger.info("Consolidating Compare results")
        out = {}
        pbar = tqdm(total=len(compare_results), file=sys.stdout)
        for k, results in compare_results.items():
            pid, k_id = k.split(" ")
            if k_id not in out:
                out[k_id] = defaultdict(list)
            for _id, id_results in results.items():
                out[k_id][_id].extend(id_results)
            pbar.update()
        pbar.close()
        self.logger.debug("Converting to np arrays")
        author_compare_results = {}
        for author, info in out.items():
            author_results = {}
            self.logger.debug("Converting {} results".format(author))
            for other_id, compare_arrays in info.items():
                self.logger.debug("Consolidating results from {}".format(other_id))
                if any(len(x) != self.compare_terms for x in compare_arrays):
                    self.logger.error(
                        "A compare result from {}-{} does not have the correct number of terms".format(author,
                                                                                                       other_id))
                    self.logger.error("Lengths are: {}".format([len(x) for x in compare_results]))
                    self.logger.error("Expected length is: {}".format(self.compare_terms))
                    raise ValueError("Compare results length does not match comparator's result length")
                try:
                    author_results[other_id] = np.array(compare_arrays)
                except Exception as e:
                    self.logger.warning(
                        "Ran into exception {} when converting compare_results, trying array by array".format(e))
                    tmp_arrays = []
                    for a in compare_arrays:
                        tmp_arrays.append(np.asarray(a))
                    author_results[other_id] = np.asarray(tmp_arrays).reshape(len(compare_arrays), self.compare_terms)

            author_compare_results[author] = author_results
        return author_compare_results
    def _prepareData(self, separated, paper_auth_info, algorithm):
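        """
        Create same/different training pairs for each group of authors in
        separated, handling the special cases separately. Returns the four
        lists: same, different, special_same, special_diff, with each entry
        tagged 1 (same) or 0 (different).
        """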
        special_cases_dict = self._getSpecialCases(separated)
        same = []
        different = []
        special_same = []
        special_diff = []
        sorted_keys = sorted(list(separated.keys()))

        for k in sorted_keys:
            info = separated[k]
            printLogToConsole(
                self.console_log_level,
                "Creating pairs for authors starting with {}".format(k),
                logging.INFO)
            self.logger.log(
                logging.INFO,
                "Creating pairs for authors starting with {}".format(k))
            special_cases = None
            if k in special_cases_dict:
                special_cases = special_cases_dict[k]

            auth_info = [(x, paper_auth_info[x]) for x in info]
            tmp_same, tmp_diff = self._makeCombinations(
                auth_info, special_cases)
            gc.collect()
            self.logger.debug("{} pairs to add to same".format(len(tmp_same)))
            self.logger.debug("{} pairs to add to different".format(
                len(tmp_diff)))
            same.extend([[1, p] for p in tmp_same])
            different.extend([[0, p] for p in tmp_diff])
            self.logger.debug("{} same pairs".format(len(same)))
            self.logger.debug("{} different pairs".format(len(different)))
        printLogToConsole(self.console_log_level, "Handling special cases",
                          logging.INFO)
        self.logger.log(logging.INFO, "Handling special cases")
        for k, info in special_cases_dict.items():
            printLogToConsole(
                self.console_log_level,
                "Creating pairs for special cases that start with {}".format(
                    k), logging.INFO)
            self.logger.log(
                logging.INFO,
                "Creating pairs for special cases that start with {}".format(
                    k))
            auth_info = [(x, paper_auth_info[x]) for x in info]
            tmp_same, tmp_diff = self._makeCombinations(auth_info,
                                                        algorithm,
                                                        use_cutoff=False)
            special_same.extend([[1, p] for p in tmp_same])
            special_diff.extend([[0, p] for p in tmp_diff])

        return same, different, special_same, special_diff
    def _selectPairsToUse(self, same, diff):
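        """
        Pick the pairs used for training, capping both lists at
        min(len(same), len(diff)) * self.dif_same_ratio. Selection is random
        when self.pair_distribution == "random"; the "similarity" distribution
        is not implemented and raises ValueError.
        """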
        printLogToConsole(self.console_log_level, "Selecting pairs to use",
                          logging.DEBUG)
        self.logger.log(logging.INFO, "Selecting pairs to use")
        self.logger.debug("len(same) -> {}".format(len(same)))
        self.logger.debug("len(different) -> {}".format(len(diff)))
        len_same = len(same)
        len_diff = len(diff)
        if len_same > len_diff:
            pair_count = int(len_diff * self.dif_same_ratio)
        else:
            pair_count = int(len_same * self.dif_same_ratio)
        self.logger.debug("pair_count -> {}".format(pair_count))

        if self.pair_distribution == "similarity":
            printLogToConsole(self.console_log_level,
                              "Using similarity distribution", logging.INFO)
            self.logger.log(logging.INFO, "Using similarity distribution")
            printLogToConsole(
                self.console_log_level,
                "Similarity distribution is not implemented yet",
                logging.CRITICAL)
            self.logger.log(logging.ERROR,
                            "Similarity distribution is not implemented yet")
            # TODO: Implement similarity distribution
            raise ValueError("Similarity distribution is not implemented yet")
        elif self.pair_distribution == "random":
            printLogToConsole(self.console_log_level, "Using random selection",
                              logging.INFO)
            self.logger.log(logging.INFO, "Using random selection")
            try:
                out_same = random.sample(same, pair_count)
            except ValueError:
                out_same = same[:pair_count]
            try:
                out_diff = random.sample(diff, pair_count)
            except ValueError:
                out_diff = diff[:pair_count]
            return out_same, out_diff
    def _createTrainTest(self, data):
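        """
        Parse the raw comparison data into same/different pairs, optionally
        pickle them, downsample with _selectPairsToUse, and split into
        (train, test, special_train, special_test).
        """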
        printLogToConsole(self.console_log_level, "Creating train and test",
                          logging.INFO)
        self.logger.info("Creating train and test")
        same, different, special_same, special_different = self._parseData(
            data)

        def saveData(d, file_path):
            with open(file_path, "wb") as f:
                to_save = [x for x in d]
                pickle.dump(to_save, f)

        if self.save_data:
            saveData(same, self.save_path + "/same.pickle")
            saveData(different, self.save_path + "/different.pickle")
            saveData(special_same, self.save_path + "/special_same.pickle")
            saveData(special_different,
                     self.save_path + "/special_different.pickle")
            all_pairs = []
            for k, t, _ in [
                    *same, *different, *special_same, *special_different
            ]:
                all_pairs.append([k, t])
            with open(self.save_path + "/save_pairs.pickle", "wb") as f:
                pickle.dump(all_pairs, f)

        same = self.convertToUsable(same)
        different = self.convertToUsable(different)
        special_same = self.convertToUsable(special_same)
        special_different = self.convertToUsable(special_different)
        same, different = self._selectPairsToUse(same, different)
        special_same, special_different = self._selectPairsToUse(
            special_same, special_different)
        train, test = self._splitTrainTest(same, different, special_same,
                                           special_different)
        special_train, special_test = self._splitTrainTest(
            special_same, special_different)

        printLogToConsole(self.console_log_level,
                          "Splitting non-special pairs", logging.INFO)
        self.logger.info("Splitting non-special pairs")

        printLogToConsole(self.console_log_level, "Splitting special pairs",
                          logging.INFO)
        self.logger.info("Splitting special pairs")

        return train, test, special_train, special_test
    def __call__(self,
                 pairs_to_use=None,
                 authors_to_use=None,
                 debug_retrieve_info=None,
                 get_info_all=False,
                 debug_asserts=False):
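        """
        End-to-end pair generation and comparison.

        Gathers author info, builds (or reuses) the tagged pairs, compares each
        pair with CompareAuthors (optionally in parallel), prints run statistics,
        and pickles the tagged results when self.save_data is set.
        """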
        gc.collect()
        total_run_start = time.time()
        if pairs_to_use and authors_to_use:
            self.logger.warning(
                "Both pairs_to_use and authors_to_use were passed, pairs_to_use will override authors_to_use"
            )
        if authors_to_use is None:
            authors_to_use = []
        override_pairs_to_use = False
        if pairs_to_use is None:
            pairs_to_use = []
            override_pairs_to_use = True
        tasks, out, ignored, excluded = self._populateConstants()
        results = []
        paper_auth_info = {}
        """
        Initialize data
        """
        printLogToConsole(self.console_log_level, "Getting author info",
                          logging.INFO)
        self.logger.log(logging.INFO, "Getting author info")
        below_cutoff = 0
        with tqdm(total=tasks, file=sys.stdout) as pbar:
            for i in out:
                add_author = False
                if authors_to_use and i[1] in authors_to_use:
                    add_author = True
                elif len(self.valid_author_papers[i[1]]
                         ) >= self.author_cutoff or i[1] in self.special_keys:
                    add_author = True
                elif len(self.valid_author_papers[i[1]]) < self.author_cutoff:
                    below_cutoff += 1
                if i[1] in self.special_keys:
                    add_author = True

                if add_author:
                    pair_key, res = getAuthorInfo(i)
                    # results.append((pair_key, res))
                    paper_auth_info[pair_key] = res
                pbar.update()
            pbar.close()
        self.logger.debug("{} Authors below cutoff".format(below_cutoff))
        """
        Separate the authors by the first char(s) that appear in their ids, to reduce the number of pointless pairs
        """
        printLogToConsole(self.console_log_level,
                          "Separating keys by chars in name", logging.INFO)
        self.logger.log(logging.INFO, "Separating keys by chars in name")
        separated_keys = self._createPairDict(list(paper_auth_info.keys()),
                                              self.separate_chars,
                                              self.separate_words)
        """
        Create the pairs needed and put the special cases into their own arrays, then select the pairs to use based 
        on the ratio defined in initialization to ensure a controlled number of same to different. This DOES NOT 
        occur to any special cases. If pairs_to_use is defined, skip and use predefined pairs
        """
        if not pairs_to_use:
            printLogToConsole(self.console_log_level, "Creating pairs",
                              logging.INFO)
            self.logger.log(logging.INFO, "Creating pairs")
            same, diff, special_same, special_diff = self._prepareData(
                separated_keys, paper_auth_info, self.algorithm)
            self.logger.debug("len(same) = {}".format(len(same)))
            self.logger.debug("len(different) = {}".format(len(diff)))
            self.logger.debug("len(special_same) = {}".format(
                len(special_same)))
            self.logger.debug("len(special_different) = {}".format(
                len(special_diff)))
            if not get_info_all:
                self.logger.debug("Splitting pairs")
                same, diff = self._selectPairsToUse(same, diff)
            else:
                self.logger.debug("Getting all info")

            pairs_to_use = [*same, *diff, *special_same, *special_diff]
        else:
            printLogToConsole(self.console_log_level, "Using passed pairs",
                              logging.INFO)
            self.logger.log(logging.INFO, "Using passed pairs")
            same = []
            diff = []
            special_same = []
            special_diff = []
            for t, pair_data in pairs_to_use:
                is_special = False
                for special_case in self.special_keys:
                    if special_case in pair_data[
                            1] or special_case in pair_data[2]:
                        is_special = True
                if t == 1:
                    if is_special:
                        special_same.append([t, pair_data])
                    else:
                        same.append([t, pair_data])
                elif t == 0:
                    if is_special:
                        special_diff.append([t, pair_data])
                    else:
                        diff.append([t, pair_data])
        """
        Take the pairs and get the info needed for them. This is done here in order to save runtime memory
        """
        to_use = []
        printLogToConsole(self.console_log_level, "Retrieving info for pairs",
                          logging.INFO)
        self.logger.log(logging.INFO, "Retrieving info for pairs")
        for p in pairs_to_use:
            try:
                tag, pair_info = p
            except ValueError as e:
                self.logger.error(
                    "Error raised when retrieving info for pairs")
                self.logger.error(
                    "Issue with value unpacking for p when iterating over pairs_to_use. expected 2 got {}"
                    .format(len(p)))
                self.logger.error("p: {}".format(p))
                self.logger.exception(e)
                raise e  # just to make warnings stop

            try:
                key, a, b = pair_info
            except ValueError as e:
                self.logger.error("Error raised when unpacking pair info")
                self.logger.error(
                    "Issue with value unpacking for pair_info. expected 3 got {}"
                    .format(len(pair_info)))
                self.logger.error("pair_info: {}".format(pair_info))
                self.logger.exception(e)
                raise e
            to_use.append([key, tag, paper_auth_info[a], paper_auth_info[b]])
        if debug_retrieve_info:
            return to_use
        random.shuffle(to_use)
        """
        Compare those pairs, print out result stats, and pickle the data
        """
        comparator = CompareAuthors(**self.compare_args)
        printLogToConsole(self.console_log_level, "Comparing authors",
                          logging.INFO)
        self.logger.log(logging.INFO, "Comparing authors")
        if self.cores == 1 or len(to_use) < 20000:
            pbar = tqdm(total=len(to_use), file=sys.stdout)
            for i in to_use:
                results.append(comparator(i))
                pbar.update()
            pbar.close()
        else:
            printLogToConsole(
                self.console_log_level,
                "Comparing {} pairs in parallel".format(len(to_use)),
                logging.INFO)
            self.logger.info("Comparing {} pairs in parallel".format(
                len(to_use)))

            batches = chunks(to_use, self.compare_batch_size)
            batch_count = len(to_use) // self.compare_batch_size
            if len(to_use) % self.compare_batch_size != 0:
                batch_count += 1
            self.logger.debug("{} batches".format(batch_count))

            with mp.Pool(self.cores) as pool:
                imap_results = list(
                    tqdm(pool.imap_unordered(comparator.processBatch, batches),
                         total=batch_count,
                         file=sys.stdout))

            self.logger.debug("Combining results from pool")
            for res in imap_results:
                results.extend(res)
        total_run_end = time.time()
        hours, rem = divmod(total_run_end - total_run_start, 3600)
        minutes, seconds = divmod(rem, 60)
        stats = [
            ["Total Pairs Used", len(to_use)],
            ["Same", len(same)],
            ["Different", len(diff)],
            ["Special Same", len(special_same)],
            ["Special Different", len(special_diff)],
        ]
        printStats("Results", stats, line_adaptive=True)
        printLogToConsole(
            self.console_log_level,
            "Total Run time: {:0>2}:{:0>2}:{:05.2f}".format(
                int(hours), int(minutes), seconds), logging.INFO)
        self.logger.info("Total Run Time: {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
        if self.save_data:
            printLogToConsole(self.console_log_level,
                              "Writing author_papers.json",
                              logging.INFO,
                              logger=self.logger)
            with open(self.json_path + "/author_papers.json", "w") as f:
                json.dump(self.all_author_papers, f, indent=4, sort_keys=True)
            printLogToConsole(self.console_log_level,
                              "Pickling results",
                              logging.INFO,
                              logger=self.logger)

            with open(self.pickle_path + "/tagged_pairs.pickle", "wb") as f:
                pickle.dump(results, f)
 def _removeTarget(self):
     self.logger.debug("Received remove command")
     print(
         "INFO: Select the number of the id you would like to remove from targets, enter e to exit"
     )
     if len(self.targets) == 0:
         printLogToConsole(self.console_log_level,
                           "No possible targets to remove",
                           logging.INFO,
                           logger=self.logger)
         return
     while True:
         for i, v in enumerate(self.targets):
             print("INFO: [{}] {}".format(i, v))
         to_remove = input(">>")
         if to_remove == "e":
             self.logger.debug("Exit command received")
             return
         try:
             to_remove = int(to_remove)
         except ValueError:
             printLogToConsole(self.console_log_level,
                               "{} is not valid".format(to_remove),
                               logging.INFO,
                               logger=self.logger)
             continue
         if to_remove < 0:
             printLogToConsole(self.console_log_level,
                               "{} is not valid".format(to_remove),
                               logging.INFO,
                               logger=self.logger)
             continue
         elif to_remove >= len(self.targets):
             printLogToConsole(self.console_log_level,
                               "{} is not valid".format(to_remove),
                               logging.INFO,
                               logger=self.logger)
             continue
         else:
             printLogToConsole(self.console_log_level,
                               "{} is selected to be removed".format(
                                   self.targets[to_remove]),
                               logging.INFO,
                               logger=self.logger)
             if self.confirmAction():
                 self.targets.remove(self.targets[to_remove])
                 printLogToConsole(self.console_log_level,
                                   "Remaining targets:",
                                   logging.INFO,
                                   logger=self.logger)
                 for i, v in enumerate(self.targets):
                     print("INFO: {}".format(v))
                 return
             else:
                 self.logger.debug("User did not confirm action")
    def _addTarget(self, a):
        self.logger.debug("Received add command with arguments {}".format(a))
        valid_author = self._validAuthor(a)
        if valid_author == -1:
            printLogToConsole(self.console_log_level,
                              "{} is not a valid author id".format(a),
                              logging.INFO,
                              logger=self.logger)
        elif valid_author == -2:
            printLogToConsole(self.console_log_level,
                              "{} has no parsed papers".format(a),
                              logging.INFO,
                              logger=self.logger)
        else:
            self._getAuthorInfo(a)
            self.targets.append(a)

            if a not in self.override_authors:
                printLogToConsole(
                    self.console_log_level,
                    "{} does not have specified authors to compare with".
                    format(a),
                    logging.INFO,
                    logger=self.logger)
            else:
                printLogToConsole(self.console_log_level,
                                  "{} has {} authors to compare with".format(
                                      a, len(self.override_authors[a])),
                                  logging.INFO,
                                  logger=self.logger)
            printLogToConsole(self.console_log_level,
                              "{} added to targets".format(a),
                              logging.INFO,
                              logger=self.logger)
            printLogToConsole(self.console_log_level,
                              "{} current targets".format(len(self.targets)),
                              logging.INFO,
                              logger=self.logger)

        return
    def __init__(self, papers=None, author_papers=None, compare_args=None, id_to_name=None,
                 console_log_level=logging.ERROR, file_log_level=logging.DEBUG, log_format=None, log_path=None,
                 save_data=False, ext_directory=False, save_path=None, threshold=.2, name_similarity_cutoff=.92,
                 str_algorithm="jaro-similarity", model=None, model_name="VC1", model_path=None,
                 create_new_author=False, compare_cutoff=3, tie_breaker="max", cores=4, DEBUG_MODE=False,
                 sim_overrides=False, allow_authors_not_in_override=True, same_paper_diff_people=True, use_probabilities=False):
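        """
        Set up logging, load or accept the voting model, validate (or locate on
        disk) the papers, author_papers, and id_to_name data unless DEBUG_MODE
        is set, and record the configuration used for disambiguation.
        """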
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/disambiguation.log"
        self.logger = createLogger("author_disambiguation", log_path, log_format, console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.model = model
        self.model_name = model_name
        if self.model is None:
            if not model_path:
                model_path = os.getcwd()
            with open("{}/models/{}/model.pickle".format(model_path, model_name), "rb") as f:
                self.model = pickle.load(f)
        try:
            if self.model.voting == "hard" and use_probabilities:
                self.logger.warning("hard voting does not support probabilities")
                self.use_probabilities = False
            else:
                self.use_probabilities = use_probabilities
        except Exception as e:
            self.logger.debug("model does not have voting")
            self.use_probabilities = False

        if not DEBUG_MODE:
            # Argument validation
            if compare_args and not isinstance(compare_args, dict):
                self.logger.error("passed compare_args is not valid")
                self.logger.exception(TypeError("compare_args is not a dict"))
                raise TypeError("compare_args is not a dict")
            elif not compare_args:
                self.logger.error("passed compare_args is not valid")
                self.logger.exception(ValueError("compare_args is None"))
                raise ValueError("compare_args is None")
            else:
                self.compare_args = compare_args

            if author_papers and (not isinstance(author_papers, dict) and not isinstance(author_papers, defaultdict)):
                self.logger.error("passed author_papers is not valid")
                self.logger.error("type is {}".format(type(author_papers)))
                self.logger.exception(TypeError("author_papers is not a dict"))
                raise TypeError("author_papers is not a dict")
            elif not author_papers:
                author_papers, status, error_msg = self._findData("author_papers.json")
                if status != 0:
                    self.logger.error(
                        "passed author_papers is not valid and could not find the file author_papers.json")
                    self.logger.error("self._findData(\"author_papers.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid author_papers found"))
                    raise ValueError("No valid author_papers found")
                else:
                    self.author_papers = deepcopy(author_papers)
            else:
                self.author_papers = deepcopy(author_papers)

            if papers and not isinstance(papers, dict):
                self.logger.error("passed papers is not valid")
                self.logger.exception(TypeError("papers is not a dict"))
                raise TypeError("papers is not a dict")
            elif not papers:
                papers, status, error_msg = self._findData("parsed_papers.json")
                if status != 0:
                    self.logger.error("passed papers is not valid and could not find the file parsed_papers.json")
                    self.logger.error("self._findData(\"parsed_papers.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid parsed_papers found"))
                    raise ValueError("No valid parsed_papers found")
                else:
                    if len(papers) == 0:
                        self.logger.exception(ValueError("Found papers is empty"))
                        raise ValueError("Found papers is empty")
                    self.logger.debug("Converting papers from dict to Paper object")
                    self.papers = {}
                    for k, info in papers.items():
                        self.papers[k] = Paper(**info)

            else:
                if len(papers) == 0:
                    self.logger.exception(ValueError("Passed papers is empty"))
                    raise ValueError("Passed papers is empty")
                test_value = papers[list(papers.keys())[0]]
                if isinstance(test_value, dict):
                    self.papers = {}
                    for k, info in papers.items():
                        try:
                            self.papers[k] = Paper(**info)
                        except Exception as e:
                            self.logger.error("Exception raised when converting paper dicts to Paper")
                            self.logger.error("k={}".format(k))
                            self.logger.error("info={}".format(info))
                            self.logger.exception(e)
                            raise e
                else:
                    self.papers = papers

            if id_to_name and not isinstance(id_to_name, dict):
                self.logger.error("passed id_to_name is not valid")
                self.logger.exception(TypeError("id_to_name is not a dict"))
                raise TypeError("id_to_name is not a dict")
            elif not id_to_name:
                id_to_name, status, error_msg = self._findData("id_to_name.json")
                if status != 0:
                    self.logger.error("passed id_to_name is not valid and could not find the file parsed_papers.json")
                    self.logger.error("self._findData(\"id_to_name.json\") returned error {}".format(error_msg))
                    self.logger.exception(ValueError("No valid id_to_name found"))
                    raise ValueError("No valid id_to_name found")
                else:
                    if len(id_to_name) == 0:
                        self.logger.exception(ValueError("Found id_to_name is empty"))
                        raise ValueError("Found id_to_name is empty")
                    self.id_to_name = id_to_name

            else:
                if len(id_to_name) == 0:
                    self.logger.exception(ValueError("Passed id_to_name is empty"))
                    raise ValueError("Passed id_to_name is empty")
                self.id_to_name = id_to_name
        else:
            printLogToConsole(self.console_log_level, "RUNNING IN DEBUG_MODE!", logging.WARNING)
            self.logger.warning("Running in DEBUG_MODE")
            self.id_to_name = id_to_name if id_to_name else {}
            self.papers = papers if papers else {}
            self.compare_args = compare_args if compare_args else {}
            self.author_papers = author_papers if author_papers else {}
        self.compare_terms = len(CompareAuthors.compare_terms)
        self.save_data = save_data
        self.save_dir = save_path
        self.ext_directory = ext_directory
        self.threshold = threshold
        self.name_similarity_cutoff = name_similarity_cutoff
        algo_name, measure = str_algorithm.split("-")
        self.author_name = {x: nameFromDict(self.id_to_name[x]) for x in self.id_to_name.keys()}
        self.cores = cores
        self.str_algorithm = getAlgo(algo_name, measure)
        self.create_new_author = create_new_author
        self.compare_cutoff = compare_cutoff
        self.tie_breaker = tie_breaker
        self.sim_overrides = sim_overrides
        self.allow_authors_not_in_override = allow_authors_not_in_override
        self.same_paper_diff_people = same_paper_diff_people
        self.logger.debug("AuthorDisambiguation initialized with arguments:")
        self.logger.debug("\tcompare_args={}".format(list(self.compare_args.keys())))
        self.logger.debug("\talgorithm={}".format(algo_name))
        self.logger.debug("\tmeasure={}".format(measure))
        self.logger.debug("\tthreshold={}".format(threshold))
        self.logger.debug("\tname_similarity_cutoff={}".format(name_similarity_cutoff))
        self.logger.debug("\tunique authors={}".format(len(self.author_papers)))
        self.logger.debug("\tcompare_cutoff={}".format(self.compare_cutoff))
        self.logger.debug("\ttie_breaker={}".format(self.tie_breaker))
        self.logger.debug("\tsim_overrides={}".format(self.sim_overrides))
        self.logger.debug("\tsame_paper_diff_people={}".format(self.same_paper_diff_people))
        self.logger.debug("\tuse_probabilities={}".format(self.use_probabilities))
        if self.compare_cutoff != 3:
            self.logger.warning("Non-default value for compare_cutoff, currently this is not implemented")
    def evaluate(self):
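        """
        Report precision/recall/F1 for each individual estimator (when
        self.train_all_estimators is set) and a classification_report for the
        VotingClassifier on the test set and, unless self.special_only, on the
        special-case test set.
        """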
        printLogToConsole(self.console_log_level, "Evaluating model",
                          logging.INFO)
        self.logger.info("Evaluating model")
        if self.train_all_estimators:
            predictions = {}
            special_predictions = {}
            for n, m in self.estimators:
                self.logger.debug("Making predictions for {}".format(n))
                predictions[n] = m.predict(self.test["X"])
                special_predictions[n] = m.predict(self.special_test["X"])

            printLogToConsole(self.console_log_level,
                              "Results for all estimators", logging.INFO)
            self.logger.info("Results for all estimators")
            if not self.special_only:
                printLogToConsole(
                    self.console_log_level,
                    "First stat line is on normal test, second is for special cases",
                    logging.INFO)
            column_str = "{} {:>11} {:>11} {:>11}".format(
                " " * 25, "precision", "recall", "f1-score")
            printLogToConsole(self.console_log_level, column_str, logging.INFO)
            self.logger.info(column_str)
            for k, pred in predictions.items():
                precision, recall, _, _ = precision_recall_fscore_support(
                    self.test["Y"], pred, average="binary")
                f1 = f1_score(self.test["Y"], pred, average="binary")
                stat_str = "{:<25} {:>11.2f} {:>11.2f} {:>11.2f}".format(
                    k + ":", precision, recall, f1)
                printLogToConsole(self.console_log_level, stat_str,
                                  logging.INFO)
                self.logger.info(stat_str)
                if self.special_only:
                    continue
                precision, recall, _, _ = precision_recall_fscore_support(
                    self.special_test["Y"],
                    special_predictions[k],
                    average="binary")
                f1 = f1_score(self.special_test["Y"],
                              special_predictions[k],
                              average="binary")
                stat_str = "{:<25} {:>11.2f} {:>11.2f} {:>11.2f}".format(
                    k + ":", precision, recall, f1)
                printLogToConsole(self.console_log_level, stat_str,
                                  logging.INFO)
                self.logger.info(stat_str)

        model_predictions = self.model.predict(self.test["X"])
        printLogToConsole(self.console_log_level, "Model stats on test data:",
                          logging.INFO)
        self.logger.info("Model stats on test data")
        stats = classification_report(self.test["Y"],
                                      model_predictions,
                                      target_names=["Different", "Same"])
        print(stats)
        self.logger.info(stats)
        if not self.special_only:
            model_predictions = self.model.predict(self.special_test["X"])
            printLogToConsole(self.console_log_level,
                              "Model stats on special cases data:",
                              logging.INFO)
            self.logger.info("Model stats on special cases data")
            stats = classification_report(self.special_test["Y"],
                                          model_predictions,
                                          target_names=["Different", "Same"])
            print(stats)
            self.logger.info(stats)
    def _makeAmbiguousAuthors(self, has_authors, needs_authors, override_authors):
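        # Pops the papers of every ambiguous author id out of self.author_papers,
        # resolves their display names, and collects the known author ids each
        # one should be compared against: taken from override_authors for ids in
        # has_authors, and found via a parallel name-similarity search for ids in
        # needs_authors. Ids whose name cannot be resolved, or that end up with
        # no similar authors, are returned in the excluded list.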
        ambiguous_author_papers = defaultdict(list)
        ambiguous_author_names = dict()
        authors_get_info = list()
        check_author_keys = defaultdict(list)
        excluded = []
        for i in [*has_authors, *needs_authors]:
            ambiguous_author_papers[i] = self.author_papers.pop(i)
            try:
                ambiguous_author_names[i] = cleanName(nameFromDict(self.id_to_name[i])).lower()
                del self.author_name[i]
            except KeyError as e:
                self.logger.warning("{} is not in id_to_name".format(i))
                excluded.append(i)

        for a in has_authors:
            if a in excluded:
                self.logger.debug("Skipping {} because it is in excluded".format(a))
                continue
            authors_get_info.extend(override_authors[a])
            check_author_keys[a] = self._makeCheckAuthors(override_authors[a])
        args = []
        for a in needs_authors:
            if a in excluded:
                self.logger.debug("Skipping {} because it is in excluded".format(a))
                continue
            args.append([a, ambiguous_author_names[a], self.author_name, self.str_algorithm, self.name_similarity_cutoff,
                         self.sim_overrides])
        printLogToConsole(self.console_log_level, "Getting similar authors in parallel with {} cores".format(self.cores),
                          logging.INFO)
        self.logger.info("Getting similar authors in parallel with {} cores".format(self.cores))
        sim_authors = []
        with mp.Pool(self.cores) as Pool:
            imap_results = list(tqdm(Pool.imap_unordered(self._getSimilarAuthors, args), total=len(args), file=sys.stdout))
            for target, auth, warnings, debug in imap_results:
                self.logger.debug("Adding authors from {}".format(target))
                self.logger.debug("len(auth)={}".format(len(auth)))
                sim_authors.append([target, auth])
                for i in warnings:
                    self.logger.warning(i)
                for i in debug:
                    self.logger.debug(i)
        self.logger.debug("len(sim_authors)={}".format(len(sim_authors)))

        pbar = tqdm(total=len(sim_authors), file=sys.stdout)
        for a, auths in sim_authors:

            if a in override_authors:
                error_msg = "{} is in needs_authors, but is already in override_authors".format(a)
                self.logger.error(error_msg)
                raise ValueError(error_msg)
            pbar.write("INFO: Checking similar authors to {}".format(a))
            self.logger.info("Checking similar authors to {}".format(a))
            if len(auths) == 0:
                self.logger.warning("{} has no similar authors".format(a))
                excluded.append(a)
            else:
                authors_get_info.extend(auths)
                check_author_keys[a] = self._makeCheckAuthors(auths)
                if len(check_author_keys[a]) == 0:
                    self.logger.debug("{} had at least 1 similar author, but nothing in check author keys".format(a))
            pbar.update()
        pbar.close()
        authors_get_info = list(set(authors_get_info))
        return ambiguous_author_papers, ambiguous_author_names, check_author_keys, authors_get_info, excluded
    def _makeCombinations(self, i, special_cases=None, use_cutoff=True):
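        # Builds every unique pair of "paper-id author-id" keys from the passed
        # author infos, routing pairs where both authors are special cases into
        # their own list, then filters the remaining pairs with checkPair (either
        # serially or in multiprocessing batches) and returns the survivors split
        # into the two groups checkPair tags as "same" and "different".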
        if not special_cases:
            special_cases = []
        self.logger.debug("len(special_cases)={}".format(len(special_cases)))
        infos = {x[0]: x[1] for x in i}
        keys = list(infos.keys())
        same = {}
        different = {}
        name_cutoff = self.name_similarity_cutoff if use_cutoff else 0
        combinations = []
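        # ncr is assumed to be an n-choose-r helper; ncr(len(keys), 2) sizes the
        # progress bar to the number of unique pairs generated below.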
        pair_creator_pbar = tqdm(total=int(ncr(len(keys), 2)), file=sys.stdout)
        max_size_allocate = 0
        estimated_size_to_allocate = 0
        special_cases_combos = []
        for idx, a in enumerate(keys):
            for b in keys[1 + idx:]:
                a_paper, a_id = a.split(" ")
                b_paper, b_id = b.split(" ")
                if a_id in special_cases and b_id in special_cases:
                    special_cases_combos.append(
                        [a, b, self.algorithm, special_cases, name_cutoff])
                    continue
                combo_to_add = [
                    a, b, self.algorithm, special_cases, name_cutoff
                ]
                combo_size = sys.getsizeof(combo_to_add)
                combinations.append(combo_to_add)
                max_size_allocate = max(max_size_allocate, combo_size)
                estimated_size_to_allocate += combo_size
                pair_creator_pbar.update()
        pair_creator_pbar.close()
        self.logger.debug("{} combinations".format(len(combinations)))
        self.logger.debug("{} special combinations".format(len(special_cases_combos)))
        self.logger.debug("Max combination size {} bytes".format(max_size_allocate))
        self.logger.debug("Size of combinations {}".format(size(estimated_size_to_allocate, si)))

        printLogToConsole(self.console_log_level,
                          "Removing pairs that are not valid", logging.INFO)
        self.logger.info("Removing pairs that are not valid")
        total_combinations = len(combinations)
        if self.cores == 1 or total_combinations < self.min_batch_len:
            if total_combinations < self.min_batch_len:
                self.logger.debug(
                    "total combinations is less than min batch length({} < {})"
                    .format(total_combinations, self.min_batch_len))
            pbar = tqdm(total=total_combinations, file=sys.stdout)
            for combo in combinations:
                res = checkPair(combo)
                if res:
                    tag, res = res
                    if tag == 1:
                        same[res[0]] = res
                    else:
                        different[res[0]] = res
                pbar.update()
            pbar.close()
        else:
            # Put the pairs into batches so they can be used in parallel, otherwise the overhead is too much
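            # chunks is assumed to yield successive batch_size-length slices of
            # the combinations list, so each pool task handles a whole slice at
            # once instead of a single pair.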
            batches = chunks(combinations, self.batch_size)
            batch_count = total_combinations // self.batch_size
            tmp_same = []
            tmp_different = []
            possible_errors = 0
            if total_combinations % self.batch_size != 0:
                batch_count += 1
            self.logger.debug("{} total batches".format(batch_count))
            imap_results = []
            t0 = time.time()
            with mp.Pool(self.cores) as Pool:
                try:
                    imap_results = list(
                        tqdm(Pool.imap_unordered(self._batchCheckPair,
                                                 batches),
                             total=batch_count,
                             file=sys.stdout))
                except Exception as e:
                    print()
                    self.logger.exception(
                        "Exception raised when putting batches into pool",
                        exc_info=e)
                    raise e
            t1 = time.time()
            self.logger.debug("{:.2f} combos/second".format(
                total_combinations / (t1 - t0)))
            if not imap_results:
                printLogToConsole(self.console_log_level,
                                  "imap_results is empty", logging.ERROR)
                self.logger.error("imap_results is empty")
                return [], []
            for s, d in imap_results:
                tmp_same.extend(s)
                tmp_different.extend(d)

                # tmp_same = list(set(tmp_same))
                # tmp_different = list(set(tmp_different))
            printLogToConsole(self.console_log_level,
                              "Combining results from pool", logging.INFO)
            self.logger.log(logging.INFO, "Combining results from pool")
            for i in tmp_same:
                if i[0] in same:
                    possible_errors += 1
                same[i[0]] = i
            for i in tmp_different:
                if i[0] in different:
                    possible_errors += 1
                different[i[0]] = i
            printLogToConsole(self.console_log_level,
                              "{} overlapping keys".format(possible_errors),
                              logging.DEBUG)
            self.logger.debug("{} overlapping keys".format(possible_errors))
        gc.collect()
        self.logger.debug("Removed {} pairs".format(total_combinations - (len(same) + len(different))))
        return list(same.values()), list(different.values())