Example #1
    def train(self, recall=0.95, index_predicates=True):  # pragma: no cover
        """
        Keyword arguments:

        recall -- The proportion of true dupe pairs in our training
                  data that the learned blocks must cover. If we lower
                  the recall, there will be pairs of true dupes that
                  we will never directly compare.

                  recall should be a float between 0.0 and 1.0, the default
                  is 0.95

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index predicates can
                            be slower and take substantial memory.

                            Defaults to True.
        """
        examples, y = flatten_training(self.training_pairs)
        self.classifier.fit(self.data_model.distances(examples), y)

        self.predicates = self.active_learner.learn_predicates(
            recall, index_predicates)
        self.blocker = blocking.Blocker(self.predicates)
        self.blocker.resetIndices()
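
For context, a rough sketch of how this train call is usually reached on a dedupe.Dedupe matcher; the field definition, the toy my_data dictionary, and the console-labelling step below are illustrative assumptions, not part of the example above:

import dedupe

# Hypothetical field definition and records, for illustration only.
fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String'}]
my_data = {1: {'name': 'Acme Corp', 'address': '1 Main St'},
           2: {'name': 'ACME Corporation', 'address': '1 Main Street'}}

deduper = dedupe.Dedupe(fields)
deduper.sample(my_data, 15000)   # draw candidate pairs to label
dedupe.consoleLabel(deduper)     # label pairs as duplicate / distinct

# Require the learned blocks to cover 95% of the labelled dupe pairs,
# and skip the slower, memory-hungry index predicates.
deduper.train(recall=0.95, index_predicates=False)
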
Example #2
    def __init__(self,
                 settings_file,
                 num_cores=None):  # pragma: no cover
        """
        Initialize from a settings file.

        #### Example usage

            # initialize from a settings file
            with open('my_learned_settings', 'rb') as f:
                deduper = dedupe.StaticDedupe(f)

        #### Keyword arguments

        `settings_file`
        A file object containing settings data.

        Settings files are typically generated by saving the settings
        learned from ActiveMatching. If you need details for this
        file, see the method [`writeSettings`][[api.py#writesettings]].
        """
        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        try:
            self.data_model = pickle.load(settings_file)
            self.classifier = pickle.load(settings_file)
            self.predicates = pickle.load(settings_file)
        except (KeyError, AttributeError):
            raise SettingsFileLoadingException(
                "This settings file is not compatible with "
                "the current version of dedupe. This can happen "
                "if you have recently upgraded dedupe.")
        except Exception:
            raise SettingsFileLoadingException(
                "Something has gone wrong with loading the settings file. "
                "Try deleting the file")

        self.loaded_indices = False

        try:
            self._loadIndices(settings_file)
        except EOFError:
            pass
        except (KeyError, AttributeError):
            raise SettingsFileLoadingException(
                "This settings file is not compatible with "
                "the current version of dedupe. This can happen "
                "if you have recently upgraded dedupe.")
        except Exception:
            raise SettingsFileLoadingException(
                "Something has gone wrong with loading the settings file. "
                "Try deleting the file")

        logger.info(self.predicates)

        self.blocker = blocking.Blocker(self.predicates)
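
Since the docstring above points to writeSettings as the source of the settings file, here is a minimal sketch of the save/load round trip, assuming a matcher that has already been trained (for instance the deduper from the sketch under Example #1); the file name is arbitrary:

import dedupe

# Persist the learned data model, classifier and predicates.
with open('my_learned_settings', 'wb') as f:
    deduper.writeSettings(f)

# Later, rebuild a matcher from the saved settings without retraining,
# optionally pinning the number of worker processes.
with open('my_learned_settings', 'rb') as f:
    static_deduper = dedupe.StaticDedupe(f, num_cores=2)
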
Example #3
    def _trainBlocker(self, recall, index_predicates):  # pragma: no cover
        # Learn blocking predicates that cover the labelled matches at the
        # requested recall, then rebuild the blocker from them.
        matches = self.training_pairs['match'][:]

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        block_learner = self._blockLearner(predicate_set)

        self.predicates = block_learner.learn(matches, recall)

        self.blocker = blocking.Blocker(self.predicates)
Example #4
    def _trainBlocker(self,
                      ppc=1,
                      uncovered_dupes=1,
                      index_predicates=True):  # pragma: no cover
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample, self.data_model, sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = predicateGenerator(self.data_model, index_predicates)

        (self.predicates, self.stop_words) = dedupe.training.blockTraining(
            training_pairs, predicate_set, ppc, uncovered_dupes,
            self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates, self.stop_words)
Example #5
    def _trainBlocker(self, ppc, uncovered_dupes,
                      index_predicates):  # pragma: no cover
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        self.predicates = dedupe.training.blockTraining(
            training_pairs, predicate_set, ppc, uncovered_dupes,
            self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)
Example #6
    def blockingFunction(self,
                         constrained_matching=False,
                         ppc=1,
                         uncovered_dupes=1):
        """
        Returns a function that takes in a record dictionary and
        returns a list of blocking keys for the record. We will
        learn the best blocking predicates if we don't have them already.

        Keyword arguments:
        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a fraction
               of possible pairs greater than the ppc, that predicate will
               be removed from consideration.

               As the size of the data increases, the user will generally
               want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our training
                           data that we can accept will not be put into any
                           block. If true duplicates are never in the
                           same block, we will never compare them, and may
                           never declare them to be duplicates.

                           However, requiring that we cover every single
                           true dupe pair may mean that we have to use
                           blocks that put together many, many distinct
                           pairs that we'll have to expensively compare
                           as well.

        """

        if not self.predicates:
            self.predicates = self._learnBlocking(ppc, uncovered_dupes,
                                                  constrained_matching)

        blocker = blocking.Blocker(self.predicates)

        return blocker
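
To make the ppc / uncovered_dupes trade-off described in this docstring concrete, here is a hedged sketch of two ways the method might be called on a trained matcher (again assuming a deduper object like the one in the earlier sketches); the specific values are illustrative, not recommendations:

# Strict blocking for larger data sets: drop any predicate that covers
# more than 0.5% of all possible pairs, and accept that one true dupe
# pair from the training data may end up in no block at all.
strict_blocker = deduper.blockingFunction(ppc=0.005, uncovered_dupes=1)

# Looser blocking for small data sets: allow broad predicates and insist
# that every labelled dupe pair shares at least one block.
loose_blocker = deduper.blockingFunction(ppc=1.0, uncovered_dupes=0)
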