def train(self, recall=0.95, index_predicates=True):  # pragma: no cover
    """
    Learn the final pairwise classifier and the blocking rules.

    Fits the classifier on the distances of the flattened training
    pairs, then asks the active learner for blocking predicates that
    cover the training matches at the requested recall.

    Keyword arguments:

    recall -- The proportion of true dupe pairs in our training
              data that the learned blocks must cover. If we lower
              the recall, there will be pairs of true dupes that we
              will never directly compare.

              recall should be a float between 0.0 and 1.0, the
              default is 0.95

    index_predicates -- Should dedupe consider predicates that
                        rely upon indexing the data. Index predicates
                        can be slower and take substantial memory.
                        Defaults to True.
    """
    examples, y = flatten_training(self.training_pairs)
    self.classifier.fit(self.data_model.distances(examples), y)

    self.predicates = self.active_learner.learn_predicates(
        recall, index_predicates)
    self.blocker = blocking.Blocker(self.predicates)
    # Drop any indices built during active learning so they can be
    # rebuilt fresh for blocking.
    self.blocker.resetIndices()
def __init__(self, settings_file, num_cores=None):  # pragma: no cover
    """
    Initialize from a settings file.

    #### Example usage

        # initialize from a settings file
        with open('my_learned_settings', 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

    #### Keyword arguments

    `settings_file`
    A file object containing settings data.

    Settings files are typically generated by saving the settings
    learned from ActiveMatching. If you need details for this
    file see the method [`writeSettings`][[api.py#writesettings]].

    Raises SettingsFileLoadingException if the file is from an
    incompatible dedupe version or is otherwise unreadable.
    """
    if num_cores is None:
        self.num_cores = multiprocessing.cpu_count()
    else:
        self.num_cores = num_cores

    try:
        # NOTE(security): pickle.load can execute arbitrary code; only
        # load settings files from trusted sources.
        self.data_model = pickle.load(settings_file)
        self.classifier = pickle.load(settings_file)
        self.predicates = pickle.load(settings_file)
    except (KeyError, AttributeError):
        raise SettingsFileLoadingException(
            "This settings file is not compatible with "
            "the current version of dedupe. This can happen "
            "if you have recently upgraded dedupe.")
    except Exception as e:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; chain the cause for debuggability.
        raise SettingsFileLoadingException(
            "Something has gone wrong with loading the settings file. "
            "Try deleting the file") from e

    self.loaded_indices = False

    try:
        self._loadIndices(settings_file)
    except EOFError:
        # Older settings files may simply not contain index data.
        pass
    except (KeyError, AttributeError):
        raise SettingsFileLoadingException(
            "This settings file is not compatible with "
            "the current version of dedupe. This can happen "
            "if you have recently upgraded dedupe.")
    except Exception as e:
        raise SettingsFileLoadingException(
            "Something has gone wrong with loading the settings file. "
            "Try deleting the file") from e

    logger.info(self.predicates)

    self.blocker = blocking.Blocker(self.predicates)
def _trainBlocker(self, recall, index_predicates):  # pragma: no cover
    """
    Learn blocking predicates covering the matched training pairs at
    the requested recall, then build a Blocker from them.
    """
    # Work on a copy so block learning cannot mutate the stored pairs.
    positive_pairs = list(self.training_pairs['match'])

    candidate_predicates = self.data_model.predicates(index_predicates,
                                                      self.canopies)

    learner = self._blockLearner(candidate_predicates)
    self.predicates = learner.learn(positive_pairs, recall)

    self.blocker = blocking.Blocker(self.predicates)
def _trainBlocker(self, ppc=1, uncovered_dupes=1, index_predicates=True):  # pragma: no cover
    """
    Learn blocking predicates (and stop words) from the training
    pairs, then build a Blocker from them.

    The distinct side of the training data is augmented with
    confidently-distinct pairs sampled from the data before block
    training runs.
    """
    pairs = copy.deepcopy(self.training_pairs)

    distinct_pairs = training.semiSupervisedNonDuplicates(
        self.data_sample, self.data_model, sample_size=32000)
    pairs[u'distinct'].extend(distinct_pairs)

    candidate_predicates = predicateGenerator(self.data_model,
                                              index_predicates)

    learned = dedupe.training.blockTraining(pairs,
                                            candidate_predicates,
                                            ppc,
                                            uncovered_dupes,
                                            self._linkage_type)
    self.predicates, self.stop_words = learned

    self.blocker = blocking.Blocker(self.predicates,
                                    self.stop_words)
def _trainBlocker(self, ppc, uncovered_dupes, index_predicates):  # pragma: no cover
    """
    Learn blocking predicates from the training pairs, then build a
    Blocker from them.

    The distinct side of the training data is augmented with
    confidently-distinct pairs sampled from the data (scored with the
    current classifier) before block training runs.
    """
    pairs = copy.deepcopy(self.training_pairs)

    distinct_pairs = training.semiSupervisedNonDuplicates(
        self.data_sample, self.data_model, self.classifier,
        sample_size=32000)
    pairs[u'distinct'].extend(distinct_pairs)

    candidate_predicates = self.data_model.predicates(index_predicates,
                                                      self.canopies)

    self.predicates = dedupe.training.blockTraining(pairs,
                                                    candidate_predicates,
                                                    ppc,
                                                    uncovered_dupes,
                                                    self._linkage_type)

    self.blocker = blocking.Blocker(self.predicates)
def blockingFunction(self, constrained_matching=False, ppc=1,
                     uncovered_dupes=1):
    """
    Returns a function that takes in a record dictionary and
    returns a list of blocking keys for the record. We will
    learn the best blocking predicates if we don't have them already.

    Keyword arguments:

    ppc -- Limits the Proportion of Pairs Covered that we allow a
           predicate to cover. If a predicate puts together a
           fraction of possible pairs greater than the ppc, that
           predicate will be removed from consideration.

           As the size of the data increases, the user will
           generally want to reduce ppc.

           ppc should be a value between 0.0 and 1.0

    uncovered_dupes -- The number of true dupe pairs in our training
                       data that we can accept will not be put into
                       any block. If true duplicates are never in
                       the same block, we will never compare them,
                       and may never declare them to be duplicates.

                       However, requiring that we cover every single
                       true dupe pair may mean that we have to use
                       blocks that put together many, many distinct
                       pairs that we'll have to expensively compare
                       as well.
    """
    # Learn predicates lazily: only if we don't already have them.
    if not self.predicates:
        self.predicates = self._learnBlocking(ppc,
                                              uncovered_dupes,
                                              constrained_matching)

    return blocking.Blocker(self.predicates)