def train_linker(df1, df2):
    """Build and train a record linker over the two record collections.

    Either restores a fully-trained ``dedupe.StaticRecordLink`` from
    SETTINGS_FILE, or trains a fresh ``dedupe.RecordLink``: sample pairs
    from ``df1``/``df2``, label them (from TRAINING_FILE or interactively
    on the console), train, and optionally persist settings/training.

    Returns the ready-to-use linker.
    """
    if READ_FROM_SETTINGS_FILE:
        # A static linker is already trained; nothing further to do.
        with open(SETTINGS_FILE, 'rb') as settings_handle:
            return dedupe.StaticRecordLink(settings_handle)

    linker = dedupe.RecordLink(FIELDS)
    # It's terrible that you have to do this next line!!!!
    linker.classifier = rlr.RegularizedLogisticRegression()
    linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

    if READ_FROM_TRAINING_FILE:
        print('reading labeled examples from ', TRAINING_FILE)
        with open(TRAINING_FILE, 'rb') as training_handle:
            linker.readTraining(training_handle)
    else:
        # No saved labels: ask a human to label pairs interactively.
        dedupe.consoleLabel(linker)

    linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as settings_handle:
            linker.writeSettings(settings_handle)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as training_handle:
            linker.writeTraining(training_handle)

    return linker
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, where it was a bare string expression and never became
    # __doc__; it is now the first statement in the class body.

    classifier = rlr.RegularizedLogisticRegression()
    ActiveLearner = labeler.RLRLearner

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            deduper = dedupe.Dedupe(fields)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <https://dedupe.io/developers/library>`_
        """
        self.data_model = datamodel.DataModel(variable_definition)

        if data_sample is not None:
            # data_sample is accepted only so old call sites fail loudly.
            raise UserWarning(
                'data_sample is deprecated, use the .sample method')

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        # Created lazily (presumably by a sample/prepare step elsewhere in
        # the file) -- TODO confirm where active_learner gets assigned.
        self.active_learner = None

        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None
        self.loaded_indices = False

    def cleanupTraining(self):  # pragma: no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_pairs
        del self.active_learner

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')
        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)
        self.markPairs(training_pairs)

    def train(self, recall=0.95, index_predicates=True):  # pragma: no cover
        """
        Fit the classifier on the labeled pairs, then learn blocking rules.

        Keyword arguments:

        recall -- The proportion of true dupe pairs in our training
                  data that the learned blocks must cover. If we lower
                  the recall, there will be pairs of true dupes that
                  we will never directly compare.

                  recall should be a float between 0.0 and 1.0, the
                  default is 0.95

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        examples, y = flatten_training(self.training_pairs)
        self.classifier.fit(self.data_model.distances(examples), y)

        self._trainBlocker(recall, index_predicates)

    def _trainBlocker(self, recall, index_predicates):  # pragma: no cover
        # Learn a set of blocking predicates that covers the labeled
        # matches at the requested recall.
        matches = self.training_pairs['match'][:]

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        block_learner = self._blockLearner(predicate_set)

        self.predicates = block_learner.learn(matches, recall)

        self.blocker = blocking.Blocker(self.predicates)

    def writeTraining(self, file_obj):  # pragma: no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        return self.active_learner.get()

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs to the training set and, if an active learner
        is attached, feed the new labels to it.

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        self._checkTrainingPairs(labeled_pairs)

        for label, examples in labeled_pairs.items():
            self.training_pairs[label].extend(examples)

        if self.active_learner:
            examples, y = flatten_training(labeled_pairs)
            self.active_learner.mark(examples, y)

    def _checkTrainingPairs(self, labeled_pairs):
        # Validate the shape of a labeled_pairs argument; raises
        # ValueError on malformed input.
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except (AttributeError, KeyError, TypeError):
            # Previously a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; only the failures that
            # signal a malformed argument are converted now.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPair(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPair(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

    def _checkRecordPair(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dicts and
        # that both records satisfy the data model.
        try:
            record_pair[0]
        except (TypeError, KeyError, IndexError):
            # was a bare `except:`; these are the failures of indexing
            # a non-sequence / empty / keyless object
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")

        try:
            record_pair[0].keys() and record_pair[1].keys()
        except AttributeError:
            # was a bare `except:`; only a non-mapping lacks .keys()
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])
class ActiveMatching(Matching):
    # Shared default classifier; subclasses/instances may replace it.
    classifier = rlr.RegularizedLogisticRegression()

    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           (
                            (854, {'city': 'san francisco',
                                   'address': '300 de haro st.',
                                   'name': "sally's cafe & bakery",
                                   'cuisine': 'american'}),
                            (855, {'city': 'san francisco',
                                   'address': '1328 18th st.',
                                   'name': 'san francisco bbq',
                                   'cuisine': 'thai'})
                            )
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <http://dedupe.readthedocs.org>`_

        In the data_sample, each element is a tuple of two records. Each
        record is, in turn, a tuple of the record's key and a record
        dictionary. In the record dictionary the keys are the names of the
        record field and values are the record values.
        """
        self.data_model = datamodel.DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        if data_sample:
            self._checkDataSample(data_sample)
            self.data_sample = data_sample
            self.activeLearner = training.ActiveLearning(self.data_sample,
                                                         self.data_model,
                                                         self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        # Override _loadSampledRecords() to load blocking data from
        # data_sample.
        self._loadSampledRecords(data_sample)

        # Structured array layout: a label string plus one distance per
        # data-model field for each training pair.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None
        self.loaded_indices = False

    def cleanupTraining(self):  # pragma: no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            if examples:
                # Only the first pair of each label is shape-checked.
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=None, uncovered_dupes=None,
              maximum_comparisons=1000000, recall=0.95,
              index_predicates=True):  # pragma: no cover
        """Keyword arguments:

        maximum_comparisons -- The maximum number of comparisons a
                               blocking rule is allowed to make.

                               Defaults to 1000000

        recall -- The proportion of true dupe pairs in our training
                  data that the learned blocks must cover. If we
                  lower the recall, there will be pairs of true dupes
                  that we will never directly compare.

                  recall should be a float between 0.0 and 1.0, the
                  default is 0.95

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.

        ppc and uncovered_dupes are deprecated and ignored apart from
        emitting a warning.
        """
        if ppc is not None:
            warnings.warn('`ppc` is a deprecated argument to train. '
                          'Use `maximum_comparisons` to set the maximum '
                          'number records a block is allowed to cover')
        if uncovered_dupes is not None:
            warnings.warn('`uncovered_dupes` is a deprecated argument '
                          'to train. Use recall to set the proportion '
                          'of true pairs that the blocking rules must cover')

        self._trainClassifier()
        self._trainBlocker(maximum_comparisons, recall, index_predicates)

    def _trainClassifier(self, **kwargs):  # pragma: no cover
        # Fit the classifier on the accumulated distance vectors; extra
        # kwargs are forwarded only if the classifier's fit() accepts them.
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='int8')
        examples = self.training_data['distances']

        classifier_args = backport.signature(self.classifier.fit).parameters
        classifier_args = {k: kwargs[k]
                           for k in viewkeys(kwargs) & classifier_args}

        self.classifier.fit(examples, labels, **classifier_args)

    def _trainBlocker(self, maximum_comparisons, recall,
                      index_predicates):  # pragma: no cover
        # Learn blocking predicates that cover the labeled matches.
        matches = self.training_pairs['match'][:]

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        block_learner = self._blockLearner(predicate_set)

        self.predicates = block_learner.learn(matches,
                                              maximum_comparisons,
                                              recall)

        self.blocker = blocking.Blocker(self.predicates)

    def writeTraining(self, file_obj):  # pragma: no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: seed training with one synthetic exact match and
            # one random pair treated as distinct.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({u'match': [exact_match, exact_match],
                                   u'distinct': [random_pair]})

        self._trainClassifier(cv=0)

        # Blend the observed match proportion toward 0.5 when we have few
        # examples, so early uncertainty sampling is not dominated by a
        # lopsided label count.
        bias = len(self.training_pairs[u'match'])
        if bias:
            bias /= (bias + len(self.training_pairs[u'distinct']))

        min_examples = min(len(self.training_pairs[u'match']),
                           len(self.training_pairs[u'distinct']))

        regularizer = 10
        bias = ((0.5 * min_examples + bias * regularizer) /
                (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add a labeled pairs of record to dedupes training set and update
        the matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except:
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dictionaries
        # and that both records satisfy the data model.
        try:
            record_pair[0]
        except:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")
        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try:
            record_pair[0].keys() and record_pair[1].keys()
        except:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        # Validate that data_sample is a non-empty sequence of record pairs.
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        # Attach a (validated) sample and build the active learner on it.
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)

    def _loadSampledRecords(self, data_sample):
        """Override to load blocking data from data_sample."""
class ActiveMatching(Matching):
    # Shared default classifier; subclasses/instances may replace it.
    classifier = rlr.RegularizedLogisticRegression()

    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           (
                            (854, {'city': 'san francisco',
                                   'address': '300 de haro st.',
                                   'name': "sally's cafe & bakery",
                                   'cuisine': 'american'}),
                            (855, {'city': 'san francisco',
                                   'address': '1328 18th st.',
                                   'name': 'san francisco bbq',
                                   'cuisine': 'thai'})
                            )
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <http://dedupe.readthedocs.org>`_

        In the data_sample, each element is a tuple of two records. Each
        record is, in turn, a tuple of the record's key and a record
        dictionary. In the record dictionary the keys are the names of the
        record field and values are the record values.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        self.data_sample = data_sample

        if self.data_sample:
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(
                self.data_sample,
                self.data_model,
                self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        # Structured array layout: a label string plus one distance per
        # data-model field for each training pair.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None

    def cleanupTraining(self):  # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            if examples:
                # Only the first pair of each label is shape-checked.
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=.1, uncovered_dupes=None, index_predicates=True,
              pud=0.025):  # pragma : no cover
        """Keyword arguments:

        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a
               fraction of possible pairs greater than the ppc, that
               predicate will be removed from consideration.

               As the size of the data increases, the user will
               generally want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our
                           training data that we can accept will not
                           be put into any block. If true duplicates
                           are never in the same block, we will never
                           compare them, and may never declare them to
                           be duplicates.

                           However, requiring that we cover every
                           single true dupe pair may mean that we have
                           to use blocks that put together many, many
                           distinct pairs that we'll have to
                           expensively compare as well.

                           uncovered_dupes is deprecated in favor of pud

        pud -- The proportion of true dupe pairs in our training data
               that we can accept will not be put into any block. If
               true duplicates are never in the same block, we will
               never compare them, and may never declare them to be
               duplicates.

               If both pud and uncovered_dupes are set,
               uncovered_dupes will take priority.

               pud should be a float between 0.0 and 1.0

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        if uncovered_dupes is None:
            # Convert the proportion pud into an absolute count of
            # allowed-uncovered labeled matches.
            uncovered_dupes = int(pud * len(self.training_pairs['match']))
        else:
            warnings.warn(
                "The uncovered_dupes argument of the train method will be deprecated in dedupe 1.4, please use the pud argument instead",
                DeprecationWarning)

        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self):  # pragma : no cover
        # Fit the classifier on the accumulated distance vectors.
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    def _trainBlocker(self, ppc, uncovered_dupes,
                      index_predicates):  # pragma : no cover
        # Learn blocking predicates. The labeled distinct pairs are
        # augmented with confident non-duplicates mined from the sample
        # before block training.
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        self.predicates = dedupe.training.blockTraining(training_pairs,
                                                        predicate_set,
                                                        ppc,
                                                        uncovered_dupes,
                                                        self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)

    def writeSettings(self, file_obj):  # pragma : no cover
        """
        Write a settings file containing the data model and predicates
        to a file object

        Keyword arguments:

        file_obj -- file object to write settings data into
        """
        pickle.dump(self.data_model, file_obj)
        pickle.dump(self.classifier, file_obj)
        pickle.dump(self.predicates, file_obj)

    def writeTraining(self, file_obj):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: seed training with one synthetic exact match and
            # one random pair treated as distinct.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({
                u'match': [exact_match, exact_match],
                u'distinct': [random_pair]
            })

        self._trainClassifier()

        # Blend the observed match proportion toward 0.5 when we have few
        # examples, so early uncertainty sampling is not dominated by a
        # lopsided label count.
        bias = len(self.training_pairs[u'match'])
        if bias:
            bias /= (bias + len(self.training_pairs[u'distinct']))

        min_examples = min(len(self.training_pairs[u'match']),
                           len(self.training_pairs[u'distinct']))

        regularizer = 10
        bias = ((0.5 * min_examples + bias * regularizer) /
                (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add a labeled pairs of record to dedupes training set and update
        the matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except:
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dictionaries
        # and that both records satisfy the data model.
        try:
            record_pair[0]
        except:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")
        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try:
            record_pair[0].keys() and record_pair[1].keys()
        except:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        # Validate that data_sample is a non-empty sequence of record pairs.
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        # Attach a (validated) sample and build the active learner on it.
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)