def __init__(self, variable_definition, data_sample=None, num_cores=None):
    """
    Initialize from a data model and data sample.

    Arguments:
    variable_definition -- description of the variables used to compare
                           records; handed to DataModel unchanged
    data_sample         -- optional sequence of record pairs. When it is
                           non-empty it is validated with
                           _checkDataSample and an ActiveLearning
                           instance is built from it; otherwise
                           data_sample is stored as [] and activeLearner
                           as None
    num_cores           -- number of cores handed to ActiveLearning;
                           None means multiprocessing.cpu_count()

    #### Example usage

        # initialize from a defined set of fields
        fields = [{'field' : 'Site name', 'type': 'String'},
                  {'field' : 'Address', 'type': 'String'},
                  {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                  {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                  ]

        data_sample = [
                       ({'city': 'san francisco',
                         'address': '300 de haro st.',
                         'name': "sally's cafe & bakery",
                         'cuisine': 'american'},
                        {'city': 'san francisco',
                         'address': '1328 18th st.',
                         'name': 'san francisco bbq',
                         'cuisine': 'thai'})
                       ]

        deduper = dedupe.Dedupe(fields, data_sample)

    #### Additional detail

    A field definition is a list of dictionaries where each dictionary
    describes a variable to use for comparing records. For details about
    variable types, check the documentation
    <http://dedupe.readthedocs.org>.

    In the data_sample, each element is a pair of records. Each record
    is a dictionary whose keys are the names of the record fields and
    whose values are the record values (the pair validation calls
    .keys() on both members).
    """
    self.data_model = DataModel(variable_definition)

    # No explicit preference means "use every core the machine reports".
    self.num_cores = (multiprocessing.cpu_count() if num_cores is None
                      else num_cores)

    self.data_sample = data_sample
    if not data_sample:
        # None and empty samples are both normalised to an empty list.
        self.data_sample = []
        self.activeLearner = None
    else:
        self._checkDataSample(data_sample)
        self.activeLearner = training.ActiveLearning(data_sample,
                                                     self.data_model,
                                                     self.num_cores)

    # One distance column per element reported by len(data_model).
    n_distances = len(self.data_model)
    self.training_data = numpy.zeros(
        0,
        dtype=[('label', 'S8'), ('distances', 'f4', (n_distances, ))])

    self.training_pairs = OrderedDict({u'distinct': [], u'match': []})
    self.blocker = None
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    # Class-level attribute: every instance uses this same classifier
    # object unless an instance or subclass rebinds `classifier`.
    classifier = rlr.RegularizedLogisticRegression()

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        Arguments:
        variable_definition -- description of the variables used to
                               compare records; handed to DataModel
                               unchanged
        data_sample         -- optional sequence of record pairs. When
                               non-empty it is validated and used to
                               build an ActiveLearning instance;
                               otherwise data_sample is stored as []
                               and activeLearner as None
        num_cores           -- number of cores handed to ActiveLearning;
                               None means multiprocessing.cpu_count()

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each
        dictionary describes a variable to use for comparing records.
        For details about variable types, check the documentation
        <http://dedupe.readthedocs.org>.

        In the data_sample, each element is a pair of records. Each
        record is a dictionary whose keys are the names of the record
        fields and whose values are the record values (see
        _checkRecordPairType).
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        self.data_sample = data_sample

        if self.data_sample:
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(
                self.data_sample, self.data_model, self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None

    def cleanupTraining(self):  # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.

        Deletes the training_data, training_pairs, activeLearner and
        data_sample attributes from this instance.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data

        Raises EmptyTrainingException when every label in the file maps
        to an empty list, and ValueError (from _checkRecordPairType)
        when the first example of a label is not a valid record pair.
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            # Only the first example of each label is validated.
            if examples:
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=.1, uncovered_dupes=None, index_predicates=True, pud=0.025):  # pragma : no cover
        """Keyword arguments:

        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a
               fraction of possible pairs greater than the ppc, that
               predicate will be removed from consideration.

               As the size of the data increases, the user will
               generally want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our
                           training data that we can accept will not
                           be put into any block. If true duplicates
                           are never in the same block, we will never
                           compare them, and may never declare them to
                           be duplicates.

                           However, requiring that we cover every
                           single true dupe pair may mean that we have
                           to use blocks that put together many, many
                           distinct pairs that we'll have to
                           expensively compare as well.

                           uncovered_dupes is deprecated in favor of
                           pud; passing it emits a DeprecationWarning.

        pud -- The proportion of true dupe pairs in our training data
               that we can accept will not be put into any block. If
               true duplicates are never in the same block, we will
               never compare them, and may never declare them to be
               duplicates.

               If both pud and uncovered_dupes are set,
               uncovered_dupes will take priority.

               pud should be a float between 0.0 and 1.0

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        if uncovered_dupes is None:
            # Convert the proportion into an absolute count of labeled
            # matches, truncating toward zero.
            uncovered_dupes = int(pud * len(self.training_pairs['match']))
        else:
            warnings.warn(
                "The uncovered_dupes argument of the train method will be deprecated in dedupe 1.4, please use the pud argument instead",
                DeprecationWarning)

        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self):  # pragma : no cover
        """
        Fit self.classifier on the accumulated training data, using
        label 1 for rows labeled b'match' and 0 for every other row.
        """
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    def _trainBlocker(self, ppc, uncovered_dupes, index_predicates):  # pragma : no cover
        """
        Learn blocking predicates and build self.blocker from them.

        Works on a deep copy of the labeled pairs so that the
        semi-supervised non-duplicates added here do not end up in
        self.training_pairs.
        """
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        self.predicates = dedupe.training.blockTraining(
            training_pairs,
            predicate_set,
            ppc,
            uncovered_dupes,
            self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)

    def writeSettings(self, file_obj):  # pragma : no cover
        """
        Write a settings file containing the data model, the
        classifier and the predicates to a file object, pickled in
        that order.

        Keyword arguments:
        file_obj -- file object to write settings data into
        """
        pickle.dump(self.data_model, file_obj)
        pickle.dump(self.classifier, file_obj)
        pickle.dump(self.predicates, file_obj)

    def writeTraining(self, file_obj):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.

        Useful for user labeling.

        Returns whatever self.activeLearner.uncertainPairs returns for
        the current classifier and the computed bias.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: with no training rows yet, seed the model with
            # a record paired with itself (as two matches) and one
            # random sample pair (as distinct).
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({u'match': [exact_match, exact_match],
                                   u'distinct': [random_pair]})

        # markPairs only appends training data, so the classifier is
        # refit here before asking the learner for new pairs.
        self._trainClassifier()

        n_match = len(self.training_pairs[u'match'])
        n_distinct = len(self.training_pairs[u'distinct'])

        bias = n_match
        if bias:
            # Explicit float so the proportion of matches is a true
            # ratio even where `/` on two ints truncates.
            bias = float(n_match) / (n_match + n_distinct)

        min_examples = min(n_match, n_distinct)

        regularizer = 10

        # Weighted blend of 0.5 and the observed match proportion: the
        # more examples the smaller class has, the closer to 0.5.
        bias = ((0.5 * min_examples + bias * regularizer)
                / (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs of records to dedupe's training set.

        The pairs are appended to self.training_pairs and to
        self.training_data; the classifier itself is not refit here.

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and
                         `distinct`; the values are lists that can
                         contain pairs of records

        Raises ValueError when labeled_pairs does not have that shape
        or when the first pair under either key is not a valid record
        pair. Warns when both lists are empty.
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except Exception:
            # `Exception` rather than a bare except so that
            # KeyboardInterrupt / SystemExit are not converted.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair under each label is validated.
        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        """
        Validate that record_pair is an indexable sequence of exactly
        two dictionary-like records accepted by the data model.

        Raises ValueError on a malformed pair; self.data_model.check
        is then called on each record.
        """
        try:
            record_pair[0]
        except Exception:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")

        try:
            # Checked one at a time: chaining the two calls with `and`
            # would skip the second record whenever the first record
            # has no keys.
            record_pair[0].keys()
            record_pair[1].keys()
        except Exception:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        """
        Validate a data sample: it must support len(), and when it is
        non-empty its first element must be a valid record pair.

        Raises ValueError for an object without a length; warns when
        the sample is empty.
        """
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.

        For every label a structured array is built holding the label
        and the distances the data model computes for the examples,
        and is appended to self.training_data.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        """
        Validate data_sample, store it, and rebuild the active learner
        from it.
        """
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)
def __init__(self, field_definition, data_sample=None, num_processes=1):
    """
    Initialize from a data model and data sample.

    Arguments:
    field_definition -- dictionary (or dict subclass) mapping field
                        names to field specifications; handed to
                        DataModel
    data_sample      -- optional sequence of record pairs. When it is
                        non-empty it is validated and used to build an
                        ActiveLearning instance; otherwise
                        activeLearner is set to None
    num_processes    -- stored on the instance as num_processes

    Raises ValueError when field_definition is not a dictionary.

    #### Example usage

        # initialize from a defined set of fields
        fields = {'Site name': {'type': 'String'},
                  'Address': {'type': 'String'},
                  'Zip': {'type': 'String', 'Has Missing':True},
                  'Phone': {'type': 'String', 'Has Missing':True},
                  }

        data_sample = [
                       ({'city': 'san francisco',
                         'address': '300 de haro st.',
                         'name': "sally's cafe & bakery",
                         'cuisine': 'american'},
                        {'city': 'san francisco',
                         'address': '1328 18th st.',
                         'name': 'san francisco bbq',
                         'cuisine': 'thai'})
                       ]

        deduper = dedupe.Dedupe(fields, data_sample)

    #### Additional detail

    A field definition is a dictionary where the keys are the fields
    that will be used for training a model and the values are the
    field specification

    Field types include

    - String

    A 'String' type field must have as its key a name of a field
    as it appears in the data dictionary and a type declaration
    ex. `{'Phone': {type: 'String'}}`

    Longer example of a field definition:

        fields = {'name': {'type': 'String'},
                  'address': {'type': 'String'},
                  'city': {'type': 'String'},
                  'cuisine': {'type': 'String'}
                  }

    In the data_sample, each element is a pair of records. Each record
    is a dictionary whose keys are the names of the record fields and
    whose values are the record values.
    """
    super(ActiveMatching, self).__init__()

    # isinstance instead of an exact-class comparison so that dict
    # subclasses (e.g. an OrderedDict) are accepted as well.
    if not isinstance(field_definition, dict):
        raise ValueError('Incorrect Input Type: must supply '
                         'a field definition.')

    self.data_model = DataModel(field_definition)

    self.data_sample = data_sample

    if self.data_sample:
        self._checkDataSample(self.data_sample)
        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model)
    else:
        self.activeLearner = None

    self.num_processes = num_processes

    # One distance column per entry of data_model['fields'].
    training_dtype = [('label', 'S8'),
                      ('distances', 'f4',
                       (len(self.data_model['fields']), ))]

    self.training_data = numpy.zeros(0, dtype=training_dtype)
    self.training_pairs = dedupe.backport.OrderedDict({'distinct': [],
                                                       'match': []})
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    """

    def __init__(self, field_definition, data_sample=None, num_processes=1):
        """
        Initialize from a data model and data sample.

        Arguments:
        field_definition -- dictionary (or dict subclass) mapping field
                            names to field specifications; handed to
                            DataModel
        data_sample      -- optional sequence of record pairs. When it
                            is non-empty it is validated and used to
                            build an ActiveLearning instance; otherwise
                            activeLearner is set to None
        num_processes    -- stored on the instance as num_processes

        Raises ValueError when field_definition is not a dictionary.

        #### Example usage

            # initialize from a defined set of fields
            fields = {'Site name': {'type': 'String'},
                      'Address': {'type': 'String'},
                      'Zip': {'type': 'String', 'Has Missing':True},
                      'Phone': {'type': 'String', 'Has Missing':True},
                      }

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a dictionary where the keys are the
        fields that will be used for training a model and the values
        are the field specification

        Field types include

        - String

        A 'String' type field must have as its key a name of a field
        as it appears in the data dictionary and a type declaration
        ex. `{'Phone': {type: 'String'}}`

        Longer example of a field definition:

            fields = {'name': {'type': 'String'},
                      'address': {'type': 'String'},
                      'city': {'type': 'String'},
                      'cuisine': {'type': 'String'}
                      }

        In the data_sample, each element is a pair of records. Each
        record is a dictionary whose keys are the names of the record
        fields and whose values are the record values (see
        _checkRecordPairType).
        """
        super(ActiveMatching, self).__init__()

        # isinstance instead of an exact-class comparison so that dict
        # subclasses (e.g. an OrderedDict) are accepted as well.
        if not isinstance(field_definition, dict):
            raise ValueError('Incorrect Input Type: must supply '
                             'a field definition.')

        self.data_model = DataModel(field_definition)

        self.data_sample = data_sample

        if self.data_sample:
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample,
                                                         self.data_model)
        else:
            self.activeLearner = None

        self.num_processes = num_processes

        training_dtype = [('label', 'S8'),
                          ('distances', 'f4',
                           (len(self.data_model['fields']), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = dedupe.backport.OrderedDict({'distinct': [],
                                                           'match': []})

    def readTraining(self, training_source):  # pragma : no cover
        '''
        Read training from previously saved training data file

        Arguments:

        training_source -- the path of the training data file

        The loaded pairs are added to the training set and the
        classifier is refit.
        '''
        LOGGER.info('reading training from file')

        with open(training_source, 'r') as f:
            training_pairs = json.load(f, cls=serializer.dedupe_decoder)

        for (label, examples) in training_pairs.items():
            # Only the first example of each label is validated.
            if examples:
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=1, uncovered_dupes=1):
        """
        Keyword arguments:

        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a
               fraction of possible pairs greater than the ppc, that
               predicate will be removed from consideration.

               As the size of the data increases, the user will
               generally want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our
                           training data that we can accept will not
                           be put into any block. If true duplicates
                           are never in the same block, we will never
                           compare them, and may never declare them to
                           be duplicates.

                           However, requiring that we cover every
                           single true dupe pair may mean that we have
                           to use blocks that put together many, many
                           distinct pairs that we'll have to
                           expensively compare as well.
        """
        # A third of the labeled matches, clamped to the range [2, 20].
        # Floor division keeps the fold count integral regardless of
        # how `/` behaves for integers.
        n_folds = min(numpy.sum(self.training_data['label'] == 'match') // 3,
                      20)
        n_folds = max(n_folds, 2)

        LOGGER.info('%d folds', n_folds)

        alpha = crossvalidation.gridSearch(self.training_data,
                                           core.trainModel,
                                           self.data_model,
                                           k=n_folds)

        self._trainClassifier(alpha)
        self._trainBlocker(ppc, uncovered_dupes)

    def _trainClassifier(self, alpha=.1):  # pragma : no cover
        """
        Replace self.data_model with the model core.trainModel returns
        for the current training data and alpha, then log the weights.
        """
        self.data_model = core.trainModel(self.training_data,
                                          self.data_model,
                                          alpha)

        self._logLearnedWeights()

    def _trainBlocker(self, ppc=1, uncovered_dupes=1):
        """
        Learn blocking predicates and stop words, and build
        self.blocker from them.

        Works on a deep copy of the labeled pairs so that the
        semi-supervised non-duplicates added here do not end up in
        self.training_pairs.
        """
        training_pairs = copy.deepcopy(self.training_pairs)

        blocker_types = self._blockerTypes()

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            sample_size=32000)

        training_pairs['distinct'].extend(confident_nonduplicates)

        predicate_set = blocking.predicateGenerator(blocker_types,
                                                    self.data_model)

        (self.predicates,
         self.stop_words) = dedupe.blocking.blockTraining(training_pairs,
                                                          predicate_set,
                                                          ppc,
                                                          uncovered_dupes,
                                                          self._linkage_type)

        self.blocker = self._Blocker(self.predicates,
                                     self.stop_words)

    def _blockerTypes(self):  # pragma : no cover
        """
        Return the candidate predicates per field type: for 'String',
        the simple string predicates followed by TF-IDF predicates at
        thresholds 0.2, 0.4, 0.6 and 0.8.
        """
        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        tfidf_string_predicates = tuple([tfidf.TfidfPredicate(threshold)
                                         for threshold
                                         in [0.2, 0.4, 0.6, 0.8]])

        return {'String': (string_predicates
                           + tfidf_string_predicates)}

    def writeSettings(self, file_name):  # pragma : no cover
        """
        Write a settings file that contains the data model, the
        predicates and the stop words, pickled in that order.

        Keyword arguments:
        file_name -- path to file
        """
        # Pickle data is a byte stream, so the file is opened in
        # binary mode.
        with open(file_name, 'wb') as f:
            pickle.dump(self.data_model, f)
            pickle.dump(self.predicates, f)
            pickle.dump(self.stop_words, f)

    def writeTraining(self, file_name):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_name -- path to a json file
        """
        with open(file_name, 'wb') as f:
            json.dump(self.training_pairs,
                      f,
                      default=serializer._to_json)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.

        Useful for user labeling.

        Returns whatever self.activeLearner.uncertainPairs returns for
        the current data model and the computed dupe ratio.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: with no training rows yet, seed the model with
            # a sampled record paired with itself. randint includes
            # both endpoints, so the upper bound is the last valid
            # index.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({'match': [exact_match, exact_match],
                                   'distinct': []})

            self._trainClassifier(alpha=0.1)

        # The + 1.0 avoids division by zero and forces a float ratio.
        dupe_ratio = (len(self.training_pairs['match'])
                      / (len(self.training_pairs['distinct']) + 1.0))

        return self.activeLearner.uncertainPairs(self.data_model, dupe_ratio)

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs of records to dedupe's training set and
        update the matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and
                         `distinct`; the values are lists that can
                         contain pairs of records

        Raises ValueError when labeled_pairs does not have that shape
        or when the first pair under either key is not a valid record
        pair. Warns when both lists are empty.
        '''
        try:
            labeled_pairs.items()
            labeled_pairs['match']
            labeled_pairs['distinct']
        except Exception:
            # `Exception` rather than a bare except so that
            # KeyboardInterrupt / SystemExit are not converted.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair under each label is validated.
        if labeled_pairs['match']:
            pair = labeled_pairs['match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs['distinct']:
            pair = labeled_pairs['distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs['distinct'] and not labeled_pairs['match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

        self._trainClassifier(alpha=.1)

    def _checkRecordPairType(self, record_pair):
        """
        Validate that record_pair is an indexable sequence of exactly
        two dictionary-like records.

        Raises ValueError on a malformed pair; self._checkRecordType
        is then called on each record.
        """
        try:
            record_pair[0]
        except Exception:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")

        try:
            # Checked one at a time: chaining the two calls with `and`
            # would skip the second record whenever the first record
            # has no keys.
            record_pair[0].keys()
            record_pair[1].keys()
        except Exception:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self._checkRecordType(record_pair[0])
        self._checkRecordType(record_pair[1])

    def _checkDataSample(self, data_sample):
        """
        Validate a data sample: it must support len(), and when it is
        non-empty its first element must be a valid record pair.

        Raises ValueError for an object without a length; warns when
        the sample is empty.
        """
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.

        For every label a structured array is built holding the label
        and the field distances of the examples, and is appended to
        self.training_data.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = core.fieldDistances(examples,
                                                        self.data_model)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _logLearnedWeights(self):  # pragma: no cover
        """
        Log learned weights and bias terms
        """
        LOGGER.info('Learned Weights')
        for (key_1, value_1) in self.data_model.items():
            try:
                for (key_2, value_2) in value_1.items():
                    LOGGER.info((key_2, value_2['weight']))
            except AttributeError:
                # Entries without .items() are logged as they are.
                LOGGER.info((key_1, value_1))

    def _loadSample(self, *args, **kwargs):  # pragma : no cover
        """
        Draw a sample with self._sample(*args, **kwargs), validate it,
        store it, and rebuild the active learner from it.
        """
        data_sample = self._sample(*args, **kwargs)

        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model)
def __init__(self, variable_definition, data_sample=None, num_cores=None):
    """
    Initialize from a data model and data sample.

    Arguments:
    variable_definition -- description of the variables used to compare
                           records; passed to DataModel as given
    data_sample         -- optional sequence of record pairs. A
                           non-empty sample is validated with
                           _checkDataSample and wrapped in an
                           ActiveLearning instance; anything falsy is
                           replaced by [] and leaves activeLearner as
                           None
    num_cores           -- number of cores passed to ActiveLearning;
                           when None, multiprocessing.cpu_count() is
                           used

    #### Example usage

        # initialize from a defined set of fields
        fields = [{'field' : 'Site name', 'type': 'String'},
                  {'field' : 'Address', 'type': 'String'},
                  {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                  {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                  ]

        data_sample = [
                       ({'city': 'san francisco',
                         'address': '300 de haro st.',
                         'name': "sally's cafe & bakery",
                         'cuisine': 'american'},
                        {'city': 'san francisco',
                         'address': '1328 18th st.',
                         'name': 'san francisco bbq',
                         'cuisine': 'thai'})
                       ]

        deduper = dedupe.Dedupe(fields, data_sample)

    #### Additional detail

    A field definition is a list of dictionaries where each dictionary
    describes a variable to use for comparing records. For details about
    variable types, check the documentation
    <http://dedupe.readthedocs.org>.

    In the data_sample, each element is a pair of records. Each record
    is a dictionary whose keys are the names of the record fields and
    whose values are the record values (the pair validation calls
    .keys() on both members).
    """
    self.data_model = DataModel(variable_definition)

    # Default to the machine's reported core count.
    cores = num_cores
    if cores is None:
        cores = multiprocessing.cpu_count()
    self.num_cores = cores

    self.data_sample = data_sample
    if not data_sample:
        # Nothing to learn from yet: keep an empty list as placeholder.
        self.data_sample = []
        self.activeLearner = None
    else:
        self._checkDataSample(data_sample)
        learner = training.ActiveLearning(data_sample,
                                          self.data_model,
                                          cores)
        self.activeLearner = learner

    # Structured dtype: a label plus one float per data-model entry.
    distance_shape = (len(self.data_model), )
    label_column = ('label', 'S8')
    distance_column = ('distances', 'f4', distance_shape)
    self.training_data = numpy.zeros(0,
                                     dtype=[label_column, distance_column])

    self.training_pairs = OrderedDict({u'distinct': [], u'match': []})
    self.blocker = None
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    # Class-level attribute: every instance uses this same classifier
    # object unless an instance or subclass rebinds `classifier`.
    classifier = rlr.RegularizedLogisticRegression()

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        Arguments:
        variable_definition -- description of the variables used to
                               compare records; handed to DataModel
                               unchanged
        data_sample         -- optional sequence of record pairs. When
                               non-empty it is validated and used to
                               build an ActiveLearning instance;
                               otherwise data_sample is stored as []
                               and activeLearner as None
        num_cores           -- number of cores handed to ActiveLearning;
                               None means multiprocessing.cpu_count()

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each
        dictionary describes a variable to use for comparing records.
        For details about variable types, check the documentation
        <http://dedupe.readthedocs.org>.

        In the data_sample, each element is a pair of records. Each
        record is a dictionary whose keys are the names of the record
        fields and whose values are the record values (see
        _checkRecordPairType).
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        self.data_sample = data_sample

        if self.data_sample:
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample,
                                                         self.data_model,
                                                         self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None

    def cleanupTraining(self):  # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.

        Deletes the training_data, training_pairs, activeLearner and
        data_sample attributes from this instance.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data

        Raises EmptyTrainingException when every label in the file maps
        to an empty list, and ValueError (from _checkRecordPairType)
        when the first example of a label is not a valid record pair.
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException("The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            # Only the first example of each label is validated.
            if examples:
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=.1, uncovered_dupes=1, index_predicates=True):  # pragma : no cover
        """Keyword arguments:

        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a
               fraction of possible pairs greater than the ppc, that
               predicate will be removed from consideration.

               As the size of the data increases, the user will
               generally want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our
                           training data that we can accept will not
                           be put into any block. If true duplicates
                           are never in the same block, we will never
                           compare them, and may never declare them to
                           be duplicates.

                           However, requiring that we cover every
                           single true dupe pair may mean that we have
                           to use blocks that put together many, many
                           distinct pairs that we'll have to
                           expensively compare as well.

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self):  # pragma : no cover
        """
        Fit self.classifier on the accumulated training data, using
        label 1 for rows labeled b'match' and 0 for every other row.
        """
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    def _trainBlocker(self, ppc=1, uncovered_dupes=1, index_predicates=True):  # pragma : no cover
        """
        Learn blocking predicates and stop words, and build
        self.blocker from them.

        Works on a deep copy of the labeled pairs so that the
        semi-supervised non-duplicates added here do not end up in
        self.training_pairs.
        """
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        (self.predicates,
         self.stop_words) = dedupe.training.blockTraining(training_pairs,
                                                          predicate_set,
                                                          ppc,
                                                          uncovered_dupes,
                                                          self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates,
                                        self.stop_words)

    def writeSettings(self, file_obj):  # pragma : no cover
        """
        Write a settings file containing the data model, the
        classifier, the predicates and the stop words (converted to a
        plain dict) to a file object, pickled in that order.

        Keyword arguments:
        file_obj -- file object to write settings data into
        """
        pickle.dump(self.data_model, file_obj)
        pickle.dump(self.classifier, file_obj)
        pickle.dump(self.predicates, file_obj)
        pickle.dump(dict(self.stop_words), file_obj)

    def writeTraining(self, file_obj):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.

        Useful for user labeling.

        Returns whatever self.activeLearner.uncertainPairs returns for
        the current classifier and the computed bias.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: with no training rows yet, seed the model with
            # a record paired with itself (as two matches) and one
            # random sample pair (as distinct).
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({u'match': [exact_match, exact_match],
                                   u'distinct': [random_pair]})

        # markPairs only appends training data, so the classifier is
        # refit here before asking the learner for new pairs.
        self._trainClassifier()

        n_match = len(self.training_pairs[u'match'])
        n_distinct = len(self.training_pairs[u'distinct'])

        bias = n_match
        if bias:
            # Explicit float so the proportion of matches is a true
            # ratio even where `/` on two ints truncates.
            bias = float(n_match) / (n_match + n_distinct)

        min_examples = min(n_match, n_distinct)

        regularizer = 10

        # Weighted blend of 0.5 and the observed match proportion: the
        # more examples the smaller class has, the closer to 0.5.
        bias = ((0.5 * min_examples + bias * regularizer)
                / (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs of records to dedupe's training set.

        The pairs are appended to self.training_pairs and to
        self.training_data; the classifier itself is not refit here.

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and
                         `distinct`; the values are lists that can
                         contain pairs of records

        Raises ValueError when labeled_pairs does not have that shape
        or when the first pair under either key is not a valid record
        pair. Warns when both lists are empty.
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except Exception:
            # `Exception` rather than a bare except so that
            # KeyboardInterrupt / SystemExit are not converted.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair under each label is validated.
        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        """
        Validate that record_pair is an indexable sequence of exactly
        two dictionary-like records accepted by the data model.

        Raises ValueError on a malformed pair; self.data_model.check
        is then called on each record.
        """
        try:
            record_pair[0]
        except Exception:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")

        try:
            # Checked one at a time: chaining the two calls with `and`
            # would skip the second record whenever the first record
            # has no keys.
            record_pair[0].keys()
            record_pair[1].keys()
        except Exception:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        """
        Validate a data sample: it must support len(), and when it is
        non-empty its first element must be a valid record pair.

        Raises ValueError for an object without a length; warns when
        the sample is empty.
        """
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.

        For every label a structured array is built holding the label
        and the distances the data model computes for the examples,
        and is appended to self.training_data.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        """
        Validate data_sample, store it, and rebuild the active learner
        from it.
        """
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)