deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                        deduper.data_model,
                                                        deduper.training_data)

deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)

deduper._logLearnedWeights()

print "blocking..."
blocker = deduper.blockingFunction(constrained_matching,
                                   ppc=0.0001,
                                   uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker, constrained_matching))

alpha = deduper.goodThreshold(blocked_data, constrained_matching)

# print candidates
print "clustering..."
clustered_dupes = deduper.duplicateClusters(blocked_data,
                                            data_d,
                                            constrained_matching,
                                            threshold=alpha)

deduper.writeSettings(settings_file)

print "Evaluate Scoring"
found_dupes = set([frozenset(pair)
                   for (pair, score) in deduper.dupes
                   if score > alpha])

evaluateDuplicates(found_dupes, duplicates_s)
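# `evaluateDuplicates` and `duplicates_s` are used above but not defined in this
# snippet. Below is a minimal sketch of what such a helper could look like,
# assuming `duplicates_s` is a set of frozenset record-ID pairs known to be true
# duplicates. The helper name matches the call above, but this body is an
# illustration, not the original implementation.

def evaluateDuplicates(found_dupes, true_dupes):
    # pairs we predicted that really are duplicates
    true_positives = found_dupes.intersection(true_dupes)
    # pairs we predicted that are not in the gold standard
    false_positives = found_dupes.difference(true_dupes)

    print "found duplicates"
    print len(found_dupes)
    print "precision"
    print 1 - len(false_positives) / float(len(found_dupes))
    print "recall"
    print len(true_positives) / float(len(true_dupes))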
print 'blocking...'

# Initialize our blocker. We'll learn our blocking rules if we haven't
# loaded them from a saved settings file.
blocker = deduper.blockingFunction()

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning the next time we
# run this file.
deduper.writeSettings(settings_file)

# Load all the original data into memory and place it into blocks.
# Each record can be blocked in many ways, so for larger data,
# memory will be a limiting factor.
blocked_data = dedupe.blockData(data_d, blocker)

# ## Clustering

# Find the threshold that will maximize a weighted average of our
# precision and recall. When we set the recall weight to 2, we are
# saying we care twice as much about recall as we do about precision.
#
# If we had more data, we would not pass all of the blocked data into
# this function, but only a representative sample.
threshold = deduper.goodThreshold(blocked_data, recall_weight=2)

# `duplicateClusters` will return sets of record IDs that dedupe
# believes are all referring to the same entity.
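# A minimal sketch of consuming that output, assuming `clustered_dupes` is an
# iterable of sets of record IDs. The `cluster_membership` mapping and the way a
# cluster ID is assigned here are illustrative choices, not part of the library.

clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

cluster_membership = {}
for cluster_id, cluster in enumerate(clustered_dupes):
    # every record in the cluster gets the same cluster ID
    for record_id in cluster:
        cluster_membership[record_id] = cluster_id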
def main(self):

    data_d = {}
    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if self.field_definition[field]['type'] != 'Interaction':
            if field not in data_d[0]:
                raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file "
                           "or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        deduper.train(data_sample, labeler.label)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')

    # ## Blocking
    logging.info('blocking...')

    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place it into blocks.
    # Each record can be blocked in many ways, so for larger data,
    # memory will be a limiting factor.
    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. For example, when we set the recall weight to 2,
    # we are saying we care twice as much about recall as we do about precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data,
                                      recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else:
        csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
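# `main` expects `self.field_definition` to map each field name to a dict with at
# least a 'type' key: that is what both the sanity check and
# `dedupe.Dedupe(self.field_definition)` rely on. A minimal illustrative
# configuration follows; the field names are assumptions, not part of the script.
# 'Interaction' fields are skipped by the sanity check because they do not
# correspond to a CSV column.

field_definition = {
    'name':    {'type': 'String'},
    'address': {'type': 'String'},
    'phone':   {'type': 'String'},
}

deduper = dedupe.Dedupe(field_definition)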
deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                        deduper.data_model,
                                                        deduper.training_data)

deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)

deduper._logLearnedWeights()

print 'blocking...'
blocker = deduper.blockingFunction(ppc=1, uncovered_dupes=1)
blocked_data = tuple(dedupe.blockData(data_d, blocker))

alpha = deduper.goodThreshold(blocked_data)

# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data, threshold=alpha)

deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set([frozenset(pair)
                   for (pair, score) in deduper.dupes
                   if score > alpha])

evaluateDuplicates(found_dupes, duplicates_s)
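# `duplicates_s`, the gold-standard set of true duplicate pairs used in the
# evaluation above, is not built in these snippets. A minimal sketch, assuming
# each record in `data_d` carries a `unique_id` value identifying the underlying
# entity (the column name and grouping logic are assumptions for illustration):

import itertools
from collections import defaultdict

entity_members = defaultdict(set)
for record_id, record in data_d.items():
    entity_members[record['unique_id']].add(record_id)

duplicates_s = set()
for members in entity_members.values():
    # every pair of records sharing a unique_id is a true duplicate pair
    for pair in itertools.combinations(members, 2):
        duplicates_s.add(frozenset(pair))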
deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)

deduper._logLearnedWeights()

print 'blocking...'
blocker = deduper.blockingFunction(constrained_matching,
                                   ppc=.0001,
                                   uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker, constrained_matching))

alpha = deduper.goodThreshold(blocked_data, constrained_matching)

# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data,
                                            data_d,
                                            constrained_matching,
                                            threshold=alpha)

deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set([frozenset(pair)
                   for (pair, score) in deduper.dupes
                   if score > alpha])
deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)

deduper._logLearnedWeights()

print 'blocking...'
blocker = deduper.blockingFunction(ppc=.0001, uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker))

alpha = deduper.goodThreshold(blocked_data)

# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data,
                                            data_d,
                                            threshold=alpha)

deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set([frozenset(pair)
                   for (pair, score) in deduper.dupes
                   if score > alpha])