def main(self):

    data_d = {}
    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if self.field_definition[field]['type'] != 'Interaction':
            if field not in data_d[0]:
                raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())

    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file or "
                           "run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        deduper.train(data_sample, labeler.label)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')

    # ## Blocking
    logging.info('blocking...')

    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place it into blocks.
    # Each record can be blocked in many ways, so for larger data,
    # memory will be a limiting factor.
    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do about precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes all refer to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else:
        csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
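# csvhelpers.readData is called above but its body is not part of this fragment.
# The sketch below is a hypothetical stand-in (not csvdedupe's actual helper),
# assuming it returns the dict-of-dicts structure the old dedupe API expects:
# {record_id: {field_name: value, ...}, ...}.
import csv


def read_data_sketch(csv_path, field_names):
    # Hypothetical helper: load a CSV into {row_id: {field: value}}.
    data = {}
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        for row_id, row in enumerate(reader):
            # Keep only the fields we plan to dedupe on; values stay as strings.
            data[row_id] = {name: (row.get(name) or '').strip()
                            for name in field_names}
    return data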
print "number of known duplicate pairs", len(duplicates_s) if os.path.exists(settings_file) and 0: deduper = dedupe.Dedupe(settings_file) else: fields = {"title": {"type": "String"}} deduper = dedupe.Dedupe(fields) deduper.num_iterations = num_iterations print "Using a random sample of training pairs..." deduper._initializeTraining() deduper.training_pairs = randomTrainingPairs(data_d, duplicates_s, num_training_dupes, num_training_distinct) deduper.data_sample = dedupe.dataSample(data_d, 1000000, constrained_matching) deduper.training_data = dedupe.training.addTrainingData( deduper.training_pairs, deduper.data_model, deduper.training_data ) deduper.alpha = dedupe.crossvalidation.gridSearch( deduper.training_data, dedupe.core.trainModel, deduper.data_model, k=10 ) deduper.data_model = dedupe.core.trainModel(deduper.training_data, deduper.data_model, deduper.alpha) deduper._logLearnedWeights() print "blocking..."
          'city': {'type': 'String'}}

deduper = dedupe.Dedupe(fields)
deduper.num_iterations = num_iterations

print "Using a random sample of training pairs..."

deduper._initializeTraining()
deduper.training_pairs = randomTrainingPairs(data_d,
                                             duplicates_s,
                                             num_training_dupes,
                                             num_training_distinct)

deduper.data_sample = dedupe.dataSample(data_d, 1000000)

deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                        deduper.data_model,
                                                        deduper.training_data)

deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)

deduper._logLearnedWeights()
    return data_d


print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 150000)

    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'Site name': {'type': 'String'},
        'Address': {'type': 'String'},
        'Zip': {'type': 'Custom',
                'comparator': sameOrNotComparator,
                'Has Missing': True},
        'Phone': {'type': 'String', 'Has Missing': True},
        }

    # Create a new deduper object and pass our data model to it.
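# sameOrNotComparator is referenced in the field definition above but not shown
# in this fragment. A minimal sketch, assuming an exact-match comparator for ZIP
# codes; the return convention (0 for a match, 1 otherwise) and the way missing
# values are signalled back to dedupe are both assumptions. The learned field
# weight will adapt to either sign convention.
def same_or_not_comparator_sketch(field_1, field_2):
    # Treat empty values as missing and let the 'Has Missing' handling cover them
    # (assumption: returning None marks the comparison as missing).
    if not field_1 or not field_2:
        return None
    # Exact match gets one score, anything else the other.
    return 0 if field_1 == field_2 else 1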
if os.path.exists(settings_file):
    deduper = dedupe.Dedupe(settings_file)
else:
    fields = {'title': {'type': 'String'}}

    deduper = dedupe.Dedupe(fields)
    deduper.num_iterations = num_iterations

    print "Using a random sample of training pairs..."

    deduper._initializeTraining()
    deduper.training_pairs = randomTrainingPairs(data_d,
                                                 duplicates_s,
                                                 num_training_dupes,
                                                 num_training_distinct)

    deduper.data_sample = dedupe.dataSample(data_d, 1000000, constrained_matching)

    deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                            deduper.data_model,
                                                            deduper.training_data)

    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()
# Read in the data
input_df = pd.read_csv(input_file)
data_d = readDataFrame(input_df)

# Training
time_start = time.time()

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])

    # Define the fields dedupe will pay attention to
    fields = {'first': {'type': 'String'},
              'last': {'type': 'String'},
              'mi': {'type': 'String'}  # ,
              # 'birth_year': {'type': 'Custom', 'comparator': sameYear}
              }

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)
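# readDataFrame is called above but not defined in this fragment. A hypothetical
# sketch of what it might do: convert the pandas DataFrame into the
# {record_id: {field: value}} dictionary the old dedupe API works with, with
# values coerced to strings (pd is already imported above).
def read_data_frame_sketch(df):
    data = {}
    for idx in df.index:
        # Use the DataFrame index as the record id; NaN becomes an empty string.
        data[idx] = {col: ('' if pd.isnull(df.at[idx, col]) else str(df.at[idx, col]))
                     for col in df.columns}
    return data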
          'cuisine': {'type': 'String'},
          'city': {'type': 'String'}}

deduper = dedupe.Dedupe(fields)
deduper.num_iterations = num_iterations

print "Using a random sample of training pairs..."

deduper._initializeTraining()
deduper.training_pairs = randomTrainingPairs(data_d,
                                             duplicates_s,
                                             num_training_dupes,
                                             num_training_distinct)

deduper.data_sample = dedupe.dataSample(data_d, 1000000)

deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                        deduper.data_model,
                                                        deduper.training_data)

deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                  dedupe.core.trainModel,
                                                  deduper.data_model,
                                                  k=10)

deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                            deduper.data_model,
                                            deduper.alpha)