def main(self):

    data_d = {}
    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if self.field_definition[field]['type'] != 'Interaction':
            if field not in data_d[0]:
                raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())

    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')

        deduper.train(data_sample, labeler.label)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')

    # ## Blocking
    logging.info('blocking...')

    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place it into blocks.
    # Each record can be blocked in many ways, so for larger data,
    # memory will be a limiting factor.
    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes all refer to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else:
        csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
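
# Illustrative sketch, not part of the original script: the sanity check above
# indexes `self.field_definition` like a dict and reads a 'type' key, so this
# version targets the old dict-style dedupe data model. A hypothetical
# definition (the column names below are invented for illustration) would
# look like this, and would be passed to dedupe.Dedupe() exactly as above.

example_field_definition = {
    'Site name': {'type': 'String'},
    'Address': {'type': 'String'},
    'Zip': {'type': 'String'},
}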
def main(self):

    data_1 = {}
    data_2 = {}
    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1, prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2, prefix='input_2')

    # sanity check for provided field names in CSV files
    for field in self.field_names_1:
        if field not in next(iter(data_1.values())):
            raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2:
        if field not in next(iter(data_2.values())):
            raise parser.error("Could not find field '" + field + "' in input")

    # if the two files use different column names, remap file 2's fields
    # to file 1's names so both sides share one schema
    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1, self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field'] for field in self.field_definition])

    # # Create a new linker object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        with open(self.training_file) as tf:
            deduper.readTraining(tf)
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')

        csvhelpers.consoleLabel(deduper)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        with open(self.training_file, 'w') as tf:
            deduper.writeTraining(tf)
    else:
        logging.info('skipping the training step')

    deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return sets of record IDs that dedupe
    # believes all refer to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
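
# Illustrative sketch, not part of the original script: the remapping loop
# above renames file 2's columns to file 1's names so both inputs share one
# schema before linking. The same idea as a tiny standalone helper; the
# function and column names here are invented for illustration.

def remap_fields(record, old_fields, new_fields):
    """Return a copy of `record` with `old_fields` renamed to `new_fields`."""
    return {new: record[old] for new, old in zip(new_fields, old_fields)}

row = {'org': 'ACME Ltd', 'street': '1 Main St'}
print(remap_fields(row, ['org', 'street'], ['Site name', 'Address']))
# {'Site name': 'ACME Ltd', 'Address': '1 Main St'}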
def main(self):

    data_d = {}
    # import the specified CSV file
    timer.elapsed('Starting file read: ')
    data_d = csvhelpers.readData(self.input, self.field_names)
    timer.elapsed('Finished file read: ')

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if field['type'] != 'Interaction':
            if field['field'] not in data_d[0]:
                raise self.parser.error("Could not find field '" +
                                        field['field'] + "' in input")

    logging.info('using fields: %s' % [field['field']
                                       for field in self.field_definition])

    # If --skip_training has been selected, and we have a settings cache still
    # persisting from the last run, use it in this next run.
    # __Note:__ if you want to add more training data, don't use --skip_training
    if self.skip_training and os.path.exists(self.settings_file):

        # Load our deduper from the last training session cache.
        logging.info('reading from previous training cache %s' % self.settings_file)
        with open(self.settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        unique_d, parents = exact_matches(data_d, fields)

    else:
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        unique_d, parents = exact_matches(data_d, fields)

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        deduper.sample(unique_d, self.sample_size)

        # Perform standard training procedures
        self.dedupe_training(deduper)

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(unique_d, recall_weight=self.recall_weight)

    # `match` will return sets of record IDs that dedupe
    # believes all refer to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(unique_d, threshold)

    # expand each cluster to include the exact duplicates that were collapsed
    # before matching, reusing the representative record's score
    expanded_clustered_dupes = []
    for cluster, scores in clustered_dupes:
        new_cluster = list(cluster)
        new_scores = list(scores)
        for row_id, score in zip(cluster, scores):
            children = parents.get(row_id, [])
            new_cluster.extend(children)
            new_scores.extend([score] * len(children))
        expanded_clustered_dupes.append((new_cluster, new_scores))

    clustered_dupes = expanded_clustered_dupes

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeResults
    # write out our results
    if self.destructive:
        write_function = csvhelpers.writeUniqueResults

    if self.output_file:
        with open(self.output_file, 'w', encoding='utf-8') as output_file:
            write_function(clustered_dupes, self.input, output_file)
    else:
        if sys.version < '3':
            out = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
            write_function(clustered_dupes, self.input, out)
        else:
            write_function(clustered_dupes, self.input, sys.stdout)
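
# Illustrative sketch, not part of the original listing: `exact_matches` is
# called above but defined elsewhere. Judging from how its return values are
# used (dedupe runs only on `unique_d`, and `parents` is consulted when
# clusters are expanded back out), a minimal version could collapse records
# that agree exactly on every matching field and remember which rows each
# survivor represents. This body is an assumption, not the project's code,
# so it is named `exact_matches_sketch` here.

def exact_matches_sketch(data_d, match_fields):
    seen = {}       # tuple of field values -> representative record id
    parents = {}    # representative id -> ids of its exact duplicates
    unique_d = {}
    for record_id, record in data_d.items():
        key = tuple(record[field] for field in match_fields)
        if key in seen:
            parents[seen[key]].append(record_id)
        else:
            seen[key] = record_id
            parents[record_id] = []
            unique_d[record_id] = record
    return unique_d, parents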
def main(self):

    data_1 = {}
    data_2 = {}
    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1, prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2, prefix='input_2')

    # sanity check for provided field names in CSV files
    for field in self.field_names_1:
        if field not in next(iter(data_1.values())):
            raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2:
        if field not in next(iter(data_2.values())):
            raise parser.error("Could not find field '" + field + "' in input")

    # if the two files use different column names, remap file 2's fields
    # to file 1's names so both sides share one schema
    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1, self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())

    # # Create a new linker object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.readTraining(self.training_file)
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')

        dedupe.consoleLabel(deduper)

        deduper.train()

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')
        deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return sets of record IDs that dedupe
    # believes all refer to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
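
# Illustrative sketch, not part of the original scripts: every version above
# picks its threshold with `recall_weight=2`, i.e. "care twice as much about
# recall as precision". A weighted F-measure is the standard way to express
# that trade-off; the helper below illustrates the concept only and is not
# dedupe's internal implementation.

def weighted_f_score(precision, recall, recall_weight=2.0):
    """F-beta score with beta = recall_weight; beta > 1 favours recall."""
    beta_sq = recall_weight ** 2
    if precision == 0 and recall == 0:
        return 0.0
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

# Example: with recall_weight=2, a high-recall threshold beats a high-precision one
print(weighted_f_score(0.9, 0.6))  # ~0.64
print(weighted_f_score(0.7, 0.8))  # ~0.78 -> preferred when recall matters more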