Code Example #1
    deduper.training_data = dedupe.training.addTrainingData(
        deduper.training_pairs, deduper.data_model, deduper.training_data
    )

    deduper.alpha = dedupe.crossvalidation.gridSearch(
        deduper.training_data, dedupe.core.trainModel, deduper.data_model, k=10
    )

    deduper.data_model = dedupe.core.trainModel(deduper.training_data, deduper.data_model, deduper.alpha)

    deduper._logLearnedWeights()


print "blocking..."
blocker = deduper.blockingFunction(constrained_matching, ppc=0.0001, uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker, constrained_matching))

alpha = deduper.goodThreshold(blocked_data, constrained_matching)


# print candidates
print "clustering..."
clustered_dupes = deduper.duplicateClusters(blocked_data, data_d, constrained_matching, threshold=alpha)


deduper.writeSettings(settings_file)

print "Evaluate Scoring"
found_dupes = set([frozenset(pair) for (pair, score) in deduper.dupes if score > alpha])

evaluateDuplicates(found_dupes, duplicates_s)
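Note: evaluateDuplicates and duplicates_s are not defined anywhere in this snippet. A minimal sketch of what such a helper might look like, assuming duplicates_s is a set of frozensets holding the true duplicate record-ID pairs (the names and shapes here are assumptions, not taken from the original file):

def evaluateDuplicates(found_dupes, true_dupes):
    # Compare predicted duplicate pairs against a gold-standard set and
    # report simple precision/recall figures. Hypothetical helper.
    true_positives = found_dupes & true_dupes

    print('found duplicate pairs: %d' % len(found_dupes))
    if found_dupes:
        print('precision: %.3f' % (len(true_positives) / float(len(found_dupes))))
    if true_dupes:
        print('recall: %.3f' % (len(true_positives) / float(len(true_dupes))))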
Code Example #2
File: csv_example.py  Project: BrianSipple/dedupe
print 'blocking...'
# Initialize our blocker. We'll learn our blocking rules if we haven't
# loaded them from a saved settings file.
blocker = deduper.blockingFunction()

# Save our weights and predicates to disk.  If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
deduper.writeSettings(settings_file)

# Load all the original data into memory and place
# them into blocks. Each record can be blocked in many ways, so for
# larger data, memory will be a limiting factor.

blocked_data = dedupe.blockData(data_d, blocker)

# ## Clustering

# Find the threshold that will maximize a weighted average of our precision and recall. 
# When we set the recall weight to 2, we are saying we care twice as much
# about recall as we do precision.
#
# If we had more data, we would not pass all the blocked data into
# this function, but only a representative sample.

threshold = deduper.goodThreshold(blocked_data, recall_weight=2)

# `duplicateClusters` will return sets of record IDs that dedupe
# believes are all referring to the same entity.
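The comment above says that recall_weight=2 means recall counts twice as much as precision. The exact scoring used inside goodThreshold is not shown in this snippet; one standard way to express that trade-off is an F-beta style score with beta equal to the recall weight, sketched below purely for illustration (this is not dedupe's internal formula):

def weighted_f_score(precision, recall, recall_weight=2):
    # F-beta style combination: recall counts `recall_weight` times as
    # much as precision. Illustration only, not dedupe's internal formula.
    beta_sq = recall_weight ** 2
    if precision == 0 and recall == 0:
        return 0.0
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

# Example: a threshold giving precision 0.90 / recall 0.60 scores lower than
# one giving precision 0.75 / recall 0.80 when recall is weighted twice.
print(weighted_f_score(0.90, 0.60))   # ~0.64
print(weighted_f_score(0.75, 0.80))   # ~0.79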
Code Example #3
File: csvdedupe.py  Project: PereiraM/csvdedupe
  def main(self) :

    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition :
      if self.field_definition[field]['type'] != 'Interaction' :
        if field not in data_d[0]:
          raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      deduper.train(data_sample, labeler.label)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')

    # ## Blocking

    logging.info('blocking...')
    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place
    # them into blocks. Each record can be blocked in many ways, so for
    # larger data, memory will be a limiting factor.

    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else :
      csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
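The sanity check in main() indexes data_d[0], which implies that csvhelpers.readData returns a dict keyed by row ID whose values are dicts mapping field names to cleaned values. That shape is inferred from this snippet rather than documented here; a toy stand-in might look like this (read_data_sketch is a hypothetical name, not the csvhelpers API):

import csv

def read_data_sketch(filename, field_names):
    # Hypothetical stand-in for csvhelpers.readData, showing only the
    # expected shape of data_d: {row_id: {field_name: value, ...}, ...}
    data = {}
    with open(filename) as f:
        for row_id, row in enumerate(csv.DictReader(f)):
            data[row_id] = dict((name, row.get(name, '').strip().lower())
                                for name in field_names)
    return data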
Code Example #4
    deduper.training_data = dedupe.training.addTrainingData(
        deduper.training_pairs, deduper.data_model, deduper.training_data)

    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()

print 'blocking...'
blocker = deduper.blockingFunction(ppc=1, uncovered_dupes=1)
blocked_data = tuple(dedupe.blockData(data_d, blocker))

alpha = deduper.goodThreshold(blocked_data)

# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data, threshold=alpha)

deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set(
    [frozenset(pair) for (pair, score) in deduper.dupes if score > alpha])

evaluateDuplicates(found_dupes, duplicates_s)
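Examples #1, #4, #6 and #7 all begin inside the same training step: dedupe.crossvalidation.gridSearch picks the regularization strength alpha by k-fold cross-validation before the final trainModel call. The sketch below illustrates that general idea only; it is not dedupe's implementation, and train and score are caller-supplied placeholders.

def grid_search_sketch(training_data, candidate_alphas, train, score, k=10):
    # Generic k-fold grid search over a regularization strength:
    # for each candidate alpha, train on k-1 folds, score on the held-out
    # fold, and keep the alpha with the best average score.
    folds = [training_data[i::k] for i in range(k)]
    best_alpha, best_score = None, float('-inf')
    for alpha in candidate_alphas:
        fold_scores = []
        for i in range(k):
            held_out = folds[i]
            train_set = [ex for j, fold in enumerate(folds) if j != i for ex in fold]
            fold_scores.append(score(train(train_set, alpha), held_out))
        average = sum(fold_scores) / float(k)
        if average > best_score:
            best_alpha, best_score = alpha, average
    return best_alpha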
Code Example #5
File: csvdedupe.py  Project: jeremyjbowers/csvdedupe
    def main(self):

        # import the specified CSV file
        data_d = csvhelpers.readData(self.input, self.field_names)

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if self.field_definition[field]['type'] != 'Interaction':
                if field not in data_d[0]:
                    raise parser.error("Could not find field '" + field +
                                       "' in input")

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        data_sample = dedupe.dataSample(data_d, self.sample_size)

        logging.info('using fields: %s' % self.field_definition.keys())
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file

        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            deduper.train(data_sample, str(self.training_file))
        elif self.skip_training:
            raise parser.error(
                "You need to provide an existing training_file or run this script without --skip_training"
            )

        if not self.skip_training:
            logging.info('starting active labeling...')

            deduper.train(data_sample, labeler.label)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            deduper.writeTraining(self.training_file)
        else:
            logging.info('skipping the training step')

        # ## Blocking

        logging.info('blocking...')
        # Initialize our blocker. We'll learn our blocking rules if we haven't
        # loaded them from a saved settings file.
        blocker = deduper.blockingFunction()

        # Load all the original data into memory and place
        # them into blocks. Each record can be blocked in many ways, so for
        # larger data, memory will be a limiting factor.

        blocked_data = dedupe.blockData(data_d, blocker)

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass all the blocked data into
        # this function, but only a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.goodThreshold(blocked_data,
                                          recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        # write out our results
        if self.output_file:
            with open(self.output_file, 'w') as output_file:
                csvhelpers.writeResults(clustered_dupes, self.input,
                                        output_file)
        else:
            csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
Code Example #6
    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()

print 'blocking...'
blocker = deduper.blockingFunction(constrained_matching,
                                   ppc=.0001,
                                   uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker, constrained_matching))

alpha = deduper.goodThreshold(blocked_data, constrained_matching)

# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data,
                                            data_d,
                                            constrained_matching,
                                            threshold=alpha)

deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set(
    [frozenset(pair) for (pair, score) in deduper.dupes if score > alpha])
Code Example #7
    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()


print 'blocking...'
blocker = deduper.blockingFunction(ppc=.0001, uncovered_dupes=0)
blocked_data = tuple(dedupe.blockData(data_d, blocker))

alpha = deduper.goodThreshold(blocked_data)


# print candidates
print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data,
                                            data_d,
                                            threshold=alpha)


deduper.writeSettings(settings_file)

print 'Evaluate Scoring'
found_dupes = set([frozenset(pair) for (pair, score) in deduper.dupes
                   if score > alpha])
Code Example #8
print 'blocking...'
# Initialize our blocker. We'll learn our blocking rules if we haven't
# loaded them from a saved settings file.
blocker = deduper.blockingFunction()

# Save our weights and predicates to disk.  If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
deduper.writeSettings(settings_file)

# Load all the original data into memory and place
# them into blocks. Each record can be blocked in many ways, so for
# larger data, memory will be a limiting factor.

blocked_data = dedupe.blockData(data_d, blocker)

# ## Clustering

# Find the threshold that will maximize a weighted average of our precision and recall.
# When we set the recall weight to 2, we are saying we care twice as much
# about recall as we do precision.
#
# If we had more data, we would not pass all the blocked data into
# this function, but only a representative sample.

threshold = deduper.goodThreshold(blocked_data, recall_weight=2)

# `duplicateClusters` will return sets of record IDs that dedupe
# believes are all referring to the same entity.
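This example (like #2) breaks off before the clustering call itself. Judging from the fuller examples #3 and #5, the missing step is roughly the following; the cluster_membership bookkeeping is an illustrative assumption rather than part of the original file.

clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

print('# duplicate sets %d' % len(clustered_dupes))

# Illustrative only: map each record ID to a cluster number so the
# assignments can be joined back onto the original rows.
cluster_membership = {}
for cluster_id, cluster in enumerate(clustered_dupes):
    for record_id in cluster:
        cluster_membership[record_id] = cluster_id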