Example #1
  def main(self) :

    data_d = {}
    # import the specified CSV file

    data_d = csvhelpers.readData(self.input, self.field_names)
    
    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition :
      if self.field_definition[field]['type'] != 'Interaction' :
        if field not in data_d[0]:
          raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      deduper.train(data_sample, labeler.label)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')

    # ## Blocking

    logging.info('blocking...')
    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place
    # it into blocks. Each record can be blocked in many ways, so for
    # larger data sets, memory will be a limiting factor.

    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else :
      csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
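
The goodThreshold() call above trades precision against recall according to recall_weight. The snippet below is a minimal, self-contained sketch of that trade-off, assuming an F-beta style weighting (beta = recall_weight) over made-up pair scores; it illustrates the idea the comments describe, not dedupe's actual internals.

# Sketch of the recall_weight trade-off described in the comments above.
# Assumes an F-beta style score (beta = recall_weight); the pair scores and
# "true" duplicates below are invented for illustration.

def weighted_score(precision, recall, recall_weight=2.0):
    # With recall_weight=2, recall counts twice as much as precision.
    beta_sq = recall_weight ** 2
    if precision == 0.0 and recall == 0.0:
        return 0.0
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

def pick_threshold(scored_pairs, true_dupes, candidates=(0.3, 0.5, 0.7, 0.9)):
    # scored_pairs: {(id_a, id_b): confidence}; true_dupes: known duplicate pairs.
    best_threshold, best_score = None, -1.0
    for t in candidates:
        predicted = {pair for pair, score in scored_pairs.items() if score >= t}
        if not predicted:
            continue
        precision = len(predicted & true_dupes) / len(predicted)
        recall = len(predicted & true_dupes) / len(true_dupes)
        score = weighted_score(precision, recall)
        if score > best_score:
            best_threshold, best_score = t, score
    return best_threshold, best_score

scores = {(1, 2): 0.95, (3, 4): 0.65, (5, 6): 0.35}
print(pick_threshold(scores, true_dupes={(1, 2), (3, 4)}))  # -> (0.5, 1.0)
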
Example #2
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in next(iter(data_1.values())):
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in next(iter(data_2.values())):
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field'] for 
                                       field in self.field_definition])
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      with open(self.training_file) as tf :
        deduper.readTraining(tf)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      csvhelpers.consoleLabel(deduper)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      with open(self.training_file, 'w') as tf :
        deduper.writeTraining(tf)
    else:
      logging.info('skipping the training step')

    deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return pairs of record IDs that dedupe
    # believes refer to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
      write_function(clustered_dupes,
                     self.input_1,
                     self.input_2,
                     sys.stdout,
                     self.inner_join)
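
One step worth isolating is the column remapping near the top of this example: when the two files use different headers, data_2 is rewritten so its keys match field_names_1, pairing columns positionally with zip(). Below is a standalone toy run of that loop; the record keys and values are invented.

# Standalone toy run of the remapping loop above; only the zip()-based
# renaming matches the example, the records themselves are made up.
field_names_1 = ['name', 'address']
field_names_2 = ['employer', 'employer_address']

data_2 = {
    'input_2|0': {'employer': 'Acme Corp', 'employer_address': '1 Main St'},
    'input_2|1': {'employer': 'Acme Inc',  'employer_address': '1 Main Street'},
}

for record_id, record in data_2.items():
    data_2[record_id] = {
        new_field: record[old_field]
        for new_field, old_field in zip(field_names_1, field_names_2)
    }

print(data_2['input_2|0'])  # {'name': 'Acme Corp', 'address': '1 Main St'}
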
Example #3
    def main(self):

        data_d = {}
        # import the specified CSV file

        timer.elapsed('Starting file read: ')
        data_d = csvhelpers.readData(self.input, self.field_names)
        timer.elapsed('Finished file read: ')

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if field['type'] != 'Interaction':
                if field['field'] not in data_d[0]:
                    raise self.parser.error("Could not find field '" +
                                            field['field'] + "' in input")

        logging.info('using fields: %s' %
                     [field['field'] for field in self.field_definition])

        # If --skip_training has been selected and we still have a settings cache
        # persisting from the last run, reuse it for this run.
        # __Note:__ if you want to add more training data, don't use --skip_training
        if self.skip_training and os.path.exists(self.settings_file):

            # Load our deduper from the last training session cache.
            logging.info('reading from previous training cache %s' %
                         self.settings_file)
            with open(self.settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            unique_d, parents = exact_matches(data_d, fields)

        else:
            # # Create a new deduper object and pass our data model to it.
            deduper = dedupe.Dedupe(self.field_definition)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            unique_d, parents = exact_matches(data_d, fields)

            # Set up our data sample
            logging.info('taking a sample of %d possible pairs',
                         self.sample_size)
            deduper.sample(unique_d, self.sample_size)

            # Perform standard training procedures
            self.dedupe_training(deduper)

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass all of the blocked data into
        # this function, but only a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(unique_d,
                                      recall_weight=self.recall_weight)

        # `match` will return sets of record IDs that dedupe
        # believes all refer to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(unique_d, threshold)

        expanded_clustered_dupes = []
        for cluster, scores in clustered_dupes:
            new_cluster = list(cluster)
            new_scores = list(scores)
            for row_id, score in zip(cluster, scores):
                children = parents.get(row_id, [])
                new_cluster.extend(children)
                new_scores.extend([score] * len(children))
            expanded_clustered_dupes.append((new_cluster, new_scores))

        clustered_dupes = expanded_clustered_dupes

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeResults
        # write out our results
        if self.destructive:
            write_function = csvhelpers.writeUniqueResults

        if self.output_file:
            with open(self.output_file, 'w', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input, output_file)
        else:
            if sys.version < '3':
                out = codecs.getwriter(locale.getpreferredencoding())(
                    sys.stdout)
                write_function(clustered_dupes, self.input, out)
            else:
                write_function(clustered_dupes, self.input, sys.stdout)
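
This example leans on an exact_matches() helper the others do not use: it collapses records that are identical on the primary fields before sampling and matching, and returns a parents mapping so the cluster-expansion loop at the end can put the collapsed records back. The helper's real implementation isn't shown here; the sketch below is a hypothetical version written only from how its return values are consumed above.

# Hypothetical sketch of an exact_matches()-style helper, inferred from how
# unique_d and parents are used in the example; not the project's actual code.
from collections import defaultdict

def exact_matches_sketch(data_d, fields):
    unique_d = {}                 # one representative record per distinct key
    parents = defaultdict(list)   # representative id -> ids collapsed into it
    seen = {}                     # field-value tuple -> representative id
    for record_id, record in data_d.items():
        key = tuple(record.get(field) for field in sorted(fields))
        if key in seen:
            parents[seen[key]].append(record_id)
        else:
            seen[key] = record_id
            unique_d[record_id] = record
    return unique_d, dict(parents)

# Toy usage: rows 1 and 3 agree exactly on the only primary field.
rows = {1: {'name': 'jane doe'}, 2: {'name': 'j. doe'}, 3: {'name': 'jane doe'}}
print(exact_matches_sketch(rows, {'name'}))
# -> ({1: {'name': 'jane doe'}, 2: {'name': 'j. doe'}}, {1: [3]})
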
Example #4
    def main(self):

        data_d = {}
        # import the specified CSV file

        data_d = csvhelpers.readData(self.input, self.field_names)

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if self.field_definition[field]['type'] != 'Interaction':
                if field not in data_d[0]:
                    raise parser.error("Could not find field '" + field +
                                       "' in input")

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        data_sample = dedupe.dataSample(data_d, self.sample_size)

        logging.info('using fields: %s' % self.field_definition.keys())
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file

        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            deduper.train(data_sample, str(self.training_file))
        elif self.skip_training:
            raise parser.error(
                "You need to provide an existing training_file or run this script without --skip_training"
            )

        if not self.skip_training:
            logging.info('starting active labeling...')

            deduper.train(data_sample, labeler.label)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            deduper.writeTraining(self.training_file)
        else:
            logging.info('skipping the training step')

        # ## Blocking

        logging.info('blocking...')
        # Initialize our blocker. We'll learn our blocking rules if we haven't
        # loaded them from a saved settings file.
        blocker = deduper.blockingFunction()

        # Load all the original data into memory and place
        # it into blocks. Each record can be blocked in many ways, so for
        # larger data sets, memory will be a limiting factor.

        blocked_data = dedupe.blockData(data_d, blocker)

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass all of the blocked data into
        # this function, but only a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.goodThreshold(blocked_data,
                                          recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        # write out our results
        if self.output_file:
            with open(self.output_file, 'w') as output_file:
                csvhelpers.writeResults(clustered_dupes, self.input,
                                        output_file)
        else:
            csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
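
The final writeResults() call is where clusters turn back into rows. The sketch below shows one way such a writer could work: assign a cluster id to every record id that appears in clustered_dupes, then emit the rows with that id. The column names and CSV layout are assumptions for illustration, not csvhelpers' actual output format.

# Hypothetical sketch of a results writer: label each row with the id of the
# cluster it belongs to. Column names and layout are assumptions.
import csv
import io

def write_clusters_sketch(clustered_dupes, rows, out):
    membership = {}
    for cluster_id, record_ids in enumerate(clustered_dupes):
        for record_id in record_ids:
            membership[record_id] = cluster_id

    writer = csv.writer(out)
    writer.writerow(['cluster_id', 'row_id', 'name'])
    for row_id, row in rows.items():
        writer.writerow([membership.get(row_id, ''), row_id, row['name']])

# Toy data: rows 0 and 1 were judged duplicates, row 2 stands alone.
rows = {0: {'name': 'Acme Corp'}, 1: {'name': 'ACME Corp.'}, 2: {'name': 'Initech'}}
buffer = io.StringIO()
write_clusters_sketch([(0, 1)], rows, buffer)
print(buffer.getvalue())
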
Example #5
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in next(iter(data_1.values())):
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in next(iter(data_2.values())):
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.readTraining(self.training_file)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      dedupe.consoleLabel(deduper)
      deduper.train()

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')
      deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return pairs of record IDs that dedupe
    # believes refer to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
      write_function(clustered_dupes,
                     self.input_1,
                     self.input_2,
                     sys.stdout,
                     self.inner_join)
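
All five examples read their configuration off self: self.input, self.field_names, self.training_file, self.skip_training, self.sample_size, self.recall_weight, self.output_file, and so on. Below is a minimal sketch of how such attributes might be wired up with argparse; the flag names, defaults, and help strings are assumptions for illustration, not the project's actual command-line definition.

# Hypothetical argparse wiring for the attributes the examples read off self.
# Flag names and defaults are assumptions, shown only to make the examples
# above self-explanatory.
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='Deduplicate or link CSV files.')
    parser.add_argument('input', help='CSV file to deduplicate')
    parser.add_argument('--field_names', nargs='+', required=True,
                        help='columns to compare')
    parser.add_argument('--training_file', default='training.json',
                        help='labeled pairs from a previous run')
    parser.add_argument('--skip_training', action='store_true',
                        help='reuse existing training instead of labeling')
    parser.add_argument('--sample_size', type=int, default=1500,
                        help='number of candidate pairs to sample')
    parser.add_argument('--recall_weight', type=float, default=2.0,
                        help='how much recall is favored over precision')
    parser.add_argument('--output_file', help='write results here instead of stdout')
    return parser

args = build_parser().parse_args(['input.csv', '--field_names', 'name', 'address'])
print(args.training_file, args.skip_training, args.recall_weight)
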