Example #1
  def main(self) :

    data_d = {}
    # import the specified CSV file

    data_d = csvhelpers.readData(self.input, self.field_names)
    
    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition :
      if self.field_definition[field]['type'] != 'Interaction' :
        if field not in data_d[0]:
          raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      deduper.train(data_sample, labeler.label)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')

    # ## Blocking

    logging.info('blocking...')
    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place
    # it into blocks. Each record can be blocked in many ways, so for
    # larger data sets, memory will be a limiting factor.

    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else :
      csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
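
The goodThreshold() call above trades precision against recall according to recall_weight. The snippet below is a minimal, self-contained sketch of that trade-off, assuming an F-beta style weighting (beta = recall_weight) over made-up pair scores; it illustrates the idea the comments describe, not dedupe's actual internals.

# Sketch of the recall_weight trade-off described in the comments above.
# Assumes an F-beta style score (beta = recall_weight); the pair scores and
# "true" duplicates below are invented for illustration.

def weighted_score(precision, recall, recall_weight=2.0):
    # With recall_weight=2, recall counts twice as much as precision.
    beta_sq = recall_weight ** 2
    if precision == 0.0 and recall == 0.0:
        return 0.0
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

def pick_threshold(scored_pairs, true_dupes, candidates=(0.3, 0.5, 0.7, 0.9)):
    # scored_pairs: {(id_a, id_b): confidence}; true_dupes: known duplicate pairs.
    best_threshold, best_score = None, -1.0
    for t in candidates:
        predicted = {pair for pair, score in scored_pairs.items() if score >= t}
        if not predicted:
            continue
        precision = len(predicted & true_dupes) / len(predicted)
        recall = len(predicted & true_dupes) / len(true_dupes)
        score = weighted_score(precision, recall)
        if score > best_score:
            best_threshold, best_score = t, score
    return best_threshold, best_score

scores = {(1, 2): 0.95, (3, 4): 0.65, (5, 6): 0.35}
print(pick_threshold(scores, true_dupes={(1, 2), (3, 4)}))  # -> (0.5, 1.0)
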
Example #2
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in next(iter(data_1.values())):
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in next(iter(data_2.values())):
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field'] for 
                                       field in self.field_definition])
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      with open(self.training_file) as tf :
        deduper.readTraining(tf)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      csvhelpers.consoleLabel(deduper)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      with open(self.training_file, 'w') as tf :
        deduper.writeTraining(tf)
    else:
      logging.info('skipping the training step')

    deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return pairs of record IDs that dedupe
    # believes refer to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
      write_function(clustered_dupes,
                     self.input_1,
                     self.input_2,
                     sys.stdout,
                     self.inner_join)
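
One step worth isolating is the column remapping near the top of this example: when the two files use different headers, data_2 is rewritten so its keys match field_names_1, pairing columns positionally with zip(). Below is a standalone toy run of that loop; the record keys and values are invented.

# Standalone toy run of the remapping loop above; only the zip()-based
# renaming matches the example, the records themselves are made up.
field_names_1 = ['name', 'address']
field_names_2 = ['employer', 'employer_address']

data_2 = {
    'input_2|0': {'employer': 'Acme Corp', 'employer_address': '1 Main St'},
    'input_2|1': {'employer': 'Acme Inc',  'employer_address': '1 Main Street'},
}

for record_id, record in data_2.items():
    data_2[record_id] = {
        new_field: record[old_field]
        for new_field, old_field in zip(field_names_1, field_names_2)
    }

print(data_2['input_2|0'])  # {'name': 'Acme Corp', 'address': '1 Main St'}
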
Example #3
    def main(self):

        data_d = {}
        # import the specified CSV file

        timer.elapsed('Starting file read: ')
        data_d = csvhelpers.readData(self.input, self.field_names)
        timer.elapsed('Finished file read: ')

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if field['type'] != 'Interaction':
                if field['field'] not in data_d[0]:
                    raise self.parser.error("Could not find field '" +
                                            field['field'] + "' in input")

        logging.info('using fields: %s' %
                     [field['field'] for field in self.field_definition])

        # If --skip_training has been selected and we still have a settings cache
        # persisting from the last run, reuse it for this run.
        # __Note:__ if you want to add more training data, don't use --skip_training
        if self.skip_training and os.path.exists(self.settings_file):

            # Load our deduper from the last training session cache.
            logging.info('reading from previous training cache %s' %
                         self.settings_file)
            with open(self.settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            unique_d, parents = exact_matches(data_d, fields)

        else:
            # # Create a new deduper object and pass our data model to it.
            deduper = dedupe.Dedupe(self.field_definition)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            unique_d, parents = exact_matches(data_d, fields)

            # Set up our data sample
            logging.info('taking a sample of %d possible pairs',
                         self.sample_size)
            deduper.sample(unique_d, self.sample_size)

            # Perform standard training procedures
            self.dedupe_training(deduper)

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass all of the blocked data into
        # this function, but only a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(unique_d,
                                      recall_weight=self.recall_weight)

        # `match` will return sets of record IDs that dedupe
        # believes all refer to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(unique_d, threshold)

        expanded_clustered_dupes = []
        for cluster, scores in clustered_dupes:
            new_cluster = list(cluster)
            new_scores = list(scores)
            for row_id, score in zip(cluster, scores):
                children = parents.get(row_id, [])
                new_cluster.extend(children)
                new_scores.extend([score] * len(children))
            expanded_clustered_dupes.append((new_cluster, new_scores))

        clustered_dupes = expanded_clustered_dupes

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeResults
        # write out our results
        if self.destructive:
            write_function = csvhelpers.writeUniqueResults

        if self.output_file:
            with open(self.output_file, 'w', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input, output_file)
        else:
            if sys.version < '3':
                out = codecs.getwriter(locale.getpreferredencoding())(
                    sys.stdout)
                write_function(clustered_dupes, self.input, out)
            else:
                write_function(clustered_dupes, self.input, sys.stdout)
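
This example leans on an exact_matches() helper the others do not use: it collapses records that are identical on the primary fields before sampling and matching, and returns a parents mapping so the cluster-expansion loop at the end can put the collapsed records back. The helper's real implementation isn't shown here; the sketch below is a hypothetical version written only from how its return values are consumed above.

# Hypothetical sketch of an exact_matches()-style helper, inferred from how
# unique_d and parents are used in the example; not the project's actual code.
from collections import defaultdict

def exact_matches_sketch(data_d, fields):
    unique_d = {}                 # one representative record per distinct key
    parents = defaultdict(list)   # representative id -> ids collapsed into it
    seen = {}                     # field-value tuple -> representative id
    for record_id, record in data_d.items():
        key = tuple(record.get(field) for field in sorted(fields))
        if key in seen:
            parents[seen[key]].append(record_id)
        else:
            seen[key] = record_id
            unique_d[record_id] = record
    return unique_d, dict(parents)

# Toy usage: rows 1 and 3 agree exactly on the only primary field.
rows = {1: {'name': 'jane doe'}, 2: {'name': 'j. doe'}, 3: {'name': 'jane doe'}}
print(exact_matches_sketch(rows, {'name'}))
# -> ({1: {'name': 'jane doe'}, 2: {'name': 'j. doe'}}, {1: [3]})
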
Example #4
    def main(self):

        data_d = {}
        # import the specified CSV file

        data_d = csvhelpers.readData(self.input, self.field_names)

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if self.field_definition[field]['type'] != 'Interaction':
                if field not in data_d[0]:
                    raise parser.error("Could not find field '" + field +
                                       "' in input")

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        data_sample = dedupe.dataSample(data_d, self.sample_size)

        logging.info('using fields: %s' % self.field_definition.keys())
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file

        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            deduper.train(data_sample, str(self.training_file))
        elif self.skip_training:
            raise parser.error(
                "You need to provide an existing training_file or run this script without --skip_training"
            )

        if not self.skip_training:
            logging.info('starting active labeling...')

            deduper.train(data_sample, labeler.label)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            deduper.writeTraining(self.training_file)
        else:
            logging.info('skipping the training step')

        # ## Blocking

        logging.info('blocking...')
        # Initialize our blocker. We'll learn our blocking rules if we haven't
        # loaded them from a saved settings file.
        blocker = deduper.blockingFunction()

        # Load all the original data into memory and place
        # it into blocks. Each record can be blocked in many ways, so for
        # larger data sets, memory will be a limiting factor.

        blocked_data = dedupe.blockData(data_d, blocker)

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass all of the blocked data into
        # this function, but only a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.goodThreshold(blocked_data,
                                          recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        # write out our results
        if self.output_file:
            with open(self.output_file, 'w') as output_file:
                csvhelpers.writeResults(clustered_dupes, self.input,
                                        output_file)
        else:
            csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
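
The final writeResults() call is where clusters turn back into rows. The sketch below shows one way such a writer could work: assign a cluster id to every record id that appears in clustered_dupes, then emit the rows with that id. The column names and CSV layout are assumptions for illustration, not csvhelpers' actual output format.

# Hypothetical sketch of a results writer: label each row with the id of the
# cluster it belongs to. Column names and layout are assumptions.
import csv
import io

def write_clusters_sketch(clustered_dupes, rows, out):
    membership = {}
    for cluster_id, record_ids in enumerate(clustered_dupes):
        for record_id in record_ids:
            membership[record_id] = cluster_id

    writer = csv.writer(out)
    writer.writerow(['cluster_id', 'row_id', 'name'])
    for row_id, row in rows.items():
        writer.writerow([membership.get(row_id, ''), row_id, row['name']])

# Toy data: rows 0 and 1 were judged duplicates, row 2 stands alone.
rows = {0: {'name': 'Acme Corp'}, 1: {'name': 'ACME Corp.'}, 2: {'name': 'Initech'}}
buffer = io.StringIO()
write_clusters_sketch([(0, 1)], rows, buffer)
print(buffer.getvalue())
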
Example #5
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in next(iter(data_1.values())):
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in next(iter(data_2.values())):
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.readTraining(self.training_file)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      dedupe.consoleLabel(deduper)
      deduper.train()

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')
      deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass all of the blocked data into
    # this function, but only a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `match` will return pairs of record IDs that dedupe
    # believes refer to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
      write_function(clustered_dupes,
                     self.input_1,
                     self.input_2,
                     sys.stdout,
                     self.inner_join)
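
All five examples read their configuration off self: self.input, self.field_names, self.training_file, self.skip_training, self.sample_size, self.recall_weight, self.output_file, and so on. Below is a minimal sketch of how such attributes might be wired up with argparse; the flag names, defaults, and help strings are assumptions for illustration, not the project's actual command-line definition.

# Hypothetical argparse wiring for the attributes the examples read off self.
# Flag names and defaults are assumptions, shown only to make the examples
# above self-explanatory.
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='Deduplicate or link CSV files.')
    parser.add_argument('input', help='CSV file to deduplicate')
    parser.add_argument('--field_names', nargs='+', required=True,
                        help='columns to compare')
    parser.add_argument('--training_file', default='training.json',
                        help='labeled pairs from a previous run')
    parser.add_argument('--skip_training', action='store_true',
                        help='reuse existing training instead of labeling')
    parser.add_argument('--sample_size', type=int, default=1500,
                        help='number of candidate pairs to sample')
    parser.add_argument('--recall_weight', type=float, default=2.0,
                        help='how much recall is favored over precision')
    parser.add_argument('--output_file', help='write results here instead of stdout')
    return parser

args = build_parser().parse_args(['input.csv', '--field_names', 'name', 'address'])
print(args.training_file, args.skip_training, args.recall_weight)
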