Example #1
0
    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a variable definition and an optional data sample.

        Arguments:

        variable_definition -- A list of dictionaries where each
                               dictionary describes a variable to use
                               for comparing records. It is passed
                               unchanged to DataModel. For details about
                               variable types, check the documentation
                               at <http://dedupe.readthedocs.org>.

        data_sample -- Optional sequence of record pairs. When it is
                       omitted or empty, no active learner is created
                       and `self.data_sample` is set to an empty list.

        num_cores -- Number of cores passed on to the active learner.
                     Defaults to multiprocessing.cpu_count().

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                     ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        In the data_sample, each element is a pair of records. In a
        record dictionary the keys are the names of the record fields
        and the values are the record values.

        NOTE(review): earlier documentation described each record as a
        (key, record dictionary) tuple; confirm the expected record shape
        against `_checkDataSample`, which performs the validation.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            num_cores = multiprocessing.cpu_count()
        self.num_cores = num_cores

        if data_sample:
            self.data_sample = data_sample
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(
                self.data_sample, self.data_model, self.num_cores)
        else:
            # Normalise "no sample" (None or empty) to an empty list.
            self.data_sample = []
            self.activeLearner = None

        # One row per labeled pair: the label plus one distance per
        # entry of the data model.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Built from a list of pairs rather than a dict literal so that the
        # key order is deterministic on every Python version.
        self.training_pairs = OrderedDict([(u'distinct', []),
                                           (u'match', [])])

        self.blocker = None
Example #2
0
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """
    # NOTE: this is a class attribute, so the classifier object is shared
    # by every instance that does not assign its own `classifier`.
    classifier = rlr.RegularizedLogisticRegression()

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a variable definition and an optional data sample.

        Arguments:

        variable_definition -- A list of dictionaries where each
                               dictionary describes a variable to use
                               for comparing records. It is passed
                               unchanged to DataModel. For details about
                               variable types, check the documentation
                               at <http://dedupe.readthedocs.org>.

        data_sample -- Optional sequence of record pairs. When it is
                       omitted or empty, no active learner is created
                       and `self.data_sample` is set to an empty list.

        num_cores -- Number of cores passed on to the active learner.
                     Defaults to multiprocessing.cpu_count().

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                     ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        In the data_sample, each element is an ordered pair of records.
        Each record is a dictionary (it must provide `keys()`, see
        `_checkRecordPairType`) whose keys are the names of the record
        fields and whose values are the record values.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            num_cores = multiprocessing.cpu_count()
        self.num_cores = num_cores

        if data_sample:
            self.data_sample = data_sample
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(
                self.data_sample, self.data_model, self.num_cores)
        else:
            # Normalise "no sample" (None or empty) to an empty list.
            self.data_sample = []
            self.activeLearner = None

        # One row per labeled pair: the label plus one distance per
        # entry of the data model.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Built from a list of pairs rather than a dict literal so that the
        # key order is deterministic on every Python version.
        self.training_pairs = OrderedDict([(u'distinct', []),
                                           (u'match', [])])

        self.blocker = None

    def cleanupTraining(self):  # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.

        The training attributes are deleted outright, so the training
        methods of this object cannot be used afterwards.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data

        Raises EmptyTrainingException when the file contains no examples
        under any label.
        '''

        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for label, examples in training_pairs.items():
            # Only the first example of each label is validated.
            if examples:
                self._checkRecordPairType(examples[0])

            frozen_examples = core.freezeData(examples)

            training_pairs[label] = frozen_examples
            self.training_pairs[label].extend(frozen_examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self,
              ppc=.1,
              uncovered_dupes=None,
              index_predicates=True,
              pud=0.025):  # pragma : no cover
        """Train the classifier and then the blocker.

        Keyword arguments:
        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a fraction
               of possible pairs greater than the ppc, that predicate will
               be removed from consideration.

               As the size of the data increases, the user will generally
               want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our training
                           data that we can accept will not be put into any
                           block. If true duplicates are never in the
                           same block, we will never compare them, and may
                           never declare them to be duplicates.

                           However, requiring that we cover every single
                           true dupe pair may mean that we have to use
                           blocks that put together many, many distinct
                           pairs that we'll have to expensively compare
                           as well.

                           uncovered_dupes is deprecated in favor of pud;
                           passing it emits a DeprecationWarning.

        pud -- The proportion of true dupe pairs in our training data
               that we can accept will not be put into any
               block. If true duplicates are never in the same
               block, we will never compare them, and may never
               declare them to be duplicates.

               If both pud and uncovered_dupes are set, uncovered_dupes
               will take priority

               pud should be a float between 0.0 and 1.0

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index predicates
                            can be slower and take substantial memory.

                            Defaults to True.

        """
        if uncovered_dupes is None:
            # Translate the proportion into an absolute number of pairs.
            uncovered_dupes = int(pud * len(self.training_pairs['match']))
        else:
            warnings.warn(
                "The uncovered_dupes argument of the train method will be deprecated in dedupe 1.4, please use the pud argument instead",
                DeprecationWarning)

        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self):  # pragma : no cover
        """Fit the classifier on the accumulated training data."""
        # The 'label' field is a bytes ('S8') column, hence b'match'.
        is_match = self.training_data['label'] == b'match'
        labels = numpy.array(is_match, dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    def _trainBlocker(self, ppc, uncovered_dupes,
                      index_predicates):  # pragma : no cover
        """Learn blocking predicates and build `self.blocker` from them."""
        # Work on a copy so the user's labeled pairs are not polluted by
        # the automatically selected non-duplicates added below.
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        self.predicates = dedupe.training.blockTraining(
            training_pairs, predicate_set, ppc, uncovered_dupes,
            self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)

    def writeSettings(self, file_obj):  # pragma : no cover
        """
        Write a settings file containing the
        data model, classifier and predicates to a file object

        Keyword arguments:
        file_obj -- file object to write settings data into; the three
                    objects are pickled to it in that order
        """

        for setting in (self.data_model, self.classifier, self.predicates):
            pickle.dump(setting, file_obj)

    def writeTraining(self, file_obj):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_obj -- file object to write training data to
        """

        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.

        Useful for user labeling.

        Raises ValueError if no training data has been added yet and the
        data sample is empty.
        '''

        if self.training_data.shape[0] == 0:
            if len(self.data_sample) == 0:
                raise ValueError("Cannot pick uncertain pairs from an "
                                 "empty data_sample")

            # Seed the classifier: a record paired with itself stands in
            # for a match and a random sampled pair for a distinct pair.
            # These seeds go into training_data only, not training_pairs.
            random_pair = self.data_sample[
                random.randint(0, len(self.data_sample) - 1)]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({
                u'match': [exact_match, exact_match],
                u'distinct': [random_pair]
            })

        self._trainClassifier()

        n_match = len(self.training_pairs[u'match'])
        n_distinct = len(self.training_pairs[u'distinct'])

        # Observed share of matches among the labeled pairs. float() keeps
        # this a true division even without `from __future__ import
        # division` on Python 2.
        if n_match:
            bias = n_match / float(n_match + n_distinct)
        else:
            bias = 0

        min_examples = min(n_match, n_distinct)

        regularizer = 10

        # Blend the observed share with 0.5, weighting 0.5 more heavily as
        # the rarer label accumulates examples.
        bias = ((0.5 * min_examples + bias * regularizer) /
                (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs of records to dedupe's training set.

        The classifier is not refit here; `uncertainPairs` and `train`
        refit it from the accumulated training data.

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of records

        Raises ValueError if labeled_pairs does not have that shape.
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except (AttributeError, KeyError, TypeError):
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair under each label is validated.
        for label in (u'match', u'distinct'):
            if labeled_pairs[label]:
                self._checkRecordPairType(labeled_pairs[label][0])

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        """
        Validate that record_pair is an ordered pair of record dictionaries
        acceptable to the data model; raise ValueError otherwise.
        """
        try:
            record_pair[0]
        except (TypeError, IndexError, KeyError):
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try:
            # Probe both records separately: chaining the two calls with
            # `and` would skip the second record whenever the first one is
            # an empty dictionary.
            record_pair[0].keys()
            record_pair[1].keys()
        except (AttributeError, TypeError, IndexError, KeyError):
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        """
        Validate a data sample: it must support len(), and its first
        element (if any) must be a valid record pair. An empty sample only
        triggers a warning.
        """
        try:
            sample_size = len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if sample_size:
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.

        For every label, one row per example is appended, holding the
        label and the distances computed by the data model.
        """

        for label, examples in labeled_pairs.items():
            n_examples = len(examples)

            new_data = numpy.empty(n_examples, dtype=self.training_data.dtype)

            new_data['label'] = [label] * n_examples
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data, new_data)

    def _loadSample(self, data_sample):
        """Validate and store a data sample and rebuild the active learner."""

        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)
Example #3
0
    def __init__(self, 
                 field_definition, 
                 data_sample = None,
                 num_processes = 1) :
        """
        Initialize from a field definition and an optional data sample.

        Arguments:

        field_definition -- A dictionary (or dict subclass) where the
                            keys are the fields that will be used for
                            training a model and the values are the
                            field specifications. Anything else raises
                            ValueError.

        data_sample -- Optional sequence of record pairs. When it is
                       omitted or empty, no active learner is created.

        num_processes -- Stored on the instance as `num_processes`.
                         Defaults to 1.

        #### Example usage

            # initialize from a defined set of fields
            fields = {'Site name': {'type': 'String'},
                      'Address':   {'type': 'String'},
                      'Zip':       {'type': 'String', 'Has Missing':True},
                      'Phone':     {'type': 'String', 'Has Missing':True},
                      }

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        Field types include

        - String

        A 'String' type field must have as its key a name of a field
        as it appears in the data dictionary and a type declaration
        ex. `{'Phone': {'type': 'String'}}`

        Longer example of a field definition:

            fields = {'name':       {'type': 'String'},
                      'address':    {'type': 'String'},
                      'city':       {'type': 'String'},
                      'cuisine':    {'type': 'String'}
                      }

        In the data_sample, each element is a pair of records. In a
        record dictionary the keys are the names of the record fields
        and the values are the record values.

        NOTE(review): earlier documentation described each record as a
        (key, record dictionary) tuple; confirm the expected record shape
        against `_checkDataSample`, which performs the validation.
        """
        super(ActiveMatching, self).__init__()

        # isinstance (rather than an exact class comparison) so that dict
        # subclasses such as OrderedDict are accepted as well.
        if not isinstance(field_definition, dict) :
            raise ValueError('Incorrect Input Type: must supply '
                             'a field definition.')

        self.data_model = DataModel(field_definition)

        self.data_sample = data_sample

        if self.data_sample :
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                         self.data_model)
        else :
            self.activeLearner = None

        self.num_processes = num_processes

        # One row per labeled pair: the label plus one distance per field
        # of the data model.
        n_fields = len(self.data_model['fields'])
        training_dtype = [('label', 'S8'), 
                          ('distances', 'f4', (n_fields, ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Built from a list of pairs rather than a dict literal so that the
        # key order is deterministic on every Python version.
        self.training_pairs = dedupe.backport.OrderedDict([('distinct', []),
                                                           ('match', [])])
Example #4
0
class ActiveMatching(Matching) :
    """
    Class for training dedupe extends Matching.
    
    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    """

    def __init__(self, 
                 field_definition, 
                 data_sample = None,
                 num_processes = 1) :
        """
        Initialize from a field definition and an optional data sample.

        Arguments:

        field_definition -- A dictionary (or dict subclass) where the
                            keys are the fields that will be used for
                            training a model and the values are the
                            field specifications. Anything else raises
                            ValueError.

        data_sample -- Optional sequence of record pairs. When it is
                       omitted or empty, no active learner is created.

        num_processes -- Stored on the instance as `num_processes`.
                         Defaults to 1.

        #### Example usage

            # initialize from a defined set of fields
            fields = {'Site name': {'type': 'String'},
                      'Address':   {'type': 'String'},
                      'Zip':       {'type': 'String', 'Has Missing':True},
                      'Phone':     {'type': 'String', 'Has Missing':True},
                      }

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        Field types include

        - String

        A 'String' type field must have as its key a name of a field
        as it appears in the data dictionary and a type declaration
        ex. `{'Phone': {'type': 'String'}}`

        Longer example of a field definition:

            fields = {'name':       {'type': 'String'},
                      'address':    {'type': 'String'},
                      'city':       {'type': 'String'},
                      'cuisine':    {'type': 'String'}
                      }

        In the data_sample, each element is an ordered pair of records.
        Each record is a dictionary (it must provide `keys()`, see
        `_checkRecordPairType`) whose keys are the names of the record
        fields and whose values are the record values.
        """
        super(ActiveMatching, self).__init__()

        # isinstance (rather than an exact class comparison) so that dict
        # subclasses such as OrderedDict are accepted as well.
        if not isinstance(field_definition, dict) :
            raise ValueError('Incorrect Input Type: must supply '
                             'a field definition.')

        self.data_model = DataModel(field_definition)

        self.data_sample = data_sample

        if self.data_sample :
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                         self.data_model)
        else :
            self.activeLearner = None

        self.num_processes = num_processes

        # One row per labeled pair: the label plus one distance per field
        # of the data model.
        n_fields = len(self.data_model['fields'])
        training_dtype = [('label', 'S8'), 
                          ('distances', 'f4', (n_fields, ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Built from a list of pairs rather than a dict literal so that the
        # key order is deterministic on every Python version.
        self.training_pairs = dedupe.backport.OrderedDict([('distinct', []),
                                                           ('match', [])])

    def readTraining(self, training_source) : # pragma : no cover
        '''
        Read training from previously saved training data file
        
        Arguments:
        
        training_source -- the path of the training data file
        '''

        LOGGER.info('reading training from file')

        with open(training_source, 'r') as training_file :
            training_pairs = json.load(training_file, 
                                       cls=serializer.dedupe_decoder)

        for label, examples in training_pairs.items() :
            # Only the first example of each label is validated.
            if examples :
                self._checkRecordPairType(examples[0])

            frozen_examples = core.freezeData(examples)

            training_pairs[label] = frozen_examples
            self.training_pairs[label].extend(frozen_examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=1, uncovered_dupes=1) :
        """
        Pick a regularization strength by cross-validated grid search,
        then train the classifier and the blocker.

        Keyword arguments:
        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a fraction
               of possible pairs greater than the ppc, that predicate will
               be removed from consideration.

               As the size of the data increases, the user will generally
               want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our training
                           data that we can accept will not be put into any
                           block. If true duplicates are never in the
                           same block, we will never compare them, and may
                           never declare them to be duplicates.

                           However, requiring that we cover every single
                           true dupe pair may mean that we have to use
                           blocks that put together many, many distinct
                           pairs that we'll have to expensively compare
                           as well.
        """
        # The 'label' field is a bytes ('S8') column, hence b'match'.
        n_matches = int(numpy.sum(self.training_data['label'] == b'match'))

        # A third of the matches, clamped to [2, 20]. Floor division keeps
        # the fold count an integer on Python 3 as well.
        n_folds = max(min(n_matches // 3, 20), 2)

        LOGGER.info('%d folds', n_folds)

        alpha = crossvalidation.gridSearch(self.training_data,
                                           core.trainModel, 
                                           self.data_model, 
                                           k=n_folds)

        self._trainClassifier(alpha)
        self._trainBlocker(ppc, uncovered_dupes)

    def _trainClassifier(self, alpha=.1) : # pragma : no cover
        """Retrain the data model on the training data and log its weights."""

        self.data_model = core.trainModel(self.training_data,
                                          self.data_model, 
                                          alpha)

        self._logLearnedWeights()

    def _trainBlocker(self, ppc=1, uncovered_dupes=1) :
        """Learn predicates and stop words and build `self.blocker`."""
        # Work on a copy so the user's labeled pairs are not polluted by
        # the automatically selected non-duplicates added below.
        training_pairs = copy.deepcopy(self.training_pairs)

        blocker_types = self._blockerTypes()

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            sample_size=32000)

        training_pairs['distinct'].extend(confident_nonduplicates)

        predicate_set = blocking.predicateGenerator(blocker_types, 
                                                    self.data_model)

        (self.predicates, 
         self.stop_words) = dedupe.blocking.blockTraining(training_pairs,
                                                          predicate_set,
                                                          ppc,
                                                          uncovered_dupes,
                                                          self._linkage_type)

        self.blocker = self._Blocker(self.predicates,
                                     self.stop_words) 

    def _blockerTypes(self) : # pragma : no cover
        """Return the candidate predicates, keyed by field type."""
        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        tfidf_string_predicates = tuple(tfidf.TfidfPredicate(threshold)
                                        for threshold
                                        in (0.2, 0.4, 0.6, 0.8))

        return {'String' : string_predicates + tfidf_string_predicates}

    def writeSettings(self, file_name): # pragma : no cover
        """
        Write a settings file that contains the 
        data model, predicates and stop words, pickled in that order

        Keyword arguments:
        file_name -- path to file
        """

        # pickle produces bytes, so the file must be opened in binary mode.
        with open(file_name, 'wb') as settings_file :
            for setting in (self.data_model,
                            self.predicates,
                            self.stop_words) :
                pickle.dump(setting, settings_file)

    def writeTraining(self, file_name): # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_name -- path to a json file
        """

        # json.dump writes text, so the file is opened in text mode.
        with open(file_name, 'w') as training_file :
            json.dump(self.training_pairs, 
                      training_file, 
                      default=serializer._to_json)

    def uncertainPairs(self) :
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.
        
        Useful for user labeling.
        '''

        if self.training_data.shape[0] == 0 :
            # random.randint includes both endpoints, so the upper bound
            # must be the last valid index.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]

            # Seed the model with a record paired with itself, standing in
            # for a match. The seed goes into training_data only, not
            # training_pairs.
            exact_match = (random_pair[0], random_pair[0]) 
            self._addTrainingData({'match':[exact_match, exact_match],
                                   'distinct':[]})

            self._trainClassifier(alpha=0.1)

        n_match = len(self.training_pairs['match'])
        n_distinct = len(self.training_pairs['distinct'])

        # The + 1.0 avoids division by zero and forces float division.
        dupe_ratio = n_match / (n_distinct + 1.0)

        return self.activeLearner.uncertainPairs(self.data_model, dupe_ratio)

    def markPairs(self, labeled_pairs) :
        '''
        Add labeled pairs of records to dedupe's training set and update
        the matching model
        
        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of records

        Raises ValueError if labeled_pairs does not have that shape.
        '''
        try :
            labeled_pairs.items()
            labeled_pairs['match']
            labeled_pairs['distinct']
        except (AttributeError, KeyError, TypeError) :
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair under each label is validated.
        for label in ('match', 'distinct') :
            if labeled_pairs[label] :
                self._checkRecordPairType(labeled_pairs[label][0])

        if not labeled_pairs['distinct'] and not labeled_pairs['match'] :
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items() :
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs) 

        self._trainClassifier(alpha=.1)

    def _checkRecordPairType(self, record_pair) :
        """
        Validate that record_pair is an ordered pair of record dictionaries
        that pass `_checkRecordType`; raise ValueError otherwise.
        """
        try :
            record_pair[0]
        except (TypeError, IndexError, KeyError) :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2 :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try :
            # Probe both records separately: chaining the two calls with
            # `and` would skip the second record whenever the first one is
            # an empty dictionary.
            record_pair[0].keys()
            record_pair[1].keys()
        except (AttributeError, TypeError, IndexError, KeyError) :
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self._checkRecordType(record_pair[0])
        self._checkRecordType(record_pair[1])

    def _checkDataSample(self, data_sample) :
        """
        Validate a data sample: it must support len(), and its first
        element (if any) must be a valid record pair. An empty sample only
        triggers a warning.
        """
        try :
            sample_size = len(data_sample)
        except TypeError :
            raise ValueError("data_sample must be a sequence")

        if sample_size :
            self._checkRecordPairType(data_sample[0])
        else :
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs) :
        """
        Appends training data to the training data collection.

        For every label, one row per example is appended, holding the
        label and the field distances computed by core.fieldDistances.
        """

        for label, examples in labeled_pairs.items() :
            n_examples = len(examples)

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = [label] * n_examples
            new_data['distances'] = core.fieldDistances(examples, 
                                                        self.data_model)

            self.training_data = numpy.append(self.training_data, 
                                              new_data)

    def _logLearnedWeights(self): # pragma: no cover
        """
        Log learned weights and bias terms
        """
        LOGGER.info('Learned Weights')
        for (key_1, value_1) in self.data_model.items():
            try:
                # Nested mappings carry one 'weight' per inner key.
                for (key_2, value_2) in value_1.items():
                    LOGGER.info((key_2, value_2['weight']))
            except AttributeError:
                # Values without items() are logged as they are.
                LOGGER.info((key_1, value_1))

    def _loadSample(self, *args, **kwargs) : # pragma : no cover
        """
        Draw a sample with `self._sample`, validate and store it, and
        rebuild the active learner.
        """

        data_sample = self._sample(*args, **kwargs)

        self._checkDataSample(data_sample) 

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                     self.data_model)
Example #5
0
    def __init__(self, 
                 variable_definition, 
                 data_sample = None,
                 num_cores = None) :
        """
        Initialize from a variable definition and an optional data sample.

        Arguments:

        variable_definition -- A list of dictionaries where each
                               dictionary describes a variable to use
                               for comparing records. It is passed
                               unchanged to DataModel. For details about
                               variable types, check the documentation
                               at <http://dedupe.readthedocs.org>.

        data_sample -- Optional sequence of record pairs. When it is
                       omitted or empty, no active learner is created
                       and `self.data_sample` is set to an empty list.

        num_cores -- Number of cores passed on to the active learner.
                     Defaults to multiprocessing.cpu_count().

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                     ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        In the data_sample, each element is a pair of records. In a
        record dictionary the keys are the names of the record fields
        and the values are the record values.

        NOTE(review): earlier documentation described each record as a
        (key, record dictionary) tuple; confirm the expected record shape
        against `_checkDataSample`, which performs the validation.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None :
            num_cores = multiprocessing.cpu_count()
        self.num_cores = num_cores

        if data_sample :
            self.data_sample = data_sample
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                         self.data_model,
                                                         self.num_cores)
        else :
            # Normalise "no sample" (None or empty) to an empty list.
            self.data_sample = []
            self.activeLearner = None

        # One row per labeled pair: the label plus one distance per
        # entry of the data model.
        training_dtype = [('label', 'S8'), 
                          ('distances', 'f4', 
                           (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Built from a list of pairs rather than a dict literal so that the
        # key order is deterministic on every Python version.
        self.training_pairs = OrderedDict([(u'distinct', []), 
                                           (u'match', [])])

        self.blocker = None
Example #6
0
class ActiveMatching(Matching) :
    classifier = rlr.RegularizedLogisticRegression()

    """
    Class for training dedupe extends Matching.
    
    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    def __init__(self, 
                 variable_definition, 
                 data_sample = None,
                 num_cores = None) :
        """
        Initialize from a data model and, optionally, a data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'name', 'type': 'String'},
                      {'field' : 'address', 'type': 'String'},
                      {'field' : 'city', 'type': 'String', 'Has Missing':True},
                      {'field' : 'cuisine', 'type': 'String', 'Has Missing':True},
                     ]

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                          ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Arguments

        variable_definition -- A list of dictionaries where each dictionary
                               describes a variable to use for comparing
                               records. It is handed to `DataModel`.

                               For details about variable types, check the
                               documentation: <http://dedupe.readthedocs.org>

        data_sample -- Optional sequence in which each element is a pair
                       (ordered sequence of length 2) of records. Each
                       record is a dictionary whose keys are the names of
                       the record fields and whose values are the record
                       values; this is the shape that `_checkDataSample`
                       validates on the first element. When it is omitted
                       or empty, `data_sample` is stored as an empty list
                       and no active learner is created.

        num_cores -- Number of cores handed to the active learner.
                     Defaults to `multiprocessing.cpu_count()`.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None :
            self.num_cores = multiprocessing.cpu_count()
        else :
            self.num_cores = num_cores

        self.data_sample = data_sample

        if self.data_sample :
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                         self.data_model,
                                                         self.num_cores)
        else :
            self.data_sample = []
            self.activeLearner = None

        # One row per labeled example: the label plus one distance per
        # entry of the data model.
        training_dtype = [('label', 'S8'), 
                          ('distances', 'f4', 
                           (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)

        # Build from a sequence of pairs rather than from a dict literal:
        # a dict literal only hands its keys to OrderedDict in a
        # guaranteed order on interpreters whose dicts are ordered.
        self.training_pairs = OrderedDict(((u'distinct', []), 
                                           (u'match', [])))

        self.blocker = None


    def cleanupTraining(self) : # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample


    def readTraining(self, training_file) :
        '''
        Load labeled examples from a previously written training file
        and refit the classifier on them.

        Arguments:

        training_file -- file object containing the training data

        Raises EmptyTrainingException when none of the labels in the file
        has any example. Only the first example of each label is passed
        to `_checkRecordPairType`.
        '''

        logger.info('reading training from file')

        loaded_pairs = json.load(training_file,
                                 cls=serializer.dedupe_decoder)

        has_examples = any(loaded_pairs.values())
        if not has_examples :
            raise EmptyTrainingException("The training file seems to contain no training examples")

        for label, raw_examples in loaded_pairs.items() :
            if raw_examples :
                self._checkRecordPairType(raw_examples[0])

            frozen_examples = core.freezeData(raw_examples)

            # Keep the frozen version in both places: the mapping handed
            # to _addTrainingData below and the accumulated training_pairs.
            loaded_pairs[label] = frozen_examples
            self.training_pairs[label].extend(frozen_examples)

        self._addTrainingData(loaded_pairs)
        self._trainClassifier()

    def train(self, ppc=.1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
        """
        Fit the classifier on the current training data, then learn the
        blocking rules.

        Keyword arguments:
        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a fraction
               of possible pairs greater than the ppc, that predicate will
               be removed from consideration.

               As the size of the data increases, the user will generally
               want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true duplicate pairs in our
                           training data that we accept will not be put
                           into any block. If true duplicates are never in
                           the same block, we will never compare them, and
                           may never declare them to be duplicates.

                           However, requiring that we cover every single
                           true duplicate pair may mean that we have to use
                           blocks that put together many, many distinct
                           pairs that we will then have to compare,
                           expensively, as well.

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index predicates
                            can be slower and take substantial memory.

                            Defaults to True.

        """
        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self) : # pragma : no cover
        labels = numpy.array(self.training_data['label'] == b'match', 
                             dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    
    def _trainBlocker(self, ppc=1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
        """
        Learn blocking predicates and build `self.blocker` from them.

        The labeled pairs are deep-copied first, so the extra distinct
        pairs added here never end up in `self.training_pairs`.

        ppc, uncovered_dupes -- forwarded unchanged to blockTraining
        index_predicates -- forwarded to the data model when asking for
                            candidate predicates
        """
        labeled_pairs = copy.deepcopy(self.training_pairs)

        # Pad the distinct examples with pairs drawn from the data sample
        # using the current classifier.
        likely_distinct = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)
        labeled_pairs[u'distinct'].extend(likely_distinct)

        candidate_predicates = self.data_model.predicates(index_predicates,
                                                          self.canopies)

        learned = dedupe.training.blockTraining(labeled_pairs,
                                                candidate_predicates,
                                                ppc,
                                                uncovered_dupes,
                                                self._linkage_type)
        self.predicates, self.stop_words = learned

        self.blocker = blocking.Blocker(self.predicates, self.stop_words)


    def writeSettings(self, file_obj): # pragma : no cover
        """
        Write a settings file containing the 
        data model and predicates to a file object

        Keyword arguments:
        file_obj -- file object to write settings data into
        """

        pickle.dump(self.data_model, file_obj)
        pickle.dump(self.classifier, file_obj)
        pickle.dump(self.predicates, file_obj)
        pickle.dump(dict(self.stop_words), file_obj)

    def writeTraining(self, file_obj): # pragma : no cover
        """
        Write the labeled examples in `self.training_pairs` to a file
        object as JSON.

        Keyword arguments:
        file_obj -- file object to write training data to
        """

        # NOTE(review): `tuple_as_array` is not a keyword accepted by the
        # standard-library json.dump; presumably `json` is bound to
        # simplejson in this module -- confirm against the imports.
        json.dump(self.training_pairs, 
                  file_obj, 
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)


    def uncertainPairs(self) :
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct.
        
        Useful for user labeling.

        '''
        
        
        if self.training_data.shape[0] == 0 :
            rand_int = random.randint(0, len(self.data_sample)-1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0]) 
            self._addTrainingData({u'match':[exact_match, exact_match],
                                   u'distinct':[random_pair]})


        self._trainClassifier()

        bias = len(self.training_pairs[u'match'])
        if bias :
            bias /= (bias
                     + len(self.training_pairs[u'distinct']))

        min_examples = min(len(self.training_pairs[u'match']),
                           len(self.training_pairs[u'distinct']))

        regularizer = 10 

        bias = ((0.5 * min_examples + bias * regularizer)
                /(min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs) :
        '''
        Add a labeled pairs of record to dedupes training set and update the
        matching model
        
        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of records
                         
        '''
        try :
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except :
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match'] :
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)
        
        if labeled_pairs[u'distinct'] :
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)
        
        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match'] :
            warnings.warn("Didn't return any labeled record pairs")
        

        for label, pairs in labeled_pairs.items() :
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs) 



    def _checkRecordPairType(self, record_pair) :
        try :
            record_pair[0]
        except :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2 :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try :
            record_pair[0].keys() and record_pair[1].keys()
        except :
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def  _checkDataSample(self, data_sample) :
        try :
            len(data_sample)
        except TypeError :
            raise ValueError("data_sample must be a sequence")

        if len(data_sample) :
            self._checkRecordPairType(data_sample[0])

        else :
            warnings.warn("You submitted an empty data_sample")




    def _addTrainingData(self, labeled_pairs) :
        """
        Appends training data to the training data collection.
        """
    
        for label, examples in labeled_pairs.items () :
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data, 
                                              new_data)


    def _loadSample(self, data_sample) :
        """
        Validate data_sample, store it on the instance and build a new
        active learner for it.
        """
        self._checkDataSample(data_sample)
        self.data_sample = data_sample

        learner = training.ActiveLearning(self.data_sample,
                                          self.data_model,
                                          self.num_cores)
        self.activeLearner = learner