Code example #1
File: api.py Project: jtbates/dedupe
class ActiveMatching(Matching) :
    """
    Class for training dedupe extends Matching.
    
    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
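
    Typical workflow (a sketch; file paths are illustrative, and
    `fields` and `data_sample` are as in the example under __init__):

        deduper = dedupe.Dedupe(fields, data_sample)
        deduper.readTraining('my_training.json')
        deduper.train()
        deduper.writeSettings('my_settings')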
    """

    def __init__(self, 
                 field_definition, 
                 data_sample = None,
                 num_processes = 1) :
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = {'name':    {'type': 'String'},
                      'address': {'type': 'String'},
                      'city':    {'type': 'String'},
                      'cuisine': {'type': 'String', 'Has Missing': True},
                      }

            data_sample = [
                           ({'city': 'san francisco',
                             'address': '300 de haro st.',
                             'name': "sally's cafe & bakery",
                             'cuisine': 'american'},
                            {'city': 'san francisco',
                             'address': '1328 18th st.',
                             'name': 'san francisco bbq',
                             'cuisine': 'thai'})
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        
        #### Additional detail
        A field definition is a dictionary where the keys are the fields
        that will be used for training a model and the values are the
        field specifications.

        Field types include

        - String

        A 'String' type field must have as its key the name of a field
        as it appears in the data dictionary and, as its value, a type
        declaration, e.g. `{'Phone': {'type': 'String'}}`

        Another example of a field definition:


            fields = {'name':       {'type': 'String'},
                      'address':    {'type': 'String'},
                      'city':       {'type': 'String'},
                      'cuisine':    {'type': 'String'}
                      }

        In the data_sample, each element is a tuple of two
        records. Each record is a dictionary in which the keys are the
        names of the record fields and the values are the record values.
        """
        super(ActiveMatching, self).__init__()

        if not isinstance(field_definition, dict) :
            raise ValueError('Incorrect Input Type: must supply '
                             'a field definition.')

        self.data_model = DataModel(field_definition)

        self.data_sample = data_sample

        if self.data_sample :
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                         self.data_model)
        else :
            self.activeLearner = None

        self.num_processes = num_processes


        training_dtype = [('label', 'S8'), 
                          ('distances', 'f4', 
                           (len(self.data_model['fields']), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = dedupe.backport.OrderedDict({'distinct': [],
                                                           'match': []})

    def readTraining(self, training_source) : # pragma: no cover
        '''
        Read training from a previously saved training data file
        
        Arguments:
        
        training_source -- the path of the training data file
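
        Example usage (a sketch; the file path is a placeholder):

            deduper.readTraining('my_training.json')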
        '''

        LOGGER.info('reading training from file')

        with open(training_source, 'r') as f:
            training_pairs = json.load(f, 
                                       cls=serializer.dedupe_decoder)

        for (label, examples) in training_pairs.items():
            if examples :
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=1, uncovered_dupes=1) :
        """
        Keyword arguments:
        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a fraction
               of possible pairs greater than ppc, that predicate will
               be removed from consideration.

               As the size of the data increases, the user will generally
               want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our training
                           data that we can accept will not be put into any
                           block. If true duplicates are never in the
                           same block, we will never compare them, and may
                           never declare them to be duplicates.

                           However, requiring that we cover every single
                           true dupe pair may mean that we have to use
                           blocks that put together many, many distinct pairs
                           that we'll have to expensively compare as well.
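
        Example usage (a sketch; parameter values are illustrative):

            deduper.train(ppc=0.001, uncovered_dupes=5)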
        """
        n_folds = min(numpy.sum(self.training_data['label'] == 'match') // 3,
                      20)
        n_folds = max(n_folds, 2)

        LOGGER.info('%d folds', n_folds)

        alpha = crossvalidation.gridSearch(self.training_data,
                                           core.trainModel, 
                                           self.data_model, 
                                           k=n_folds)


        self._trainClassifier(alpha)
        self._trainBlocker(ppc, uncovered_dupes)


    def _trainClassifier(self, alpha=.1) : # pragma: no cover

        self.data_model = core.trainModel(self.training_data,
                                          self.data_model, 
                                          alpha)

        self._logLearnedWeights()

    
    def _trainBlocker(self, ppc=1, uncovered_dupes=1) :
        training_pairs = copy.deepcopy(self.training_pairs)

        blocker_types = self._blockerTypes()

        confident_nonduplicates = training.semiSupervisedNonDuplicates(self.data_sample,
                                                                       self.data_model,
                                                                       sample_size=32000)

        training_pairs['distinct'].extend(confident_nonduplicates)

        predicate_set = blocking.predicateGenerator(blocker_types, 
                                                    self.data_model)

        (self.predicates, 
         self.stop_words) = dedupe.blocking.blockTraining(training_pairs,
                                                          predicate_set,
                                                          ppc,
                                                          uncovered_dupes,
                                                          self._linkage_type)

        self.blocker = self._Blocker(self.predicates,
                                     self.stop_words) 


    def _blockerTypes(self) : # pragma: no cover
        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        tfidf_string_predicates = tuple([tfidf.TfidfPredicate(threshold)
                                         for threshold
                                         in [0.2, 0.4, 0.6, 0.8]])

        return {'String' : (string_predicates
                            + tfidf_string_predicates)}

    def writeSettings(self, file_name): # pragma: no cover
        """
        Write a settings file that contains the 
        data model and predicates

        Keyword arguments:
        file_name -- path to file
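
        Example usage (a sketch; the file name is a placeholder):

            deduper.writeSettings('my_learned_settings')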
        """

        with open(file_name, 'wb') as f:
            pickle.dump(self.data_model, f)
            pickle.dump(self.predicates, f)
            pickle.dump(self.stop_words, f)

    def writeTraining(self, file_name): # pragma: no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:
        file_name -- path to a json file
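
        Example usage (a sketch; the path is a placeholder):

            deduper.writeTraining('my_training.json')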
        """

        with open(file_name, 'w') as f:
            json.dump(self.training_pairs, 
                      f, 
                      default=serializer._to_json)


    def uncertainPairs(self) :
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn whether they are matches or distinct.
        
        Useful for user labeling.
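
        Example usage (a minimal sketch of a labeling loop; `is_match`
        stands in for whatever labeling interface the caller provides):

            pairs = deduper.uncertainPairs()
            labeled = {'match': [], 'distinct': []}
            for pair in pairs:
                if is_match(pair):
                    labeled['match'].append(pair)
                else:
                    labeled['distinct'].append(pair)
            deduper.markPairs(labeled)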
        '''
        
        
        if self.training_data.shape[0] == 0 :
            # no labeled examples yet: seed the training data with a
            # trivial exact-match pair drawn from the sample so the
            # classifier can be initialized
            random_pair = random.choice(self.data_sample)
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({'match':[exact_match, exact_match],
                                   'distinct':[]})

            self._trainClassifier(alpha=0.1)
        
        dupe_ratio = (len(self.training_pairs['match'])
                      /(len(self.training_pairs['distinct']) + 1.0))

        return self.activeLearner.uncertainPairs(self.data_model, dupe_ratio)

    def markPairs(self, labeled_pairs) :
        '''
        Add labeled pairs of records to dedupe's training set and update
        the matching model.

        Arguments:

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`,
                         whose values are lists of pairs of records.
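
        Example usage (a sketch; the record values are illustrative):

            labeled_pairs = {'match': [({'name': "sally's cafe & bakery"},
                                        {'name': "sally's cafe"})],
                             'distinct': [({'name': "sally's cafe"},
                                           {'name': 'san francisco bbq'})]}
            deduper.markPairs(labeled_pairs)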
        '''
        try :
            labeled_pairs.items()
            labeled_pairs['match']
            labeled_pairs['distinct']
        except (AttributeError, KeyError, TypeError) :
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs['match'] :
            pair = labeled_pairs['match'][0]
            self._checkRecordPairType(pair)
        
        if labeled_pairs['distinct'] :
            pair = labeled_pairs['distinct'][0]
            self._checkRecordPairType(pair)
        
        if not labeled_pairs['distinct'] and not labeled_pairs['match'] :
            warnings.warn("Didn't return any labeled record pairs")
        

        for label, pairs in labeled_pairs.items() :
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs) 

        self._trainClassifier(alpha=.1)



    def _checkRecordPairType(self, record_pair) :
        try :
            record_pair[0]
        except (TypeError, KeyError, IndexError) :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2 :
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try :
            record_pair[0].keys() and record_pair[1].keys()
        except AttributeError :
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries")

        self._checkRecordType(record_pair[0])
        self._checkRecordType(record_pair[1])

    def _checkDataSample(self, data_sample) :
        try :
            len(data_sample)
        except TypeError :
            raise ValueError("data_sample must be a sequence")

        if len(data_sample) :
            self._checkRecordPairType(data_sample[0])
        else :
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs) :
        """
        Appends training data to the training data collection.
        """
    
        for label, examples in labeled_pairs.items() :
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = core.fieldDistances(examples, 
                                                        self.data_model)

            self.training_data = numpy.append(self.training_data, 
                                              new_data)


    def _logLearnedWeights(self): # pragma: no cover
        """
        Log learned weights and bias terms
        """
        LOGGER.info('Learned Weights')
        for (key_1, value_1) in self.data_model.items():
            try:
                for (key_2, value_2) in value_1.items():
                    LOGGER.info((key_2, value_2['weight']))
            except AttributeError:
                LOGGER.info((key_1, value_1))

    def _loadSample(self, *args, **kwargs) : # pragma: no cover

        data_sample = self._sample(*args, **kwargs)

        self._checkDataSample(data_sample) 

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample, 
                                                     self.data_model)