Exemple #1
0
    def sample(self, data_1, data_2, sample_size=150000,
               blocked_proportion=.5):
        '''
        Draw a sample of cross-dataset record pairs (part blocked, part
        random) and initialize active learning with it.

        Arguments:

        data_1             -- Dictionary of records from the first dataset,
                              where the keys are record_ids and the values
                              are dictionaries keyed by field name
        data_2             -- Dictionary of records from the second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from the
                              blocked sampler; whatever the blocked sampler
                              does not supply is requested as random pairs

        Raises ValueError when either dataset is empty.
        '''
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        # Treat the smaller dataset as the first one.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        indexed_1 = core.index(data_1)
        self.sampled_records_1 = Sample(indexed_1, 500)

        # The second dataset is indexed with an offset equal to the size of
        # the first indexed dataset.
        offset = len(indexed_1)
        indexed_2 = core.index(data_2, offset)
        self.sampled_records_2 = Sample(indexed_2, 500)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        deque_1 = sampling.randomDeque(indexed_1)
        deque_2 = sampling.randomDeque(indexed_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicates, deque_1, deque_2)

        # Top up with random pairs to reach the requested sample size.
        n_random = sample_size - len(blocked_keys)
        raw_random_pairs = core.randomPairsMatch(
            len(deque_1), len(deque_2), n_random)

        # Shift the second member of each random pair by the offset used
        # when indexing the second dataset.
        random_keys = set(
            (first, second + offset) for first, second in raw_random_pairs)

        pair_keys = blocked_keys | random_keys
        record_pairs = ((indexed_1[key_1], indexed_2[key_2])
                        for key_1, key_2 in pair_keys)

        self._loadSample(core.freezeData(record_pairs))
Exemple #2
0
    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs of records to the training set and update the
        matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of records

        Raises ValueError when labeled_pairs does not behave like a
        dictionary or lacks either of the two keys.  Emits a warning when
        both lists are empty.
        '''
        try:
            labeled_pairs.items()
            matches = labeled_pairs[u'match']
            distinct = labeled_pairs[u'distinct']
        except (AttributeError, LookupError, TypeError):
            # Only failures that indicate a malformed argument are turned
            # into ValueError; interrupts and exit requests propagate.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        # Only the first pair of each non-empty list is type-checked.
        if matches:
            self._checkRecordPairType(matches[0])

        if distinct:
            self._checkRecordPairType(distinct[0])

        if not distinct and not matches:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)
Exemple #3
0
    def sample(self, data, sample_size=15000, blocked_proportion=0.5):
        '''Build the active-learning sample of record pairs from one dataset
        (blocked pairs plus random pairs) and load it.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        indexed = core.index(data)
        self.sampled_records = Sample(indexed, 900)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        shuffled = sampling.randomDeque(indexed)
        blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                    predicates,
                                                    shuffled)

        # Whatever the blocked sampler did not supply is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(shuffled), n_random))

        # Back to a mapping so that records can be looked up by key.
        records = dict(shuffled)

        pair_keys = blocked_keys | random_keys
        record_pairs = [(records[left], records[right])
                        for left, right in pair_keys]

        self._loadSample(core.freezeData(record_pairs))
Exemple #4
0
    def readTraining(self, training_file):
        '''
        Load previously built training data from a file object, add it to
        the training set and train the classifier

        Arguments:

        training_file -- file object containing the training data

        Raises EmptyTrainingException when every label in the file has no
        examples.
        '''

        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if all(not examples for examples in training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for label, raw_examples in training_pairs.items():
            # Only the first example of each label is type-checked.
            if raw_examples:
                self._checkRecordPairType(raw_examples[0])

            frozen = core.freezeData(raw_examples)

            training_pairs[label] = frozen
            self.training_pairs[label].extend(frozen)

        self._addTrainingData(training_pairs)

        self._trainClassifier()
Exemple #5
0
    def readTraining(self, training_source) : # pragma : no cover
        '''
        Load previously saved training data from a file on disk, add it to
        the training set and train the classifier

        Arguments:

        training_source -- the path of the training data file
        '''

        LOGGER.info('reading training from file')

        with open(training_source, 'r') as training_file:
            training_pairs = json.load(training_file,
                                       cls=serializer.dedupe_decoder)

        for label, raw_examples in training_pairs.items():
            # Only the first example of each label is type-checked.
            if raw_examples:
                self._checkRecordPairType(raw_examples[0])

            frozen = core.freezeData(raw_examples)

            training_pairs[label] = frozen
            self.training_pairs[label].extend(frozen)

        self._addTrainingData(training_pairs)

        self._trainClassifier()
Exemple #6
0
    def sample(self,
               data_1,
               data_2,
               sample_size=150000,
               blocked_proportion=.5):
        '''
        Draw a sample of cross-dataset record pairs (part blocked, part
        random) and initialize active learning with it.

        Arguments:

        data_1             -- Dictionary of records from the first dataset,
                              where the keys are record_ids and the values
                              are dictionaries keyed by field name
        data_2             -- Dictionary of records from the second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from the
                              blocked sampler; whatever the blocked sampler
                              does not supply is requested as random pairs
        '''
        # Treat the smaller dataset as the first one.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        indexed_1 = core.index(data_1)

        # The second dataset is indexed with an offset equal to the size of
        # the first indexed dataset.
        offset = len(indexed_1)
        indexed_2 = core.index(data_2, offset)

        n_blocked = int(blocked_proportion * sample_size)

        # Only predicates whose type is 'SimplePredicate' are used here.
        simple_predicates = [
            predicate for predicate in predicateGenerator(self.data_model)
            if predicate.type == 'SimplePredicate'
        ]

        deque_1 = sampling.randomDeque(indexed_1)
        deque_2 = sampling.randomDeque(indexed_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, simple_predicates, deque_1, deque_2)

        n_random = sample_size - len(blocked_keys)
        raw_random_pairs = dedupe.core.randomPairsMatch(
            len(deque_1), len(deque_2), n_random)

        # Shift the second member of each random pair by the offset used
        # when indexing the second dataset.
        random_keys = {(first, second + offset)
                       for first, second in raw_random_pairs}

        # Back to mappings so that records can be looked up by key.
        lookup_1 = dict(deque_1)
        lookup_2 = dict(deque_2)

        pair_keys = blocked_keys | random_keys
        record_pairs = ((lookup_1[key_1], lookup_2[key_2])
                        for key_1, key_2 in pair_keys)

        self._loadSample(core.freezeData(record_pairs))
Exemple #7
0
    def sample(self, data_1, data_2, sample_size=150000, 
               blocked_proportion=.5) :
        '''
        Sample pairs made of one record from each dataset, mixing blocked
        pairs with random pairs, and start active learning from them.

        Arguments:

        data_1             -- Dictionary of records from the first dataset;
                              keys are record_ids, values are dictionaries
                              keyed by field name
        data_2             -- Dictionary of records from the second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from the
                              blocked sampler; the shortfall is requested
                              as random pairs
        '''
        # The smaller dataset always plays the role of the first one.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        first_indexed = core.index(data_1)

        # Index the second dataset starting after the first one's entries.
        offset = len(first_indexed)
        second_indexed = core.index(data_2, offset)

        blocked_target = int(blocked_proportion * sample_size)

        # Keep only predicates whose type is 'SimplePredicate'.
        predicates = [candidate
                      for candidate in predicateGenerator(self.data_model)
                      if candidate.type == 'SimplePredicate']

        first_deque = sampling.randomDeque(first_indexed)
        second_deque = sampling.randomDeque(second_indexed)

        blocked_keys = sampling.linkBlockedSample(blocked_target,
                                                  predicates,
                                                  first_deque,
                                                  second_deque)

        random_target = sample_size - len(blocked_keys)
        unshifted_pairs = dedupe.core.randomPairsMatch(len(first_deque),
                                                       len(second_deque),
                                                       random_target)

        # Apply the second dataset's index offset to each random pair.
        random_keys = {(i, j + offset) for i, j in unshifted_pairs}

        first_records = dict(first_deque)
        second_records = dict(second_deque)

        chosen_keys = blocked_keys | random_keys
        data_sample = ((first_records[i], second_records[j])
                       for i, j in chosen_keys)

        data_sample = core.freezeData(data_sample)

        self._loadSample(data_sample)
Exemple #8
0
    def _importTraining(self, training_file) :
        '''
        Load training data from a file object, add it to the training set
        and train the classifier

        Arguments:

        training_file -- file object containing the training data
        '''
        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        for label, raw_examples in training_pairs.items():
            # Only the first example of each label is type-checked.
            if raw_examples:
                self._checkRecordPairType(raw_examples[0])

            frozen = core.freezeData(raw_examples)

            training_pairs[label] = frozen
            self.training_pairs[label].extend(frozen)

        self._addTrainingData(training_pairs)

        self._trainClassifier()
Exemple #9
0
    def sample(self, data, sample_size=15000, 
               blocked_proportion=0.5) :
        '''Build the active-learning sample of record pairs from one dataset
        (blocked pairs plus random pairs) and load it.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(predicateGenerator(self.data_model,
                                             index_predicates=False,
                                             canopies=self.canopies))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicates, data)

        # Whatever the blocked sampler did not supply is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(dedupe.core.randomPairs(len(data), n_random))

        # Back to a mapping so that records can be looked up by key.
        data = dict(data)

        pair_keys = blocked_keys | random_keys
        record_pairs = [(data[left], data[right])
                        for left, right in pair_keys]

        frozen_pairs = core.freezeData(record_pairs)

        # data can be a very large object, so the local reference is
        # dropped before the sample is loaded
        del data

        self._loadSample(frozen_pairs)
Exemple #10
0
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Sample record pairs from a single dataset, mixing pairs of
        similar records with random pairs, and start active learning
        from them.

        Arguments:

        data                -- Dictionary of records; keys are record_ids
                               and values are dictionaries keyed by field
                               name
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        indexed_records = core.index(data)
        self.sampled_records = Sample(indexed_records, 900)

        blocked_target = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        record_deque = sampling.randomDeque(indexed_records)
        blocked_keys = sampling.dedupeBlockedSample(
            blocked_target, predicates, record_deque)

        # The random sampler is asked for the remainder of the sample.
        random_target = sample_size - len(blocked_keys)
        random_keys = set(
            core.randomPairs(len(record_deque), random_target))

        record_lookup = dict(record_deque)

        chosen_keys = blocked_keys | random_keys
        data_sample = [(record_lookup[i], record_lookup[j])
                       for i, j in chosen_keys]

        data_sample = core.freezeData(data_sample)

        self._loadSample(data_sample)