Example #1
0
    def sample(self, data, sample_size=15000, blocked_proportion=0.5):
        '''Assemble the record-pair sample that seeds active learning.

        The sample combines pairs produced by the blocked sampler
        (pairs of similar records) with randomly drawn pairs. Nothing
        is returned: the frozen pairs are passed to self._loadSample
        and self.sampled_records is set as a side effect.

        Arguments:
        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               keyed by field name
        sample_size         -- Target size of the sample to draw
        blocked_proportion  -- Proportion of sample_size requested from
                               the blocked sampler
        '''
        data = core.index(data)
        # NOTE(review): the meaning of the literal 900 is not visible
        # here -- confirm against Sample before changing it.
        self.sampled_records = Sample(data, 900)

        n_blocked = int(blocked_proportion * sample_size)
        candidate_predicates = list(
            self.data_model.predicates(index_predicates=False))

        # `data` is deliberately rebound at each step so the previous
        # container can be released as soon as it is no longer needed.
        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, candidate_predicates, data)

        # Top up with random pairs, based on how many blocked pairs
        # were actually returned rather than how many were requested.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))
        data = dict(data)

        record_pairs = []
        for first_key, second_key in blocked_keys | random_keys:
            record_pairs.append((data[first_key], data[second_key]))

        self._loadSample(core.freezeData(record_pairs))
Example #2
0
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Draw a sample of record pairs from the dataset
        (a mix of random pairs & pairs of similar records)
        and initialize active learning with this sample

        Nothing is returned; the sampled pairs are handed to
        self._loadSample.

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
        field names

        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        # Re-key the frozen records with consecutive integers starting
        # at 0. The sampled key pairs below are looked up in this dict;
        # presumably randomPairs yields indices below len(data) --
        # confirm against dedupe.core.
        #
        # enumerate() and dict.values() exist on both Python 2 and 3,
        # unlike itertools.imap/izip and dict.itervalues, which were
        # removed in Python 3.
        frozen_values = (dedupe.frozendict(record)
                         for record in data.values())
        data = dict(enumerate(frozen_values))

        blocked_sample_size = int(blocked_proportion * sample_size)
        # Only predicates whose type is 'SimplePredicate' are used for
        # the blocked sample.
        predicates = [pred for pred in predicateGenerator(self.data_model)
                      if pred.type == 'SimplePredicate']
        blocked_sample_keys = sampling.dedupeBlockedSample(blocked_sample_size,
                                                           predicates,
                                                           data)

        # Fill the remainder with random pairs, based on the number of
        # blocked pairs actually returned.
        random_sample_size = sample_size - len(blocked_sample_keys)
        random_sample_keys = set(dedupe.core.randomPairs(len(data),
                                                         random_sample_size))

        data_sample = [(data[k1], data[k2])
                       for k1, k2
                       in blocked_sample_keys | random_sample_keys]

        self._loadSample(data_sample)
Example #3
0
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Draw the record pairs used to initialize active learning.

        Pairs of similar records (from the blocked sampler) are
        combined with random pairs, and the resulting list is passed
        to self._loadSample. self.sampled_records is also set. The
        method itself returns nothing.

        Arguments:
        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               keyed by field name
        sample_size         -- Target size of the sample to draw
        blocked_proportion  -- Proportion of sample_size requested from
                               the blocked sampler
        '''
        data = core.index(data)
        # NOTE(review): what the literal 900 controls is not visible
        # in this method -- confirm against Sample.
        self.sampled_records = Sample(data, 900)

        blocked_target = int(blocked_proportion * sample_size)
        usable_predicates = list(
            self.data_model.predicates(index_predicates=False))

        # Rebinding `data` step by step lets each earlier container be
        # released once the next one has been built.
        data = sampling.randomDeque(data)
        keys_from_blocking = sampling.dedupeBlockedSample(
            blocked_target, usable_predicates, data)

        # The random share is whatever the blocked sampler left unfilled.
        random_target = sample_size - len(keys_from_blocking)
        keys_from_random = set(core.randomPairs(len(data), random_target))
        data = dict(data)

        pairs = []
        for key_a, key_b in keys_from_blocking | keys_from_random:
            pairs.append((data[key_a], data[key_b]))

        self._loadSample(pairs)
Example #4
0
    def sample(self, data, blocked_proportion, sample_size):
        '''Return a list of record pairs mixing blocked and random pairs.

        Arguments:
        data                -- Records to sample from; presumably a
                               mapping of record ids to records (confirm
                               against sampling.randomDeque)
        blocked_proportion  -- Proportion of sample_size requested from
                               the blocked sampler
        sample_size         -- Target size of the sample to draw

        Returns a list of (record, record) tuples.
        '''
        n_blocked = int(blocked_proportion * sample_size)
        candidate_predicates = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, candidate_predicates, data)

        # Random pairs make up the difference between the target size
        # and the number of blocked pairs actually returned.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))
        data = dict(data)

        record_pairs = []
        for first_key, second_key in blocked_keys | random_keys:
            record_pairs.append((data[first_key], data[second_key]))
        return record_pairs
Example #5
0
    def sample(self, data, blocked_proportion, sample_size):
        '''Draw record pairs: part from blocking, the rest at random.

        Arguments:
        data                -- Records to sample from; presumably a
                               mapping of record ids to records (confirm
                               against sampling.randomDeque)
        blocked_proportion  -- Proportion of sample_size requested from
                               the blocked sampler
        sample_size         -- Target size of the sample to draw

        Returns a list of (record, record) tuples.
        '''
        blocked_target = int(blocked_proportion * sample_size)
        usable_predicates = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        keys_from_blocking = sampling.dedupeBlockedSample(
            blocked_target, usable_predicates, data)

        # Whatever the blocked sampler did not supply is drawn randomly.
        random_target = sample_size - len(keys_from_blocking)
        keys_from_random = set(core.randomPairs(len(data), random_target))
        data = dict(data)

        pairs = []
        for key_a, key_b in keys_from_blocking | keys_from_random:
            pairs.append((data[key_a], data[key_b]))
        return pairs
Example #6
0
File: api.py Project: lminer/dedupe
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Build the record-pair sample that initializes active learning.

        The sample is a mix of pairs of similar records (from the
        blocked sampler) and random pairs. The frozen pairs are passed
        to self._loadSample; nothing is returned.

        Arguments:
        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               keyed by field name
        sample_size         -- Target size of the sample to draw
        blocked_proportion  -- Proportion of sample_size requested from
                               the blocked sampler
        '''
        data = core.index(data)

        n_blocked = int(blocked_proportion * sample_size)
        candidate_predicates = list(
            predicateGenerator(self.data_model,
                               index_predicates=False,
                               canopies=self.canopies))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, candidate_predicates, data)

        # Random pairs cover the gap left by the blocked sampler.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(dedupe.core.randomPairs(len(data), n_random))
        data = dict(data)

        record_pairs = []
        for first_key, second_key in blocked_keys | random_keys:
            record_pairs.append((data[first_key], data[second_key]))

        record_pairs = core.freezeData(record_pairs)

        # The lookup dict can be very large; drop it before loading
        # the sample so it can be reclaimed as early as possible.
        del data

        self._loadSample(record_pairs)