def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Assemble the record-pair sample used to start active learning.

    The sample is the union of pairs produced by the blocked sampler and
    pairs produced by the random-pair sampler; the resulting record pairs
    are frozen and passed to ``self._loadSample``.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field names

    sample_size -- Target number of record pairs in the sample

    blocked_proportion -- Fraction of ``sample_size`` requested from the
                          blocked sampler
    """
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    n_blocked = int(blocked_proportion * sample_size)
    candidate_predicates = list(
        self.data_model.predicates(index_predicates=False))

    # ``data`` is rebound at every conversion so this method only ever
    # holds a reference to the most recent representation.
    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                candidate_predicates,
                                                data)

    # Whatever the blocked sampler did not deliver is requested as
    # random pairs instead.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(data), n_random))

    data = dict(data)
    pair_keys = blocked_keys | random_keys
    record_pairs = [(data[left], data[right]) for left, right in pair_keys]

    self._loadSample(core.freezeData(record_pairs))
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    # Re-key the records with consecutive integers (0, 1, 2, ...) in the
    # dictionary's iteration order and freeze each record.
    # ``enumerate`` over a generator builds the same mapping as the
    # previous ``itertools.izip(itertools.count(), itertools.imap(...))``
    # over ``data.itervalues()``, but those three APIs exist only on
    # Python 2; this form runs on both Python 2 and Python 3.
    data = dict(enumerate(dedupe.frozendict(record)
                          for record in data.values()))

    blocked_sample_size = int(blocked_proportion * sample_size)

    # Only predicates whose ``type`` is 'SimplePredicate' are handed to
    # the blocked sampler.
    predicates = [pred for pred in predicateGenerator(self.data_model)
                  if pred.type == 'SimplePredicate']

    blocked_sample_keys = sampling.dedupeBlockedSample(blocked_sample_size,
                                                       predicates,
                                                       data)

    # Fill the remainder of the requested sample with random pairs.
    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = set(dedupe.core.randomPairs(len(data),
                                                     random_sample_size))

    data_sample = [(data[k1], data[k2])
                   for k1, k2
                   in blocked_sample_keys | random_sample_keys]

    self._loadSample(data_sample)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Pick record pairs for active learning and load them.

    Pairs come from two sources that are unioned together: the blocked
    sampler and the random-pair sampler. The resulting list of record
    pairs is passed to ``self._loadSample``.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field names

    sample_size -- Target number of record pairs in the sample

    blocked_proportion -- Fraction of ``sample_size`` requested from the
                          blocked sampler
    """
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        self.data_model.predicates(index_predicates=False))

    # Each conversion rebinds ``data`` so only the latest representation
    # stays referenced from this method.
    data = sampling.randomDeque(data)
    keys_from_blocking = sampling.dedupeBlockedSample(blocked_target,
                                                      usable_predicates,
                                                      data)

    # The random sampler is asked for the shortfall left by blocking.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(core.randomPairs(len(data), random_target))

    data = dict(data)
    chosen_keys = keys_from_blocking | keys_from_random
    record_pairs = [(data[first], data[second])
                    for first, second in chosen_keys]

    self._loadSample(record_pairs)
def sample(self, data, blocked_proportion, sample_size):
    """Return a list of record pairs drawn from ``data``.

    The pair keys are the union of those produced by
    ``sampling.dedupeBlockedSample`` and those produced by
    ``core.randomPairs``; each key pair is looked up in ``data`` to
    build the returned ``(record, record)`` tuples.

    Arguments:

    data -- Collection of records accepted by ``sampling.randomDeque``
            (presumably key/record items; confirm against the caller)

    blocked_proportion -- Fraction of ``sample_size`` requested from the
                          blocked sampler

    sample_size -- Target number of record pairs in the sample
    """
    n_blocked = int(blocked_proportion * sample_size)
    candidate_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                candidate_predicates,
                                                data)

    # Whatever the blocked sampler did not deliver is requested as
    # random pairs instead.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(data), n_random))

    # A dict is needed for the key lookups below.
    data = dict(data)
    pair_keys = blocked_keys | random_keys

    return [(data[left], data[right]) for left, right in pair_keys]
def sample(self, data, blocked_proportion, sample_size):
    """Draw record pairs from ``data`` and return them as a list.

    Keys chosen by the blocked sampler are unioned with keys chosen by
    the random-pair sampler, and every resulting key pair is resolved to
    a ``(record, record)`` tuple.

    Arguments:

    data -- Collection of records accepted by ``sampling.randomDeque``
            (presumably key/record items; confirm against the caller)

    blocked_proportion -- Fraction of ``sample_size`` requested from the
                          blocked sampler

    sample_size -- Target number of record pairs in the sample
    """
    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)
    keys_from_blocking = sampling.dedupeBlockedSample(
        blocked_target, usable_predicates, data)

    # The random sampler is asked for the shortfall left by blocking.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(core.randomPairs(len(data), random_target))

    # Switch to a dict so pair keys can be resolved to records.
    data = dict(data)
    chosen_keys = keys_from_blocking | keys_from_random
    record_pairs = [(data[first], data[second])
                    for first, second in chosen_keys]
    return record_pairs
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Build the record-pair sample that seeds active learning.

    Pair keys from the blocked sampler and from the random-pair sampler
    are unioned, resolved to record pairs, frozen, and passed to
    ``self._loadSample``.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field names

    sample_size -- Target number of record pairs in the sample

    blocked_proportion -- Fraction of ``sample_size`` requested from the
                          blocked sampler
    """
    data = core.index(data)

    n_blocked = int(blocked_proportion * sample_size)
    candidate_predicates = list(
        predicateGenerator(self.data_model,
                           index_predicates=False,
                           canopies=self.canopies))

    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                candidate_predicates,
                                                data)

    # Whatever the blocked sampler did not deliver is requested as
    # random pairs instead.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(dedupe.core.randomPairs(len(data), n_random))

    data = dict(data)
    pair_keys = blocked_keys | random_keys
    frozen_pairs = core.freezeData(
        [(data[left], data[right]) for left, right in pair_keys])

    # ``data`` can be very large, so drop this method's reference to it
    # before the sample is loaded.
    del data

    self._loadSample(frozen_pairs)