def trainingDataDedupe(data, common_key, training_size=50000):  # pragma: no cover
    '''
    Construct training data for consumption by the ActiveLearning
    markPairs method from an already deduplicated dataset.

    Arguments:

    data          -- Dictionary of records, where the keys are record_ids
                     and the values are dictionaries with the keys being
                     field names

    common_key    -- The name of the record field that uniquely identifies
                     a match

    training_size -- the rough limit of the number of training examples,
                     defaults to 50000

    Warning:

    Every match must be identified by the sharing of a common key.
    This function assumes that if two records do not share a common key
    then they are distinct records.
    '''
    # Group record_ids by their common_key value.
    identified_records = collections.defaultdict(list)
    matched_pairs = set()

    for record_id, record in data.items():
        identified_records[record[common_key]].append(record_id)

    # All combinations of matched pairs within each common_key group,
    # stored as sorted tuples so membership tests are order-independent.
    for record_ids in identified_records.values():
        if len(record_ids) > 1:
            matched_pairs.update(
                itertools.combinations(sorted(record_ids), 2))

    # Calculate indices using dedupe.core.randomPairs to avoid the
    # memory cost of enumerating all possible pairs.
    #
    # The id list is sorted so that a sampled (i, j) index pair yields a
    # sorted key tuple, matching the tuple ordering used in matched_pairs
    # above; with an unordered list the set subtraction below could miss
    # known matches.  (Assumes randomPairs yields index pairs with
    # i < j — TODO confirm against dedupe.core.randomPairs.)
    unique_record_ids = sorted(data)
    pair_indices = randomPairs(len(unique_record_ids), training_size)

    distinct_pairs = {(unique_record_ids[i], unique_record_ids[j])
                      for i, j in pair_indices}
    distinct_pairs -= matched_pairs

    matched_records = [(data[key_1], data[key_2])
                       for key_1, key_2 in matched_pairs]
    distinct_records = [(data[key_1], data[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments: data -- Dictionary of records, where the keys are
    record_ids and the values are dictionaries with the keys being
    field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    data = core.index(data)

    self.sampled_records = Sample(data, 900)

    # Split the requested sample between blocked and purely random pairs.
    n_blocked = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    record_deque = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                predicates,
                                                record_deque)

    # Top up with random pairs to reach the requested sample size.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(record_deque), n_random))

    lookup = dict(record_deque)
    pairs = [(lookup[k1], lookup[k2])
             for k1, k2 in blocked_keys | random_keys]

    self._loadSample(core.freezeData(pairs))
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments: data -- Dictionary of records, where the keys are
    record_ids and the values are dictionaries with the keys being
    field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    # How many pairs should come from blocking vs. random sampling.
    blocked_target = int(blocked_proportion * sample_size)

    predicates = list(self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    blocked_sample = sampling.dedupeBlockedSample(blocked_target,
                                                  predicates,
                                                  data)
    # Whatever blocking did not supply is filled with random pairs.
    random_sample = set(core.randomPairs(len(data),
                                         sample_size - len(blocked_sample)))

    data = dict(data)

    sample_keys = blocked_sample | random_sample
    data_sample = [(data[k1], data[k2]) for k1, k2 in sample_keys]

    self._loadSample(data_sample)
def sample(self, data, blocked_proportion, sample_size):
    # Draw a mixed sample of record pairs: a blocked portion (pairs of
    # similar records found via predicates) plus random pairs to fill
    # out the requested sample_size.  Returns a list of record-dict pairs.
    n_blocked = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    record_deque = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                predicates,
                                                record_deque)

    # Random pairs make up whatever the blocked sample did not cover.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(record_deque), n_random))

    lookup = dict(record_deque)
    return [(lookup[k1], lookup[k2])
            for k1, k2 in blocked_keys | random_keys]
def sample(self, data, blocked_proportion, sample_size):
    # Build a training sample of record pairs: part blocked (similar
    # records grouped by predicates), part random, returned as a list
    # of (record, record) tuples.
    blocked_target = int(blocked_proportion * sample_size)

    predicates = list(self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    blocked_sample = sampling.dedupeBlockedSample(blocked_target,
                                                  predicates,
                                                  data)

    # Fill the remainder of the sample with random pairs.
    remainder = sample_size - len(blocked_sample)
    random_sample = set(core.randomPairs(len(data), remainder))

    data = dict(data)

    pairs = []
    for k1, k2 in blocked_sample | random_sample:
        pairs.append((data[k1], data[k2]))
    return pairs