def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from the first
    and second datasets, and initializes active learning with this
    sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same form
              as data_1

    sample_size -- Size of the sample to draw
    '''
    # Refuse to run on empty inputs -- sampling from nothing is
    # meaningless and would fail obscurely further down.
    if not data_1:
        raise ValueError(
            'Dictionary of records from first dataset is empty.')
    if not data_2:
        raise ValueError(
            'Dictionary of records from second dataset is empty.')

    # Make data_1 the smaller of the two datasets.
    if len(data_1) > len(data_2):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    self.sampled_records_1 = Sample(data_1, 500)

    # Shift the second dataset's ids past the first's so the two
    # id spaces never collide.
    offset = len(data_1)
    data_2 = core.index(data_2, offset)
    self.sampled_records_2 = Sample(data_2, 500)

    predicates = list(self.data_model.predicates(index_predicates=False))
    blocked_sample_size = int(blocked_proportion * sample_size)

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                     predicates,
                                                     shuffled_1,
                                                     shuffled_2)

    # Fill the rest of the sample with purely random pairs,
    # re-applying the id offset to the second member of each pair.
    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = core.randomPairsMatch(len(shuffled_1),
                                               len(shuffled_2),
                                               random_sample_size)
    random_sample_keys = set((a, b + offset)
                             for a, b in random_sample_keys)

    all_keys = blocked_sample_keys | random_sample_keys
    data_sample = ((data_1[k1], data_2[k2]) for k1, k2 in all_keys)

    self._loadSample(core.freezeData(data_sample))
def markPairs(self, labeled_pairs):
    '''
    Add a labeled pairs of record to dedupes training set and update the
    matching model

    Argument :

    labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                     the values are lists that can contain pairs of records
    '''
    # Validate the argument's shape up front.  Catch only the
    # exceptions a malformed argument can raise -- the previous bare
    # `except:` also swallowed KeyboardInterrupt and SystemExit.
    try:
        labeled_pairs.items()
        labeled_pairs[u'match']
        labeled_pairs[u'distinct']
    except (AttributeError, KeyError, TypeError):
        raise ValueError('labeled_pairs must be a dictionary with keys '
                         '"distinct" and "match"')

    # Spot-check the first pair of each label for the expected
    # record-pair structure.
    if labeled_pairs[u'match']:
        pair = labeled_pairs[u'match'][0]
        self._checkRecordPairType(pair)

    if labeled_pairs[u'distinct']:
        pair = labeled_pairs[u'distinct'][0]
        self._checkRecordPairType(pair)

    if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
        warnings.warn("Didn't return any labeled record pairs")

    # Freeze the records so they are hashable before storing them.
    for label, pairs in labeled_pairs.items():
        self.training_pairs[label].extend(core.freezeData(pairs))

    self._addTrainingData(labeled_pairs)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset (a mix of
    random pairs & pairs of similar records) and initialize active
    learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    # Split the sample between predicate-blocked pairs and random pairs.
    blocked_sample_size = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)
    blocked_sample_keys = sampling.dedupeBlockedSample(blocked_sample_size,
                                                       predicates,
                                                       data)

    # Top up with purely random pairs.
    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = set(core.randomPairs(len(data),
                                              random_sample_size))

    data = dict(data)

    sample_pairs = [(data[k1], data[k2])
                    for k1, k2
                    in blocked_sample_keys | random_sample_keys]

    self._loadSample(core.freezeData(sample_pairs))
def readTraining(self, training_file):
    '''
    Read training from previously built training data file object

    Arguments:

    training_file -- file object containing the training data
    '''
    logger.info('reading training from file')
    training_pairs = json.load(training_file,
                               cls=serializer.dedupe_decoder)

    # A file with only empty label lists is almost certainly a mistake.
    if not any(training_pairs.values()):
        raise EmptyTrainingException(
            "The training file seems to contain no training examples")

    for label, examples in training_pairs.items():
        if examples:
            self._checkRecordPairType(examples[0])
        frozen = core.freezeData(examples)
        training_pairs[label] = frozen
        self.training_pairs[label].extend(frozen)

    self._addTrainingData(training_pairs)
    self._trainClassifier()
def readTraining(self, training_source):  # pragma: no cover
    '''
    Read training from previously saved training data file

    Arguments:

    training_source -- the path of the training data file
    '''
    LOGGER.info('reading training from file')
    with open(training_source, 'r') as f:
        training_pairs = json.load(f, cls=serializer.dedupe_decoder)

    for label, examples in training_pairs.items():
        if examples:
            # Sanity-check the first pair's structure for this label.
            self._checkRecordPairType(examples[0])
        frozen = core.freezeData(examples)
        training_pairs[label] = frozen
        self.training_pairs[label].extend(frozen)

    self._addTrainingData(training_pairs)
    self._trainClassifier()
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from the first
    and second datasets, and initializes active learning with this
    sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same form
              as data_1

    sample_size -- Size of the sample to draw
    '''
    # Ensure data_1 refers to the smaller dataset.
    if len(data_1) > len(data_2):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    # Offset keeps the second dataset's record ids disjoint from
    # the first's.
    offset = len(data_1)
    data_2 = core.index(data_2, offset)

    blocked_sample_size = int(blocked_proportion * sample_size)

    # Only simple predicates are usable for sampling.
    predicates = [pred
                  for pred in predicateGenerator(self.data_model)
                  if pred.type == 'SimplePredicate']

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                     predicates,
                                                     data_1,
                                                     data_2)

    # Pad out the sample with uniformly random cross-dataset pairs.
    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = dedupe.core.randomPairsMatch(len(data_1),
                                                      len(data_2),
                                                      random_sample_size)
    random_sample_keys = {(a, b + offset)
                          for a, b in random_sample_keys}

    data_1 = dict(data_1)
    data_2 = dict(data_2)

    data_sample = ((data_1[k1], data_2[k2])
                   for k1, k2
                   in blocked_sample_keys | random_sample_keys)

    self._loadSample(core.freezeData(data_sample))
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from the first
    and second datasets, and initializes active learning with this
    sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same form
              as data_1

    sample_size -- Size of the sample to draw
    '''
    # The blocked sampler expects the smaller dataset first.
    if len(data_1) > len(data_2):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    offset = len(data_1)
    # Re-key the second dataset past the first so ids don't overlap.
    data_2 = core.index(data_2, offset)

    n_blocked = int(blocked_proportion * sample_size)

    simple_predicates = []
    for pred in predicateGenerator(self.data_model):
        # Sampling can only use simple (non-index) predicates.
        if pred.type == 'SimplePredicate':
            simple_predicates.append(pred)

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              data_1,
                                              data_2)

    # Whatever blocking couldn't supply comes from random pairing.
    n_random = sample_size - len(blocked_keys)
    random_keys = dedupe.core.randomPairsMatch(len(data_1),
                                               len(data_2),
                                               n_random)
    random_keys = set((a, b + offset) for a, b in random_keys)

    data_1 = dict(data_1)
    data_2 = dict(data_2)

    pair_records = ((data_1[k1], data_2[k2])
                    for k1, k2 in blocked_keys | random_keys)
    frozen_sample = core.freezeData(pair_records)

    self._loadSample(frozen_sample)
def _importTraining(self, training_file):
    '''
    Load training pairs from an open training-data file object,
    merge them into the current training set, and retrain the
    classifier.
    '''
    training_pairs = json.load(training_file,
                               cls=serializer.dedupe_decoder)

    for label, examples in training_pairs.items():
        if examples:
            # Verify the structure of the first pair for this label.
            self._checkRecordPairType(examples[0])
        frozen = core.freezeData(examples)
        training_pairs[label] = frozen
        self.training_pairs[label].extend(frozen)

    self._addTrainingData(training_pairs)
    self._trainClassifier()
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset (a mix of
    random pairs & pairs of similar records) and initialize active
    learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    data = core.index(data)

    n_blocked = int(blocked_proportion * sample_size)

    predicates = list(predicateGenerator(self.data_model,
                                         index_predicates=False,
                                         canopies=self.canopies))

    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                predicates,
                                                data)

    # The remainder of the sample is drawn uniformly at random.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(dedupe.core.randomPairs(len(data), n_random))

    data = dict(data)
    frozen_sample = core.freezeData([(data[k1], data[k2])
                                     for k1, k2
                                     in blocked_keys | random_keys])

    # data can be a very large object, so we'll free it up as soon
    # as possible
    del data

    self._loadSample(frozen_sample)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''Draw a sample of record pairs from the dataset (a mix of
    random pairs & pairs of similar records) and initialize active
    learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be blocked
    '''
    indexed = core.index(data)
    self.sampled_records = Sample(indexed, 900)

    # Portion of the sample produced by predicate blocking.
    n_blocked = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    record_deque = sampling.randomDeque(indexed)
    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                predicates,
                                                record_deque)

    # The rest of the sample comes from uniformly random pairs.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(record_deque), n_random))

    records = dict(record_deque)
    pairs = [(records[i], records[j])
             for i, j in blocked_keys | random_keys]

    self._loadSample(core.freezeData(pairs))