def activeLearning(candidates, data_model, labelPairFunction, training_data, training_pairs=None): """ Ask the user to label the record pair we are most uncertain of. Train the data model, and update our uncertainty. Repeat until user tells us she is finished. """ fields = [field for field in data_model['fields'] if data_model['fields'][field]['type'] != 'Missing Data'] duplicates = [] nonduplicates = [] if training_pairs: nonduplicates.extend(training_pairs[0]) duplicates.extend(training_pairs[1]) finished = False import time t_train = time.time() field_distances = core.fieldDistances(candidates, data_model) logging.info('calculated fieldDistances in %s seconds', str(time.time() - t_train)) seen_indices = set() while finished == False: logging.info('finding the next uncertain pair ...') uncertain_indices = findUncertainPairs(field_distances, data_model) for uncertain_index in uncertain_indices: if uncertain_index not in seen_indices: seen_indices.add(uncertain_index) break uncertain_pairs = [(candidates[uncertain_index][0][1], candidates[uncertain_index][1][1])] (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields) nonduplicates.extend(labeled_pairs[0]) duplicates.extend(labeled_pairs[1]) training_data = addTrainingData(labeled_pairs, data_model, training_data) if len(training_data) > 0: data_model = core.trainModel(training_data, data_model, 1) else: raise ValueError('No training pairs given') training_pairs = {0: nonduplicates, 1: duplicates} return (training_data, training_pairs, data_model)
def distinctPairs() : data_slice = data_sample[0:sample_size] pair_distance = core.fieldDistances(data_slice, data_model) scores = core.scorePairs(pair_distance, data_model) sample_n = 0 for score, pair in zip(scores, data_sample) : if score < confidence : yield pair sample_n += 1 if sample_n < sample_size and len(data_sample) > sample_size : for pair in data_sample[sample_size:] : pair_distance = core.fieldDistances([pair], data_model) score = core.scorePairs(pair_distance, data_model) if score < confidence : yield (pair)
def distinctPairs(): data_slice = data_sample[0:sample_size] pair_distance = core.fieldDistances(data_slice, data_model) scores = core.scorePairs(pair_distance, data_model) sample_n = 0 for score, pair in zip(scores, data_sample): if score < confidence: yield pair sample_n += 1 if sample_n < sample_size and len(data_sample) > sample_size: for pair in data_sample[sample_size:]: pair_distance = core.fieldDistances([pair], data_model) score = core.scorePairs(pair_distance, data_model) if score < confidence: yield (pair)
def addTrainingData(labeled_pairs, data_model, training_data=[]): """ Appends training data to the training data collection. """ fields = data_model['fields'] examples = [record_pair for example in labeled_pairs.values() for record_pair in example] new_training_data = numpy.empty(len(examples), dtype=training_data.dtype) new_training_data['label'] = [0] * len(labeled_pairs[0]) + [1] * len(labeled_pairs[1]) new_training_data['distances'] = core.fieldDistances(examples, data_model) training_data = numpy.append(training_data, new_training_data) return training_data
def semiSupervisedNonDuplicates(data_sample, data_model, nonduplicate_confidence_threshold=.7, sample_size=2000): if len(data_sample) <= sample_size: return data_sample confident_distinct_pairs = [] n_distinct_pairs = 0 for pair in data_sample: pair_distance = core.fieldDistances([pair], data_model) score = core.scorePairs(pair_distance, data_model) if score < 1 - nonduplicate_confidence_threshold: (key_pair, value_pair) = zip(*pair) confident_distinct_pairs.append(value_pair) n_distinct_pairs += 1 if n_distinct_pairs == sample_size: return confident_distinct_pairs
def addTrainingData(labeled_pairs, data_model, training_data=[]): """ Appends training data to the training data collection. """ fields = data_model['fields'] examples = [ record_pair for example in labeled_pairs.values() for record_pair in example ] new_training_data = numpy.empty(len(examples), dtype=training_data.dtype) new_training_data['label'] = [0] * len(labeled_pairs[0]) + [1] * len( labeled_pairs[1]) new_training_data['distances'] = core.fieldDistances(examples, data_model) training_data = numpy.append(training_data, new_training_data) return training_data
def __init__(self, candidates, data_model) : self.candidates = candidates self.field_distances = core.fieldDistances(self.candidates, data_model) self.seen_indices = set()
def __init__(self, candidates, data_model): self.candidates = candidates self.field_distances = core.fieldDistances(candidates, data_model) self.seen_indices = set()
def activeLearning(candidates, data_model, labelPairFunction, training_data, training_pairs=None): """ Ask the user to label the record pair we are most uncertain of. Train the data model, and update our uncertainty. Repeat until user tells us she is finished. """ fields = [ field for field in data_model['fields'] if data_model['fields'][field]['type'] not in ('Missing Data', 'Interaction', 'Higher Categories') ] duplicates = [] nonduplicates = [] if training_pairs: nonduplicates.extend(training_pairs[0]) duplicates.extend(training_pairs[1]) if training_data.shape[0] == 0: rand_int = random.randint(0, len(candidates)) exact_match = candidates[rand_int] training_data = addTrainingData({ 1: [exact_match] * 2, 0: [] }, data_model, training_data) data_model = core.trainModel(training_data, data_model, .1) finished = False import time t_train = time.time() field_distances = core.fieldDistances(candidates, data_model) logging.info('calculated fieldDistances in %s seconds', str(time.time() - t_train)) seen_indices = set() while finished == False: logging.info('finding the next uncertain pair ...') uncertain_indices = findUncertainPairs(field_distances, data_model, (len(duplicates) / (len(nonduplicates) + 1.0))) for uncertain_index in uncertain_indices: if uncertain_index not in seen_indices: seen_indices.add(uncertain_index) break uncertain_pairs = [candidates[uncertain_index]] (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields) nonduplicates.extend(labeled_pairs[0]) duplicates.extend(labeled_pairs[1]) training_data = addTrainingData(labeled_pairs, data_model, training_data) if len(training_data) > 0: data_model = core.trainModel(training_data, data_model, .1) else: raise ValueError('No training pairs given') training_pairs = {0: nonduplicates, 1: duplicates} return (training_data, training_pairs, data_model)
def activeLearning(candidates, data_model, labelPairFunction, training_data, training_pairs=None): """ Ask the user to label the record pair we are most uncertain of. Train the data model, and update our uncertainty. Repeat until user tells us she is finished. """ fields = [field for field in data_model['fields'] if data_model['fields'][field]['type'] not in ('Missing Data', 'Interaction', 'Higher Categories')] duplicates = [] nonduplicates = [] if training_pairs: nonduplicates.extend(training_pairs[0]) duplicates.extend(training_pairs[1]) if training_data.shape[0] == 0 : rand_int = random.randint(0, len(candidates)) exact_match = candidates[rand_int] training_data = addTrainingData({1:[exact_match]*2, 0:[]}, data_model, training_data) data_model = core.trainModel(training_data, data_model, .1) finished = False import time t_train = time.time() field_distances = core.fieldDistances(candidates, data_model) logging.info('calculated fieldDistances in %s seconds', str(time.time() - t_train)) seen_indices = set() while finished == False: logging.info('finding the next uncertain pair ...') uncertain_indices = findUncertainPairs(field_distances, data_model, (len(duplicates)/ (len(nonduplicates)+1.0))) for uncertain_index in uncertain_indices: if uncertain_index not in seen_indices: seen_indices.add(uncertain_index) break uncertain_pairs = [candidates[uncertain_index]] (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields) nonduplicates.extend(labeled_pairs[0]) duplicates.extend(labeled_pairs[1]) training_data = addTrainingData(labeled_pairs, data_model, training_data) if len(training_data) > 0: data_model = core.trainModel(training_data, data_model, .1) else: raise ValueError('No training pairs given') training_pairs = {0: nonduplicates, 1: duplicates} return (training_data, training_pairs, data_model)