def activeLearning(data_d, data_model, labelPairFunction, num_questions):
    """Run a fixed number of label-and-retrain rounds.

    Every round asks ``findUncertainPairs`` for an ordering of the remaining
    candidate distances, takes the first pair in that ordering, passes it to
    ``labelPairFunction`` and retrains ``data_model`` on everything labelled
    so far.

    NOTE(review): this file contains another ``activeLearning`` definition
    with a different signature; if both live in the same module the later
    one replaces this one at import time -- confirm which is intended.

    Args:
        data_d: Record collection; forwarded unchanged to
            ``blocking.allCandidates``, ``core.recordDistances`` and
            ``labelPairFunction``.
        data_model: Model handed to the distance, uncertainty and training
            helpers; rebound to the result of ``core.trainModel`` after
            every round.
        labelPairFunction: Callable invoked as
            ``labelPairFunction(pairs, data_d, data_model)``. Its result is
            indexed with ``0`` for non-duplicates and ``1`` for duplicates.
        num_questions: Number of rounds to run.

    Returns:
        A tuple ``(training_data, training_pairs, data_model)`` where
        ``training_pairs`` is ``{0: nonduplicates, 1: duplicates}``.
    """
    training_iterations = 100
    training_data = []
    # Key 0 collects non-duplicates, key 1 collects duplicates.
    training_pairs = {0: [], 1: []}

    candidate_pairs = blocking.allCandidates(data_d)
    record_distances = core.recordDistances(candidate_pairs, data_d, data_model)

    for _ in range(num_questions):
        # Single-argument print(...) emits the same text whether ``print``
        # is a statement or a function.
        print("finding the next uncertain pair ...")

        ordering = findUncertainPairs(record_distances, data_model)

        # Reorder by the returned indices, take the leading pair and drop it
        # from the pool (presumably most-uncertain first -- depends on
        # findUncertainPairs, which is not visible here).
        record_distances = record_distances[:, ordering]
        next_pair = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]

        answers = labelPairFunction(next_pair, data_d, data_model)
        training_pairs[0].extend(answers[0])
        training_pairs[1].extend(answers[1])

        training_data = addTrainingData(answers, training_data, data_model)
        data_model = core.trainModel(training_data,
                                     training_iterations,
                                     data_model)

    return (training_data, training_pairs, data_model)
def activeLearning(data_d, data_model, labelPairFunction, training_data,
                   training_pairs=None, key_groups=None):
    """Label pairs and retrain until the labelling callback says to stop.

    Each round reorders the remaining candidate distances with
    ``findUncertainPairs``, pops the leading pair, looks up the two records
    in ``data_d``, passes them to ``labelPairFunction`` and retrains
    ``data_model`` on the accumulated training data.

    Args:
        data_d: Mapping/sequence of records, indexed by the ids found in the
            ``'pairs'`` field of the distance array; also forwarded to
            ``blocking.allCandidates``.
        data_model: Model handed to the distance, uncertainty and training
            helpers; rebound to the result of ``core.trainModel`` after
            every round.
        labelPairFunction: Callable invoked as
            ``labelPairFunction(record_pairs, data_model)``; must return
            ``(labeled_pairs, finished)``. ``labeled_pairs[0]`` holds
            non-duplicates and ``labeled_pairs[1]`` holds duplicates.
        training_data: Existing training data, extended via
            ``addTrainingData`` each round.
        training_pairs: Optional previously labelled pairs in the same
            ``{0: nonduplicates, 1: duplicates}`` layout; copied into the
            returned lists.
        key_groups: Optional value forwarded to ``blocking.allCandidates``.
            Defaults to a fresh empty list on every call (previously a
            single list object shared between calls).

    Returns:
        A tuple ``(training_data, training_pairs, data_model)`` where
        ``training_pairs`` is ``{0: nonduplicates, 1: duplicates}``.

    Raises:
        ValueError: If the training data is still empty after a labelling
            round.
    """
    # A ``[]`` default would be one list shared by every call; build a new
    # one per call instead.
    if key_groups is None:
        key_groups = []

    duplicates = []
    nonduplicates = []
    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    candidates = blocking.allCandidates(data_d, key_groups)

    import time
    started = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    # Joined with single spaces to reproduce what the comma-separated print
    # statement emitted, while staying valid if ``print`` is a function.
    print(' '.join(['calculated recordDistances in ',
                    str(time.time() - started),
                    'seconds']))

    finished = False
    # ``== False`` is kept on purpose: any value other than False/0
    # (including None) returned by the callback ends the session, exactly
    # as before. ``not finished`` would change that.
    while finished == False:  # noqa: E712
        print('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances, data_model)

        # pop the next most uncertain pair off of record distances
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pair_ids = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]

        # Translate record ids into tuples of the actual records.
        uncertain_pairs = [
            tuple(data_d[instance] for instance in pair)
            for pair in uncertain_pair_ids
        ]

        labeled_pairs, finished = labelPairFunction(uncertain_pairs,
                                                    data_model)
        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model,
                                        training_data)
        # len() rather than truthiness: training_data may be an array type
        # whose truth value is ambiguous (assumption -- its type is not
        # visible here).
        if len(training_data) == 0:
            raise ValueError("No training pairs given")

        # NOTE(review): the meaning of the trailing ``1`` is defined by
        # core.trainModel, which is not visible here.
        data_model = core.trainModel(training_data, data_model, 1)

    training_pairs = {0: nonduplicates, 1: duplicates}
    return (training_data, training_pairs, data_model)