Ejemplo n.º 1
0
def activeLearning(data_d, data_model, labelPairFunction, num_questions,
                   num_iterations=100):
    """Ask for labels on uncertain record pairs and retrain after each answer.

    Candidate pairs come from ``blocking.allCandidates(data_d)`` and their
    distances from ``core.recordDistances``. On each of ``num_questions``
    rounds one pair is taken from the distances, handed to
    ``labelPairFunction``, and the model is retrained with the accumulated
    training data.

    Args:
        data_d: Record collection, forwarded unchanged to the blocking,
            distance and labelling helpers.
        data_model: Model passed to every helper and replaced by the result
            of ``core.trainModel`` after each round.
        labelPairFunction: Callable invoked as
            ``labelPairFunction(uncertain_pairs, data_d, data_model)``. Its
            result is indexed with ``0`` (collected as non-duplicates) and
            ``1`` (collected as duplicates).
        num_questions: Number of labelling rounds to run.
        num_iterations: Value forwarded to ``core.trainModel`` on every
            round. Defaults to 100, the value previously hard-coded here.

    Returns:
        A tuple ``(training_data, training_pairs, data_model)`` where
        ``training_pairs`` is ``{0: nonduplicates, 1: duplicates}``.
    """
    training_data = []
    duplicates = []
    nonduplicates = []

    pairs = blocking.allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)

    for _ in range(num_questions):
        # Single-argument call form: prints the same text on Python 2 and 3.
        print("finding the next uncertain pair ...")
        uncertain_indices = findUncertainPairs(record_distances, data_model)

        # Reorder by the returned indices, then take the first entry as the
        # pair to label and drop it from the remaining candidates.
        # NOTE(review): presumably the indices are ordered most-uncertain
        # first, and record_distances supports this 2-axis indexing -- confirm
        # against findUncertainPairs / core.recordDistances.
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pairs = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]

        labeled_pairs = labelPairFunction(uncertain_pairs, data_d, data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs,
                                        training_data,
                                        data_model)
        data_model = core.trainModel(training_data,
                                     num_iterations,
                                     data_model)

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
Ejemplo n.º 2
0
def activeLearning(data_d,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None,
                   key_groups=None
                   ):
    """Label uncertain record pairs until the labeller says it is finished.

    Candidate pairs come from ``blocking.allCandidates`` and their distances
    from ``core.recordDistances``. Each round pops one pair off the
    distances, looks up its records in ``data_d``, asks ``labelPairFunction``
    to label it, and retrains ``data_model`` on the accumulated training
    data.

    Args:
        data_d: Record collection; indexed with each id found in a candidate
            pair to obtain the record itself.
        data_model: Model passed to the helpers and replaced by the result of
            ``core.trainModel`` after each round.
        labelPairFunction: Callable invoked as
            ``labelPairFunction(uncertain_pairs, data_model)`` that returns
            ``(labeled_pairs, finished)``. ``labeled_pairs[0]`` is collected
            as non-duplicates and ``labeled_pairs[1]`` as duplicates; the
            loop stops once ``finished`` no longer equals ``False``.
        training_data: Existing training data, extended through
            ``addTrainingData`` on every round.
        training_pairs: Optional previously labelled pairs, indexed with
            ``0`` (non-duplicates) and ``1`` (duplicates), used to seed the
            returned pair lists.
        key_groups: Forwarded to ``blocking.allCandidates``. ``None`` (the
            default) is treated as an empty list.

    Returns:
        A tuple ``(training_data, training_pairs, data_model)`` where
        ``training_pairs`` is ``{0: nonduplicates, 1: duplicates}``.

    Raises:
        ValueError: If the training data is still empty after a labelling
            round.
    """
    import time

    # A fresh list per call instead of a shared mutable default argument.
    if key_groups is None:
        key_groups = []

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    candidates = blocking.allCandidates(data_d, key_groups)

    t_train = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    # Joined into one string so the output is the same on Python 2 and 3
    # (the Python 2 print statement also separated its items with a space).
    print(' '.join(('calculated recordDistances in ',
                    str(time.time() - t_train),
                    'seconds')))

    finished = False
    # Kept as an equality test on purpose: a labeller returning a non-bool
    # falsy value such as None must still end the loop, as it always has.
    while finished == False:
        print('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances,
                                               data_model)

        # pop the next most uncertain pair off of record distances
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pair_ids = (record_distances['pairs'])[0:1]
        record_distances = record_distances[1:]

        # Resolve each id in a pair to its record in data_d.
        uncertain_pairs = [
            tuple(data_d[instance] for instance in pair)
            for pair in uncertain_pair_ids
        ]

        labeled_pairs, finished = labelPairFunction(uncertain_pairs,
                                                    data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs,
                                        data_model,
                                        training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError("No training pairs given")

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)