def activeLearning(data_d, data_model, labelPairFunction, num_questions):
    training_data = []
    duplicates = []
    nonduplicates = []
    num_iterations = 100

    # score every candidate pair produced by blocking
    pairs = blocking.allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)

    for _ in range(num_questions):
        print("finding the next uncertain pair ...")
        uncertain_indices = findUncertainPairs(record_distances, data_model)

        # reorder the candidates from most to least uncertain, then
        # pop the most uncertain pair off the front
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pairs = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]

        labeled_pairs = labelPairFunction(uncertain_pairs, data_d, data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, training_data, data_model)

        # retrain the model on everything labeled so far
        data_model = core.trainModel(training_data, num_iterations, data_model)

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
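
labelPairFunction is supplied by the caller and never defined in these examples. As a minimal sketch of the contract implied by the call site above (it receives the uncertain pairs along with data_d and the data model, and returns a mapping from 0 to non-duplicate pairs and 1 to duplicate pairs), a console labeler might look like the following; the name consoleLabel and the prompt are assumptions, not the library's actual labeler.

def consoleLabel(uncertain_pairs, data_d, data_model):
    # Hypothetical labeler matching the call above: returns a dict
    # mapping 0 -> non-duplicate record pairs, 1 -> duplicate pairs.
    labeled = {0: [], 1: []}
    for pair_ids in uncertain_pairs:
        record_pair = tuple(data_d[record_id] for record_id in pair_ids)
        print(record_pair)
        answer = input("Do these records match? (y/n) ").strip().lower()
        labeled[1 if answer == 'y' else 0].append(record_pair)
    return labeled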
Example 2
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    pair_combinations = list(combinations(data_d.items(), 2))

    if len(pair_combinations) <= sample_size:
        return pair_combinations

    shuffle(pair_combinations)

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in pair_combinations:

        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        # a low duplicate score means we are confident the pair is distinct
        if score < (1 - nonduplicate_confidence_threshold):
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                return confident_distinct_pairs

    # fewer confident pairs than sample_size: return what we found
    return confident_distinct_pairs
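
The zip(*pair) idiom above splits a pair of (key, record) items into a tuple of keys and a tuple of records, and only the records are kept. A self-contained illustration:

pair = (('id_1', {'name': 'Alice'}), ('id_2', {'name': 'Alicia'}))
key_pair, value_pair = zip(*pair)
print(key_pair)    # ('id_1', 'id_2')
print(value_pair)  # ({'name': 'Alice'}, {'name': 'Alicia'})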
Example 3
def activeLearning(data_d,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None,
                   key_groups=[]):

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    finished = False
    candidates = blocking.allCandidates(data_d, key_groups)

    import time
    t_train = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    print('calculated recordDistances in', time.time() - t_train, 'seconds')

    while not finished:
        print('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances,
                                               data_model)

        # pop the next most uncertain pair off of record_distances
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pair_ids = (record_distances['pairs'])[0:1]
        record_distances = record_distances[1:]

        # look up the full records for the ids we popped
        uncertain_pairs = []
        for pair in uncertain_pair_ids:
            record_pair = [data_d[instance] for instance in pair]
            record_pair = tuple(record_pair)
            uncertain_pairs.append(record_pair)

        labeled_pairs, finished = labelPairFunction(uncertain_pairs,
                                                    data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs,
                                        data_model,
                                        training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError("No training pairs given")

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
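
findUncertainPairs is called by both active-learning examples but is not shown. A minimal sketch of the usual uncertainty-sampling idea, assuming core.scorePairs returns a duplicate probability for each candidate pair: the model is least sure about pairs scored near 0.5, so indices are ordered by distance from 0.5. This is an assumption about the implementation, not the library's actual code.

import numpy

def findUncertainPairs(record_distances, data_model):
    # Sketch only: assumes core.scorePairs returns a duplicate
    # probability for each candidate pair. Pairs scored closest to
    # 0.5 are the ones the model is least sure about.
    probabilities = core.scorePairs(record_distances, data_model)
    uncertainty = numpy.abs(probabilities - 0.5)
    return numpy.argsort(uncertainty)  # most uncertain index first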
Example 4
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7):

    # this is an expensive call and we're making it multiple times
    pairs = allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)

    confident_nondupes_ids = []
    scored_pairs = core.scorePairs(record_distances, data_model)

    for (i, score) in enumerate(scored_pairs):
        if score < 1 - nonduplicate_confidence_threshold:
            confident_nondupes_ids.append(record_distances['pairs'][i])

    confident_nondupes_pairs = [(data_d[pair[0]], data_d[pair[1]])
                                for pair in confident_nondupes_ids]

    return confident_nondupes_pairs
Example 5
    def goodThreshold(self, blocks, recall_weight=1.5):
        """
        Returns the threshold that maximizes the expected F score, a
        weighted average of precision and recall for a sample of
        blocked data.

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a blocking
                         predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        candidates = (pair
                      for block in blocks
                      for pair in itertools.combinations(block, 2))

        record_distances = core.recordDistances(candidates, self.data_model)
        probability = core.scorePairs(record_distances, self.data_model)

        # sort the duplicate probabilities from highest to lowest
        probability.sort()
        probability = probability[::-1]

        # at each candidate threshold, the expected number of true
        # duplicates is the cumulative sum of the probabilities
        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        # the F score with beta = recall_weight, up to a constant
        # factor that does not change the argmax
        score = recall * precision / (recall + recall_weight ** 2 * precision)

        i = numpy.argmax(score)

        logging.info("Maximum expected recall and precision")
        logging.info("recall: %2.3f" % recall[i])
        logging.info("precision: %2.3f" % precision[i])
        logging.info("With threshold: %2.3f" % probability[i])

        return probability[i]
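
The selection can be demonstrated end to end on fabricated scores (the probabilities below are synthetic, purely to exercise the arithmetic). A larger recall_weight favors recall, so it tends to pick a lower threshold:

import numpy

probability = numpy.sort(numpy.random.uniform(size=1000))[::-1]  # fabricated scores

for recall_weight in (0.5, 1.0, 2.0):
    expected_dupes = numpy.cumsum(probability)
    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)
    score = recall * precision / (recall + recall_weight ** 2 * precision)
    i = numpy.argmax(score)
    print(recall_weight, probability[i])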
Example 6
def semiSupervisedNonDuplicates(data_sample, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    if len(data_sample) <= sample_size:
        return data_sample

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in data_sample:

        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        # a low duplicate score means we are confident the pair is distinct
        if score < (1 - nonduplicate_confidence_threshold):
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                return confident_distinct_pairs

    # fewer confident pairs than sample_size: return what we found
    return confident_distinct_pairs
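
Unlike Example 2, this version expects data_sample to already be a sequence of ((key, record), (key, record)) tuples, as the zip(*pair) call implies. A minimal sketch of how such a sample might be drawn; since the function filters down to confident non-duplicates, you would typically draw many more pairs than you expect to keep. The helper name randomPairSample is hypothetical:

import random
from itertools import combinations

def randomPairSample(data_d, n_pairs=20000):
    # Hypothetical helper: draw random ((key, record), (key, record))
    # tuples in the shape semiSupervisedNonDuplicates expects.
    all_pairs = list(combinations(data_d.items(), 2))
    if len(all_pairs) <= n_pairs:
        return all_pairs
    return random.sample(all_pairs, n_pairs)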
Example 7
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None,
                   key_groups=[]):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until user tells us she is
    finished.
    """

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    finished = False

    import time
    t_train = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    logging.info('calculated recordDistances in %s seconds'
                 % str(time.time() - t_train))

    seen_indices = set()

    while not finished:
        logging.info('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances,
                                               data_model)

        # take the most uncertain pair we have not already asked about
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break

        uncertain_pairs = [(candidates[uncertain_index][0][1],
                            candidates[uncertain_index][1][1])]

        labeled_pairs, finished = labelPairFunction(uncertain_pairs,
                                                    data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs,
                                        data_model,
                                        training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError("No training pairs given")

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
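
Here labelPairFunction takes only the record pairs and the data model, and, unlike in Example 1, also reports when the user is done. A minimal console sketch of that contract; the name consoleLabelPairs and the prompts are assumptions:

def consoleLabelPairs(uncertain_pairs, data_model):
    # Hypothetical labeler: returns ({0: nondupes, 1: dupes}, finished)
    labeled = {0: [], 1: []}
    finished = False
    for record_pair in uncertain_pairs:
        print(record_pair)
        answer = input("Same record? (y)es / (n)o / (f)inished ").strip().lower()
        if answer == 'f':
            finished = True
            break
        labeled[1 if answer == 'y' else 0].append(record_pair)
    return labeled, finished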