Example #1
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until user tells us she is
    finished.
    """

    # Fields to show the user when labeling a pair; skip 'Missing Data' fields.
    fields = [field for field in data_model['fields']
              if data_model['fields'][field]['type'] != 'Missing Data']

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    finished = False

    # Compute field distances for every candidate pair once, up front.
    import time
    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()

    while not finished:
        logging.info('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(field_distances, data_model)

        # Take the most uncertain pair the user has not been shown yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break

        uncertain_pairs = [(candidates[uncertain_index][0][1],
                            candidates[uncertain_index][1][1])]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model, training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
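
The snippet above only constrains labelPairFunction through its call site: it receives a list of record pairs plus the field names, and returns ({0: distinct pairs, 1: duplicate pairs}, finished). A minimal console-based sketch of such a callback follows; the function name, the prompts, and the assumption that each record is a dict keyed by field name are hypothetical, not part of the library.

def consoleLabelPairs(record_pairs, fields):
    # Hypothetical labeller: show each pair and ask the user to classify it.
    duplicates = []
    nonduplicates = []
    finished = False

    for record_1, record_2 in record_pairs:
        for field in fields:
            # Assumes records are dicts keyed by field name.
            print('%s: %s | %s' % (field, record_1[field], record_2[field]))

        response = input('Same record? (y)es / (n)o / (f)inished: ').strip().lower()
        if response == 'y':
            duplicates.append((record_1, record_2))
        elif response == 'f':
            finished = True
        else:
            nonduplicates.append((record_1, record_2))

    # Same shape as training_pairs elsewhere: 0 -> distinct, 1 -> duplicate.
    return {0: nonduplicates, 1: duplicates}, finished
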
Example #2
    def distinctPairs():
        # Score an initial slice of the sample in one vectorized call;
        # data_sample, sample_size, data_model and confidence come from the
        # enclosing scope.
        data_slice = data_sample[0:sample_size]
        pair_distance = core.fieldDistances(data_slice, data_model)
        scores = core.scorePairs(pair_distance, data_model)

        sample_n = 0
        # zip() stops at the shorter sequence, so only the scored slice is walked.
        for score, pair in zip(scores, data_sample):
            if score < confidence:
                yield pair
                sample_n += 1

        # If the first slice did not produce enough pairs, score the rest of
        # the sample one pair at a time.
        if sample_n < sample_size and len(data_sample) > sample_size:
            for pair in data_sample[sample_size:]:
                pair_distance = core.fieldDistances([pair], data_model)
                score = core.scorePairs(pair_distance, data_model)

                if score < confidence:
                    yield pair
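
distinctPairs is a nested generator: data_sample, sample_size, data_model and confidence are not arguments but names closed over from an enclosing function. A hypothetical wrapper showing that context (the wrapper name is an assumption, and the body is shortened to the first-slice pass) might look like:

def confidentDistinctPairs(data_sample, data_model, confidence, sample_size=2000):

    def distinctPairs():
        # Shortened body: score the first slice in one call and yield the
        # pairs whose duplicate score falls below `confidence`.
        data_slice = data_sample[0:sample_size]
        pair_distance = core.fieldDistances(data_slice, data_model)
        scores = core.scorePairs(pair_distance, data_model)

        for score, pair in zip(scores, data_slice):
            if score < confidence:
                yield pair

    return distinctPairs()
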
Example #3
def semiSupervisedNonDuplicates(data_sample,
                                data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    if len(data_sample) <= sample_size:
        return data_sample

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in data_sample:
        pair_distance = core.fieldDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        # A low duplicate score means the pair is confidently distinct.
        if score < 1 - nonduplicate_confidence_threshold:
            (key_pair, value_pair) = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                return confident_distinct_pairs

    # Fewer than sample_size confident pairs were found; return what we have.
    return confident_distinct_pairs
Example #4
def addTrainingData(labeled_pairs, data_model, training_data=[]):
    """
    Appends training data to the training data collection.

    training_data is expected to be a numpy structured array with 'label' and
    'distances' fields; its dtype is reused for the new rows, so the
    empty-list default cannot actually be used.
    """

    fields = data_model['fields']

    # Flatten the labeled pairs ({0: distinct, 1: duplicate}) into one list.
    examples = [
        record_pair for example in labeled_pairs.values()
        for record_pair in example
    ]

    new_training_data = numpy.empty(len(examples), dtype=training_data.dtype)

    # Distinct pairs get label 0, duplicates label 1, matching the dict keys.
    new_training_data['label'] = [0] * len(labeled_pairs[0]) + [1] * len(
        labeled_pairs[1])
    new_training_data['distances'] = core.fieldDistances(examples, data_model)

    training_data = numpy.append(training_data, new_training_data)

    return training_data
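
addTrainingData reads training_data.dtype and fills the 'label' and 'distances' columns, so the caller has to pass a numpy structured array with those two fields. A sketch of building an empty one follows; the concrete dtypes and the number of comparison fields are assumptions, since the snippet only fixes the field names.

import numpy

n_comparison_fields = 4  # hypothetical: one distance per comparison field
training_dtype = numpy.dtype([('label', 'i4'),
                              ('distances', 'f4', (n_comparison_fields,))])

# Empty array to pass as the initial training_data argument.
training_data = numpy.empty(0, dtype=training_dtype)
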
Example #5
    def __init__(self, candidates, data_model):

        self.candidates = candidates
        self.field_distances = core.fieldDistances(candidates, data_model)
        self.seen_indices = set()
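
This __init__ caches the field distances of every candidate pair and tracks which pairs have already been shown. A hypothetical class built around it (the class and method names are assumptions) would mirror the selection loop from the activeLearning examples:

class ActiveLearner(object):

    def __init__(self, candidates, data_model):
        self.candidates = candidates
        self.field_distances = core.fieldDistances(candidates, data_model)
        self.seen_indices = set()

    def uncertainPairs(self, data_model):
        # Rank pairs by uncertainty under the current model and return the
        # most uncertain one the user has not seen yet.
        uncertain_indices = findUncertainPairs(self.field_distances, data_model)
        for index in uncertain_indices:
            if index not in self.seen_indices:
                self.seen_indices.add(index)
                return [self.candidates[index]]
        return []  # every candidate pair has already been shown
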
Example #6
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until user tells us she is
    finished.
    """

    fields = [
        field for field in data_model['fields']
        if data_model['fields'][field]['type'] not in ('Missing Data',
                                                       'Interaction',
                                                       'Higher Categories')
    ]

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    # With no training data yet, seed it with one randomly chosen candidate
    # pair labeled as a duplicate, so the model can be trained at least once.
    if training_data.shape[0] == 0:
        rand_int = random.randint(0, len(candidates) - 1)  # randint is inclusive at both ends
        exact_match = candidates[rand_int]
        training_data = addTrainingData({
            1: [exact_match] * 2,
            0: []
        }, data_model, training_data)

    data_model = core.trainModel(training_data, data_model, .1)

    finished = False

    import time
    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()

    while not finished:
        logging.info('finding the next uncertain pair ...')
        # The third argument is the current ratio of duplicate to distinct labels.
        uncertain_indices = findUncertainPairs(field_distances, data_model,
                                               (len(duplicates) /
                                                (len(nonduplicates) + 1.0)))

        # Take the most uncertain pair the user has not been shown yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break

        uncertain_pairs = [candidates[uncertain_index]]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model,
                                        training_data)

        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, .1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
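
Putting the pieces together, a hypothetical end-to-end call, reusing the consoleLabelPairs and training_dtype sketches above (neither of which is part of the library), would look like:

training_data = numpy.empty(0, dtype=training_dtype)

training_data, training_pairs, data_model = activeLearning(candidates,
                                                           data_model,
                                                           consoleLabelPairs,
                                                           training_data)
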