Example #1
from itertools import combinations
from random import shuffle

# `core` (the library's internal scoring module) is assumed to be in scope.

def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    pair_combinations = list(combinations(data_d.items(), 2))

    # With few enough candidate pairs, just use them all.
    if len(pair_combinations) <= sample_size:
        return pair_combinations

    shuffle(pair_combinations)

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in pair_combinations:

        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        # Keep only pairs scored as confidently non-duplicate.
        if score < (1 - nonduplicate_confidence_threshold):
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                return confident_distinct_pairs

    # Fall through: return whatever was found, rather than None.
    return confident_distinct_pairs
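A note on the cutoff: with nonduplicate_confidence_threshold=.7, only pairs scored below 0.3 are treated as confident non-duplicates. A minimal, self-contained sketch of that rule, where the hypothetical fake_score stands in for core.scorePairs:

import random

def fake_score(pair):
    # Hypothetical stand-in: pretend the model returned a duplicate
    # probability in [0, 1].
    return random.random()

threshold = 0.7
pairs = [(('id%d' % i, {}), ('id%d' % (i + 1), {})) for i in range(10)]

# Only pairs scored below 1 - threshold = 0.3 survive.
distinct = [p for p in pairs if fake_score(p) < 1 - threshold]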
Example #2
    def distinctPairs():
        # Score an initial slice of the sample in a single batch.
        data_slice = data_sample[0:sample_size]
        pair_distance = core.fieldDistances(data_slice, data_model)
        scores = core.scorePairs(pair_distance, data_model)

        sample_n = 0
        for score, pair in zip(scores, data_sample):
            if score < confidence:
                yield pair
                sample_n += 1

        # If the slice did not yield enough pairs, score the rest lazily.
        if sample_n < sample_size and len(data_sample) > sample_size:
            for pair in data_sample[sample_size:]:
                pair_distance = core.fieldDistances([pair], data_model)
                score = core.scorePairs(pair_distance, data_model)

                if score < confidence:
                    yield pair
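The generator above works in two phases: it batch-scores an initial slice of sample_size pairs, then scores the remainder one at a time only if the slice did not yield enough confident pairs. A self-contained sketch of the same pattern, with a toy score function in place of core.fieldDistances and core.scorePairs:

def two_phase_filter(items, score, confidence, bulk_size):
    # Phase 1: score an initial slice in a single batch.
    head = items[:bulk_size]
    found = 0
    for item, s in zip(head, [score(i) for i in head]):
        if s < confidence:
            yield item
            found += 1
    # Phase 2: fall back to one-at-a-time scoring only if needed.
    if found < bulk_size and len(items) > bulk_size:
        for item in items[bulk_size:]:
            if score(item) < confidence:
                yield item

low = list(two_phase_filter(list(range(20)), lambda x: x / 20.0, 0.3, 5))
# low == [0, 1, 2, 3, 4]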
Example #3
def findUncertainPairs(record_distances, data_model):

    probability = core.scorePairs(record_distances, data_model)

    # Negative Shannon entropy of each score; use log2 in both terms.
    uncertainties = (probability * numpy.log2(probability)
                     + (1 - probability) * numpy.log2(1 - probability))

    return numpy.argsort(uncertainties)
Example #4
def findUncertainPairs(field_distances, data_model):
    """
    Given a set of field distances and a data model, return the
    indices of the record pairs in order of uncertainty. For example,
    the first index corresponds to the record pair about which we are
    least certain whether it is a duplicate or distinct.
    """

    probability = core.scorePairs(field_distances, data_model)

    uncertainties = (probability * numpy.log2(probability) 
                     + (1 - probability) * numpy.log2(1 - probability))

    return numpy.argsort(uncertainties)
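The quantity being sorted is the negative Shannon entropy of each score, so probabilities closest to 0.5 rank first. A quick numpy-only check on synthetic probabilities:

import numpy

probability = numpy.array([0.05, 0.5, 0.9, 0.45, 0.99])
uncertainties = (probability * numpy.log2(probability)
                 + (1 - probability) * numpy.log2(1 - probability))

# The most uncertain scores (nearest 0.5) sort first:
print(numpy.argsort(uncertainties))  # [1 3 2 0 4]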
Example #5
def findUncertainPairs(field_distances, data_model, bias=0.5):
    """
    Given a set of field distances and a data model, return the
    indices of the record pairs in order of uncertainty. For example,
    the first index corresponds to the record pair about which we are
    least certain whether it is a duplicate or distinct.
    """

    probability = core.scorePairs(field_distances, data_model)

    p_max = (1.0 - bias)
    logging.info("p_max: %s", p_max)

    # Piecewise-linear informativity: rises to 1.0 at p_max, falls back
    # to 0 at a probability of 1; negating puts the peak first in the sort.
    informativity = numpy.copy(probability)
    informativity[probability < p_max] /= p_max
    informativity[probability >= p_max] = (
        1 - probability[probability >= p_max]) / (1 - p_max)

    return numpy.argsort(-informativity)
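This variant swaps entropy for a piecewise-linear informativity ramp that peaks at p_max, so the negated argsort puts pairs scored nearest p_max first. A numpy-only sketch on synthetic probabilities:

import numpy

probability = numpy.array([0.1, 0.5, 0.7, 0.9])
p_max = 0.5  # bias = 0.5

informativity = numpy.copy(probability)
informativity[probability < p_max] /= p_max
informativity[probability >= p_max] = (
    1 - probability[probability >= p_max]) / (1 - p_max)

# informativity is [0.2, 1.0, 0.6, 0.2]; nearest-to-p_max ranks first:
print(numpy.argsort(-informativity))  # [1 2 0 3]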
Example #6
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7):

    # This is an expensive call, and we're making it multiple times.
    pairs = allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)

    confident_nondupes_ids = []
    scored_pairs = core.scorePairs(record_distances, data_model)

    for (i, score) in enumerate(scored_pairs):
        if score < 1 - nonduplicate_confidence_threshold:
            confident_nondupes_ids.append(record_distances['pairs'][i])

    confident_nondupes_pairs = [(data_d[pair[0]], data_d[pair[1]])
                                for pair in
                                confident_nondupes_ids]

    return confident_nondupes_pairs
Example #7
def findUncertainPairs(field_distances, data_model, bias=0.5):
    """
    Given a set of field distances and a data model, return the
    indices of the record pairs in order of uncertainty. For example,
    the first index corresponds to the record pair about which we are
    least certain whether it is a duplicate or distinct.
    """

    probability = core.scorePairs(field_distances, data_model)

    p_max = (1.0 - bias)
    logger.info("p_max: %s", p_max)

    informativity = numpy.copy(probability)
    informativity[probability < p_max] /= p_max
    informativity[probability >= p_max] = (
        1 - probability[probability >= p_max]) / (1 - p_max)

    return numpy.argsort(-informativity)
Example #8
def semiSupervisedNonDuplicates(data_sample,
                                data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    if len(data_sample) <= sample_size:
        return data_sample

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in data_sample:

        pair_distance = core.fieldDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        if score < 1 - nonduplicate_confidence_threshold:
            (key_pair, value_pair) = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                return confident_distinct_pairs

    # Fall through: return whatever was found, rather than None.
    return confident_distinct_pairs
Example #9
    def goodThreshold(self, blocks, recall_weight=1.5):
        """
        Returns the threshold that maximizes the expected F score,
        a weighted average of precision and recall for a sample of
        blocked data. 

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a blocking
                         predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        candidates = (pair
                      for block in blocks
                      for pair in itertools.combinations(block, 2))

        record_distances = core.recordDistances(candidates, self.data_model)
        probability = core.scorePairs(record_distances, self.data_model)

        probability.sort()
        probability = probability[::-1]

        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        # Weighted F-score, up to a constant factor that cannot change
        # the argmax: recall_weight > 1 favors recall over precision.
        score = recall * precision / (recall + recall_weight ** 2 * precision)

        i = numpy.argmax(score)

        logging.info("Maximum expected recall and precision")
        logging.info("recall: %2.3f" % recall[i])
        logging.info("precision: %2.3f" % precision[i])
        logging.info("With threshold: %2.3f" % probability[i])

        return probability[i]
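The cumulative-sum trick is worth seeing in isolation: with scores sorted in descending order, expected_dupes[k] is the expected number of true duplicates among the k+1 highest-scoring pairs, which yields recall and precision at every candidate threshold in a single pass. A toy run on synthetic scores (the data is made up):

import numpy

probability = numpy.sort(numpy.random.uniform(size=1000))[::-1]
recall_weight = 1.5

expected_dupes = numpy.cumsum(probability)
recall = expected_dupes / expected_dupes[-1]
precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

# Weighted F-score up to a constant factor; the constant cannot move
# the argmax, so the chosen threshold is unchanged.
score = recall * precision / (recall + recall_weight ** 2 * precision)
threshold = probability[numpy.argmax(score)]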