def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):
    """
    Return a sample of record pairs the model scores as confidently distinct.

    Arguments:
    data_d -- mapping of record id to record (iterated with .iteritems(),
              i.e. this code path assumes Python 2)
    data_model -- model passed through to core.recordDistances/scorePairs
    nonduplicate_confidence_threshold -- a pair is kept when its match
        score is below (1 - this threshold)
    sample_size -- maximum number of pairs to return

    If there are no more than sample_size possible pairs, all pair
    combinations are returned unscored.
    """
    pair_combinations = list(combinations(data_d.iteritems(), 2))

    if len(pair_combinations) <= sample_size:
        return pair_combinations

    # Randomize so the confident sample is not biased by dict order.
    shuffle(pair_combinations)

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in pair_combinations:
        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        if score < (1 - nonduplicate_confidence_threshold):
            # Drop the record keys, keep only the record values.
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                break

    # Bug fix: the original only returned inside the loop, so when fewer
    # than sample_size confident pairs existed it fell through and
    # implicitly returned None. Always return what was collected.
    return confident_distinct_pairs
def distinctPairs():
    """
    Yield pairs from data_sample whose match score is below confidence.

    The first sample_size pairs are scored in one batch; if that batch
    yields fewer than sample_size pairs, the remainder of data_sample is
    scored one pair at a time.
    """
    head = data_sample[0:sample_size]
    head_distances = core.fieldDistances(head, data_model)
    head_scores = core.scorePairs(head_distances, data_model)

    yielded = 0
    for pair_score, candidate in zip(head_scores, data_sample):
        if pair_score < confidence:
            yield candidate
            yielded += 1

    if yielded < sample_size and len(data_sample) > sample_size:
        for candidate in data_sample[sample_size:]:
            candidate_distance = core.fieldDistances([candidate], data_model)
            candidate_score = core.scorePairs(candidate_distance, data_model)
            if candidate_score < confidence:
                yield candidate
def distinctPairs():
    """
    Generator of likely-distinct pairs from data_sample.

    Scores a leading slice of sample_size pairs in bulk, then falls back
    to per-pair scoring over the tail only when the bulk pass produced
    fewer than sample_size results.
    """
    bulk_distances = core.fieldDistances(data_sample[0:sample_size],
                                         data_model)
    bulk_scores = core.scorePairs(bulk_distances, data_model)

    emitted = 0
    for s, p in zip(bulk_scores, data_sample):
        if s < confidence:
            yield p
            emitted += 1

    need_more = emitted < sample_size and len(data_sample) > sample_size
    if need_more:
        for p in data_sample[sample_size:]:
            one_distance = core.fieldDistances([p], data_model)
            one_score = core.scorePairs(one_distance, data_model)
            if one_score < confidence:
                yield p
def findUncertainPairs(record_distances, data_model):
    """
    Return indices of record pairs ordered from least to most certain.

    Uncertainty is the (negative) binary entropy of each pair's match
    probability; since the entropy expression below is <= 0 and most
    negative where probability is nearest 0.5, argsort puts the least
    certain pairs first.
    """
    probability = core.scorePairs(record_distances, data_model)

    # Bug fix: the original mixed numpy.log2 and numpy.log across the two
    # entropy terms, skewing the ordering. Use base 2 for both terms
    # (consistent with the other findUncertainPairs variant in this file).
    uncertainties = (probability * numpy.log2(probability)
                     + (1 - probability) * numpy.log2(1 - probability))

    return numpy.argsort(uncertainties)
def findUncertainPairs(field_distances, data_model):
    """
    Given a set of field distances and a data model return the indices
    of the record pairs in order of uncertainty. For example, the first
    indices corresponds to the record pair where we have the least
    certainty whether the pair are duplicates or distinct.
    """
    match_probability = core.scorePairs(field_distances, data_model)
    nonmatch_probability = 1 - match_probability

    # Negative binary entropy: most negative where probability ~ 0.5,
    # so an ascending argsort puts the least certain pairs first.
    neg_entropy = (match_probability * numpy.log2(match_probability)
                   + nonmatch_probability * numpy.log2(nonmatch_probability))

    return numpy.argsort(neg_entropy)
def findUncertainPairs(field_distances, data_model, bias=0.5):
    """
    Given a set of field distances and a data model return the indices
    of the record pairs in order of uncertainty. For example, the first
    indices corresponds to the record pair where we have the least
    certainty whether the pair are duplicates or distinct.
    """
    probability = core.scorePairs(field_distances, data_model)

    p_max = 1.0 - bias
    logging.info(p_max)

    # Piecewise-linear weighting that peaks at p_max: scores below p_max
    # are scaled up toward 1; scores at or above are scaled back down.
    below = probability < p_max
    at_or_above = probability >= p_max

    informativity = numpy.copy(probability)
    informativity[below] /= p_max
    informativity[at_or_above] = (1 - probability[at_or_above]) / (1 - p_max)

    # Negate so argsort puts the most informative pairs first.
    return numpy.argsort(-informativity)
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7):
    """
    Return the record pairs the model scores as confidently distinct.

    A pair is considered a confident non-duplicate when its match score
    falls below (1 - nonduplicate_confidence_threshold).
    """
    # this is an expensive call and we're making it multiple times
    pairs = allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)

    scored_pairs = core.scorePairs(record_distances, data_model)
    cutoff = 1 - nonduplicate_confidence_threshold

    confident_nondupes_ids = [record_distances['pairs'][i]
                              for i, score in enumerate(scored_pairs)
                              if score < cutoff]

    # Resolve ids back to the underlying records.
    return [(data_d[pair[0]], data_d[pair[1]])
            for pair in confident_nondupes_ids]
def findUncertainPairs(field_distances, data_model, bias=0.5):
    """
    Given a set of field distances and a data model return the indices
    of the record pairs in order of uncertainty. For example, the first
    indices corresponds to the record pair where we have the least
    certainty whether the pair are duplicates or distinct.
    """
    probability = core.scorePairs(field_distances, data_model)

    p_max = (1.0 - bias)
    logger.info(p_max)

    # Tent-shaped weighting that peaks at p_max: probabilities below p_max
    # ramp up linearly to 1, probabilities at or above ramp back down.
    informativity = numpy.where(probability < p_max,
                                probability / p_max,
                                (1 - probability) / (1 - p_max))

    # Sort descending by informativity via negation.
    return numpy.argsort(-informativity)
def semiSupervisedNonDuplicates(data_sample, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):
    """
    Return up to sample_size pairs from data_sample that the model scores
    as confidently distinct.

    Arguments:
    data_sample -- sequence of record pairs (each pair zips into a key
                   pair and a value pair)
    data_model -- model passed through to core.fieldDistances/scorePairs
    nonduplicate_confidence_threshold -- a pair is kept when its match
        score is below (1 - this threshold)
    sample_size -- maximum number of pairs to return

    If data_sample already has no more than sample_size pairs it is
    returned as-is, unscored.
    """
    if len(data_sample) <= sample_size:
        return data_sample

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in data_sample:
        pair_distance = core.fieldDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        if score < 1 - nonduplicate_confidence_threshold:
            # Keep only the record values, dropping the keys.
            (key_pair, value_pair) = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size:
                break

    # Bug fix: the original returned only when sample_size was reached,
    # so with fewer confident pairs it implicitly returned None. Always
    # return the collected pairs.
    return confident_distinct_pairs
def goodThreshold(self, blocks, recall_weight=1.5):
    """
    Returns the threshold that maximizes the expected F score, a weighted
    average of precision and recall for a sample of blocked data.

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a set of
              records covered by a blocking predicate
    recall_weight -- Sets the tradeoff between precision and recall. I.e.
                     if you care twice as much about recall as you do
                     precision, set recall_weight to 2.
    """
    candidate_pairs = (pair
                       for block in blocks
                       for pair in itertools.combinations(block, 2))

    record_distances = core.recordDistances(candidate_pairs,
                                            self.data_model)
    probability = core.scorePairs(record_distances, self.data_model)

    # Descending order of match probability.
    probability = numpy.sort(probability)[::-1]

    # Treating each probability as an expected duplicate count, sweep a
    # threshold down the sorted scores and track recall/precision.
    expected_dupes = numpy.cumsum(probability)
    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    # F-beta-style score; the constant (1 + w^2) factor is irrelevant to
    # the argmax, so it is omitted.
    score = recall * precision / (recall + recall_weight ** 2 * precision)
    best = numpy.argmax(score)

    logging.info("Maximum expected recall and precision")
    logging.info("recall: %2.3f" % recall[best])
    logging.info("precision: %2.3f" % precision[best])
    logging.info("With threshold: %2.3f" % probability[best])

    return probability[best]