def duplicateClusters(self,
                      blocks,
                      pairwise_threshold=.5,
                      cluster_threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Dictionary where the keys are blocking predicates
              and the values are tuples of records covered by that
              predicate.
    pairwise_threshold -- Number between 0 and 1 (default is .5).
                          We will only consider record pairs as
                          duplicates if their estimated duplicate
                          likelihood is greater than the pairwise
                          threshold.
    cluster_threshold -- Number between 0 and 1 (default is .5).
                         Lowering the number will increase
                         precision, raising it will increase
                         recall.
    """
    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    self.dupes = core.scoreDuplicates(candidates,
                                      self.data_model,
                                      pairwise_threshold)
    clusters = clustering.hierarchical.cluster(self.dupes,
                                               cluster_threshold)

    return clusters
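# A minimal, self-contained sketch of how the candidate generator
# above expands blocks into record pairs. `example_blocks` is
# invented for illustration; real blocks come out of the blocking
# step. Note that record 3 appears in both blocks, so downstream
# scoring and clustering must tolerate overlapping pairs.
import itertools

example_blocks = [(1, 2, 3), (3, 4)]
example_candidates = (pair
                      for block in example_blocks
                      for pair in itertools.combinations(block, 2))

print(list(example_candidates))  # [(1, 2), (1, 3), (2, 3), (3, 4)]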
def semiSupervisedNonDuplicates(data_sample,
                                data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):

    confidence = 1 - nonduplicate_confidence_threshold

    # Nearly all possible combinations of pairs will not be
    # duplicates. With high probability there will be N distinct
    # pairs within a sample of size 2N.
    if len(data_sample) > 2 * sample_size:
        data_sample = random.sample(data_sample, sample_size * 2)

    scores = core.scoreDuplicates(data_sample,
                                  data_model,
                                  threshold=0)

    indices = numpy.where(scores['score'] < confidence)[0]

    if len(indices) > sample_size:
        indices = numpy.random.choice(indices, sample_size,
                                      replace=False)

    non_dupes = [(data_sample[i][0][1], data_sample[i][1][1])
                 for i in indices]

    if len(non_dupes) < sample_size:
        logging.warning("Only %d confidently distinct pairs "
                        "for block training",
                        len(non_dupes))

    return non_dupes
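# A toy, self-contained sketch of the filtering step above. The
# structured array stands in for the output of core.scoreDuplicates,
# and the scores are invented for illustration: only pairs scoring
# below `1 - nonduplicate_confidence_threshold` are kept as
# confidently distinct.
import numpy

toy_scores = numpy.array([(0.05,), (0.90,), (0.20,), (0.40,)],
                         dtype=[('score', 'f4')])

toy_confidence = 1 - 0.7  # nonduplicate_confidence_threshold = .7

# Keep only the pairs we are confident are NOT duplicates.
distinct_indices = numpy.where(toy_scores['score'] < toy_confidence)[0]
print(distinct_indices)  # [0 2]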
def duplicateClusters(self, blocks, threshold=0.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate
    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider record pairs as duplicates if their
                 estimated duplicate likelihood is greater than the
                 threshold. Lowering the number will increase
                 recall, raising it will increase precision.
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance.
    cluster_threshold = threshold * 0.7

    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    self.dupes = core.scoreDuplicates(candidates,
                                      self.data_model,
                                      threshold)
    clusters = clustering.hierarchical.cluster(self.dupes,
                                               cluster_threshold)

    return clusters
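# A minimal sketch of the precision/recall trade-off noted in the
# docstring above: raising the threshold keeps only high-likelihood
# pairs (better precision), lowering it keeps more pairs (better
# recall). The pairs and scores are invented for illustration.
scored_pairs = [((1, 2), 0.9), ((1, 3), 0.6), ((2, 4), 0.3)]

example_threshold = 0.5
likely_dupes = [pair for pair, score in scored_pairs
                if score > example_threshold]

print(likely_dupes)  # [(1, 2), (1, 3)]; at 0.7 only (1, 2) survives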