Example 1
0
    def duplicateClusters(self, blocks, threshold=0.5):
        """
        Partition blocked data and return a list of clusters, where
        each cluster is a tuple of record ids.

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate

        threshold --  Number between 0 and 1 (default is .5). Record
                      pairs are considered duplicates only if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall,
                      raising it will increase precision.
        """

        # Scaling the match threshold down for clustering is a
        # heuristic, not principled, but it reliably helps performance.
        cluster_threshold = 0.7 * threshold

        key_blocks, record_blocks = core.split(
            (block.keys(), block.values()) for block in blocks)

        paired_keys = core.blockedPairs(key_blocks)
        paired_records = core.blockedPairs(record_blocks)

        # Score every candidate pair and keep the results on the
        # instance so callers can inspect them after clustering.
        self.dupes = core.scoreDuplicates(paired_keys,
                                          paired_records,
                                          self.data_model,
                                          threshold)

        return clustering.cluster(self.dupes, cluster_threshold)
Example 2
0
    def duplicateClusters(self,
                          blocks,
                          data,
                          constrained_matching=False,
                          threshold=.5):
        """
        Partition blocked data and return a list of clusters, where
        each cluster is a tuple of record ids.

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate

        data --       Records used when constrained matching is
                      requested (passed through to the pair
                      generator; exact semantics live in core --
                      TODO confirm).

        constrained_matching -- If True, pair generation and
                      clustering run in their constrained variants.

        threshold --  Number between 0 and 1 (default is .5). Record
                      pairs are considered duplicates only if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall,
                      raising it will increase precision.
        """

        # Scaling the match threshold down for clustering is a
        # heuristic, not principled, but it reliably helps performance.
        cluster_threshold = 0.7 * threshold

        key_blocks, record_blocks = core.split(
            (block.keys(), block.values()) for block in blocks)

        paired_keys = core.blockedPairs(key_blocks,
                                        constrained_matching,
                                        data)
        paired_records = core.blockedPairs(record_blocks,
                                           constrained_matching)

        # Score every candidate pair and keep the results on the
        # instance so callers can inspect them after clustering.
        self.dupes = core.scoreDuplicates(paired_keys,
                                          paired_records,
                                          self.data_model,
                                          threshold)

        if constrained_matching:
            return clustering.clusterConstrained(self.dupes,
                                                 cluster_threshold)
        return clustering.cluster(self.dupes, cluster_threshold)
Example 3
0
    def goodThreshold(self,
                      blocks,
                      constrained_matching=False,
                      recall_weight=1.5):
        """
        Return the threshold that maximizes the expected F score,
        a weighted average of precision and recall, for a sample of
        blocked data.

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a
                         blocking predicate

        constrained_matching -- If True, candidate pairs are generated
                         with the constrained variant.

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        record_blocks = (block.values() for block in blocks)

        candidates = core.blockedPairs(record_blocks, constrained_matching)

        distances = core.fieldDistances(candidates, self.data_model)
        probability = core.scorePairs(distances, self.data_model)

        # Order scores from most to least likely duplicate.
        probability = sorted(probability, reverse=True)

        # Cutting at rank k yields sum(p[:k]) expected true dupes.
        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        # Weighted F-measure; recall_weight**2 follows the F-beta form.
        score = recall * precision / (recall + recall_weight ** 2 * precision)

        best = numpy.argmax(score)

        logging.info('Maximum expected recall and precision')
        logging.info('recall: %2.3f', recall[best])
        logging.info('precision: %2.3f', precision[best])
        logging.info('With threshold: %2.3f', probability[best])

        return probability[best]
Example 4
0
    def goodThreshold(self, blocks, recall_weight=1.5):
        """
        Return the threshold that maximizes the expected F score,
        a weighted average of precision and recall, for a sample of
        blocked data.

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a
                         blocking predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        record_blocks = (block.values() for block in blocks)

        candidates = core.blockedPairs(record_blocks)

        distances = core.fieldDistances(candidates, self.data_model)
        probability = core.scorePairs(distances, self.data_model)

        # Order scores from most to least likely duplicate.
        probability = sorted(probability, reverse=True)

        # Cutting at rank k yields sum(p[:k]) expected true dupes.
        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        # Weighted F-measure; recall_weight**2 follows the F-beta form.
        score = recall * precision / (recall + recall_weight ** 2 * precision)

        best = numpy.argmax(score)

        logging.info("Maximum expected recall and precision")
        logging.info("recall: %2.3f", recall[best])
        logging.info("precision: %2.3f", precision[best])
        logging.info("With threshold: %2.3f", probability[best])

        return probability[best]