Esempio n. 1
0
    def duplicateClusters(self,
                          blocks,
                          pairwise_threshold = .5,
                          cluster_threshold = .5):
        """
        Score every within-block record pair and group the likely
        duplicates into clusters.

        Keyword arguments:
        blocks --             Iterable of blocks, where each block is a
                              collection of records covered by the same
                              blocking predicate. Every 2-combination of
                              records inside a block becomes a candidate
                              pair.
        pairwise_threshold -- Number between 0 and 1 (default is .5).
                              Forwarded to core.scoreDuplicates; a
                              candidate pair is only treated as a
                              duplicate when its estimated duplicate
                              likelihood is greater than this value.
        cluster_threshold --  Number between 0 and 1 (default is .5).
                              Forwarded to the hierarchical clustering
                              step. Lowering the number will increase
                              precision, raising it will increase recall.

        Returns the result of clustering.hierarchical.cluster: a list of
        clusters, where each cluster is a tuple of record ids.

        Side effect: the scored pairs are kept on self.dupes.
        """

        # Lazily flatten the per-block combinations into one stream of
        # candidate pairs so they are never all held in memory at once.
        pairs_by_block = (itertools.combinations(block, 2)
                          for block in blocks)
        candidate_pairs = itertools.chain.from_iterable(pairs_by_block)

        self.dupes = core.scoreDuplicates(candidate_pairs,
                                          self.data_model,
                                          pairwise_threshold)

        return clustering.hierarchical.cluster(self.dupes,
                                               cluster_threshold)
Esempio n. 2
0
def semiSupervisedNonDuplicates(data_sample,
                                data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size=2000):
    """
    Pick record pairs that the current model scores as confidently
    distinct.

    Keyword arguments:
    data_sample --  Sequence of candidate pairs. Each pair is indexed
                    as pair[0] and pair[1], and element [1] of each of
                    those is what ends up in the output.
    data_model --   Model handed to core.scoreDuplicates to score the
                    pairs.
    nonduplicate_confidence_threshold --
                    Number (default is .7). A pair is kept only when its
                    score is below 1 minus this value.
    sample_size --  Maximum number of pairs to return (default is
                    2000). At most twice this many pairs are scored.

    Returns a list of 2-tuples. A warning is logged when fewer than
    sample_size pairs qualify.

    NOTE(review): this assumes core.scoreDuplicates returns one row per
    input pair, in input order, so that row i describes data_sample[i]
    -- confirm against core.scoreDuplicates.
    """
    score_ceiling = 1 - nonduplicate_confidence_threshold

    # Nearly all possible combinations of pairs will not be
    # duplicates. With high probability there will be N distinct pairs
    # within a sample of size 2N, so there is no need to score more.
    scoring_budget = sample_size * 2
    if len(data_sample) > scoring_budget:
        data_sample = random.sample(data_sample, scoring_budget)

    scored = core.scoreDuplicates(data_sample, data_model, threshold=0)

    # Positions of the rows whose score falls under the ceiling.
    low_score_rows = numpy.where(scored['score'] < score_ceiling)[0]

    # Too many qualifying rows: keep a random subset, without repeats.
    if len(low_score_rows) > sample_size:
        low_score_rows = numpy.random.choice(low_score_rows,
                                             sample_size,
                                             replace=False)

    non_dupes = []
    for row in low_score_rows:
        pair = data_sample[row]
        non_dupes.append((pair[0][1], pair[1][1]))

    found = len(non_dupes)
    if found < sample_size:
        logging.warning("Only %d confidently distinct pairs for block training",
                        found)

    return non_dupes
Esempio n. 3
0
    def duplicateClusters(self, blocks, threshold=0.5):

        """
        Score every within-block record pair and group the likely
        duplicates into clusters.

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate. Every 2-combination of records inside
                      a tuple becomes a candidate pair.

        threshold --  Number between 0 and 1 (default is .5). A record
                      pair is only considered a duplicate if its
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall, raising
                      it will increase precision.

        Returns the result of clustering.hierarchical.cluster: a list of
        clusters, where each cluster is a tuple of record ids.

        Side effect: the scored pairs are kept on self.dupes.
        """
        # Deriving the clustering cutoff as a fixed fraction of the
        # pairwise threshold is not principled, but seems to reliably
        # help performance.
        clustering_cutoff = threshold * 0.7

        # Lazily flatten the per-block combinations into one stream.
        candidate_pairs = itertools.chain.from_iterable(
            itertools.combinations(block, 2) for block in blocks)

        self.dupes = core.scoreDuplicates(candidate_pairs,
                                          self.data_model,
                                          threshold)

        return clustering.hierarchical.cluster(self.dupes, clustering_cutoff)