Example #1
    def matchBlocks(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """
        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        candidate_records = self._blockedPairs(blocks)
        
        self.matches = core.scoreDuplicates(candidate_records,
                                            self.data_model,
                                            self.num_processes,
                                            threshold)

        clusters = self._cluster(self.matches, cluster_threshold)
        
        return clusters
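A minimal usage sketch for this version of the API. The names here
(`deduper`, `my_blocks`) are hypothetical stand-ins, and the training and
blocking steps that a real dedupe session needs are omitted:

# Hypothetical sketch: `deduper` is assumed to be an already-trained
# matcher exposing the matchBlocks method above, and `my_blocks` a
# sequence of record tuples produced by its blocking step.
clusters = deduper.matchBlocks(my_blocks, threshold=0.5)
for cluster in clusters:
    print(cluster)  # each cluster is a tuple of record ids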
Example #2
    def duplicateClusters(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        candidates = (pair for block in blocks for pair in itertools.combinations(block, 2))
        self.dupes = core.scoreDuplicates(candidates, self.data_model, threshold)
        clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
Example #3
    def matchBlocks(self, blocks, threshold=0.5, *args, **kwargs):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:

        blocks -- Sequence of tuples of records, where each tuple is a
                  set of records covered by a blocking predicate

        threshold -- Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall;
                      raising it will increase precision.

        """
        candidate_records = self._blockedPairs(blocks)

        matches = core.scoreDuplicates(candidate_records, self.data_model, self.classifier, self.num_cores, threshold)

        logger.debug("matching done, begin clustering")

        clusters = self._cluster(matches, threshold, *args, **kwargs)

        try:
            match_file = matches.filename
            del matches
            os.remove(match_file)
        except AttributeError:
            pass

        return clusters
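The try/except at the end cleans up after scoring runs that spill to disk:
the code implies `matches` may come back as a numpy memmap, which carries
the path of its backing file in `.filename`, while an in-memory ndarray has
no such attribute. A self-contained illustration of the idiom (the file
name and shape here are made up):

import os
import tempfile

import numpy

# A memmap keeps its backing path in .filename; deleting the array
# first releases the mapping, then the file can be removed. For a
# plain ndarray, the attribute lookup raises AttributeError and the
# cleanup is a harmless no-op.
path = os.path.join(tempfile.mkdtemp(), 'matches.dat')
matches = numpy.memmap(path, dtype='float64', mode='w+', shape=(4,))

try:
    match_file = matches.filename
    del matches
    os.remove(match_file)
except AttributeError:
    pass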
Example #4
    def duplicateClusters(self, blocks, threshold=0.5, parallel=False):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7
        candidates = (pair for block in blocks for pair in itertools.combinations(block, 2))

        if parallel:
            global globalThreshold
            globalThreshold = threshold
            global globalDataModel
            globalDataModel = self.data_model

            pool = Pool(processes=self.processes)

            start = time.time()
            self.dupes = itertools.chain.from_iterable(
                pool.imap(_mapScoreDuplicates, self._splitEvery(100, candidates))
            )
            elapsed = time.time() - start
            print "Parallel scoreDuplicates with", self.processes, "processes takes :", elapsed
        else:
            start = time.time()
            self.dupes = core.scoreDuplicates(candidates, self.data_model, threshold)
            elapsed = time.time() - start
            print "Serial scoreDuplicates takes : ", elapsed

        clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
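`_splitEvery` is referenced above but not shown in this listing. A minimal
sketch of what such a batching helper could look like (this reconstruction
is an assumption, not necessarily the original implementation):

from itertools import islice

def _splitEvery(n, iterable):
    # Yield successive lists of at most n items, so each pool worker
    # scores a bounded batch of candidate pairs per imap call.
    iterator = iter(iterable)
    chunk = list(islice(iterator, n))
    while chunk:
        yield chunk
        chunk = list(islice(iterator, n))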
Example #5
    def duplicateClusters(self,
                          blocks,
                          data,
                          constrained_matching=False,
                          threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        blocked_keys, blocked_records = core.split(
            (block.keys(), block.values()) for block in blocks)

        candidate_keys = core.blockedPairs(blocked_keys, constrained_matching,
                                           data)
        candidate_records = core.blockedPairs(blocked_records,
                                              constrained_matching)

        self.dupes = core.scoreDuplicates(candidate_keys, candidate_records,
                                          self.data_model, threshold)

        if constrained_matching:
            clusters = clustering.clusterConstrained(self.dupes,
                                                     cluster_threshold)
        else:
            clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
Example #6
    def duplicateClusters(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        
        blocked_keys, blocked_records = core.split((block.keys(),
                                                    block.values())
                                                   for block in blocks)

        candidate_keys = core.blockedPairs(blocked_keys)
        candidate_records = core.blockedPairs(blocked_records)

        candidate_keys, ids = itertools.tee(candidate_keys)
        peek = ids.next()
        id_type = type(peek[0])
        ids = itertools.chain([peek], ids)
        
        self.dupes = core.scoreDuplicates(candidate_keys,
                                          candidate_records,
                                          id_type,
                                          self.data_model,
                                          threshold)
        clusters = clustering.cluster(self.dupes, id_type, cluster_threshold)

        return clusters
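The `tee` dance above peeks at the first candidate pair to learn the
record-id type without losing it: the iterator is duplicated, one copy is
advanced a single step, and the consumed item is chained back on. A
self-contained Python 3 illustration of the same pattern (the ids are made
up; the snippet above uses the Python 2 spelling `ids.next()`):

import itertools

candidate_keys = iter([('a1', 'b2'), ('a3', 'b4')])

candidate_keys, ids = itertools.tee(candidate_keys)
peek = next(ids)                     # pull one pair off to inspect it
id_type = type(peek[0])              # the record-id type, str here
ids = itertools.chain([peek], ids)   # push the peeked pair back on

print(id_type)    # <class 'str'>
print(list(ids))  # the full sequence, nothing lost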
Example #7
    def matchBlocks(self,
                    blocks,
                    threshold=.5,
                    *args,
                    **kwargs):  # pragma: no cover
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """
        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        candidate_records = self._blockedPairs(blocks)

        matches = core.scoreDuplicates(candidate_records, self.data_model,
                                       self.classifier, self.num_cores,
                                       threshold)

        logger.debug("matching done, begin clustering")

        clusters = self._cluster(matches, cluster_threshold, *args, **kwargs)

        try:
            match_file = matches.filename
            del matches
            os.remove(match_file)
        except AttributeError:
            pass

        return clusters
Example #8
    def thresholdBlocks(self, blocks, recall_weight=1.5):  # pragma: no cover
        """
        Returns the threshold that maximizes the expected F score, a
        weighted average of precision and recall for a sample of
        blocked data.

        Arguments:

        blocks -- Sequence of tuples of records, where each tuple is a
                  set of records covered by a blocking predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. For example, if you care twice as
                         much about recall as about precision, set
                         recall_weight to 2.

        """
        candidate_records = itertools.chain.from_iterable(
            self._blockedPairs(blocks))

        probability = core.scoreDuplicates(candidate_records, self.data_model,
                                           self.classifier,
                                           self.num_cores)['score']

        probability = probability.copy()
        probability.sort()
        probability = probability[::-1]

        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        score = recall * precision / (recall + recall_weight**2 * precision)

        i = numpy.argmax(score)

        logger.info('Maximum expected recall and precision')
        logger.info('recall: %2.3f', recall[i])
        logger.info('precision: %2.3f', precision[i])
        logger.info('With threshold: %2.3f', probability[i])

        return probability[i]
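The `score` expression is the F-measure up to a constant factor: with
beta = recall_weight, F_beta = (1 + beta^2) * P * R / (beta^2 * P + R),
and the code drops the (1 + beta^2) factor because it does not change the
argmax. A self-contained toy run of the same computation (the scores are
made up):

import numpy

# Sorted descending, as if scoreDuplicates had returned these.
probability = numpy.array([.95, .9, .8, .6, .4, .3, .1])
recall_weight = 1.5

expected_dupes = numpy.cumsum(probability)
recall = expected_dupes / expected_dupes[-1]
precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

# Proportional to F_beta; the dropped constant does not move the argmax.
score = recall * precision / (recall + recall_weight ** 2 * precision)

i = numpy.argmax(score)
print(probability[i])  # the threshold chosen for this toy data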
Example #9
    def thresholdBlocks(self, blocks, recall_weight=1.5):  # pragma: no cover
        """
        Returns the threshold that maximizes the expected F score, a
        weighted average of precision and recall for a sample of
        blocked data.

        Arguments:

        blocks -- Sequence of tuples of records, where each tuple is a
                  set of records covered by a blocking predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. For example, if you care twice as
                         much about recall as about precision, set
                         recall_weight to 2.

        """
        candidate_records = itertools.chain.from_iterable(self._blockedPairs(blocks))

        probability = core.scoreDuplicates(candidate_records,
                                           self.data_model,
                                           self.classifier,
                                           self.num_cores)['score']

        probability = probability.copy()
        probability.sort()
        probability = probability[::-1]

        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        score = recall * precision / (recall + recall_weight ** 2 * precision)

        i = numpy.argmax(score)

        logger.info('Maximum expected recall and precision')
        logger.info('recall: %2.3f', recall[i])
        logger.info('precision: %2.3f', precision[i])
        logger.info('With threshold: %2.3f', probability[i])

        return probability[i]
Example #10
    def matchBlocks(self, blocks, threshold=.5, *args, **kwargs):
        """
        Partitions blocked data and generates a sequence of clusters,
        where each cluster is a tuple of record ids

        Keyword arguments:

        blocks -- Sequence of tuples of records, where each tuple is a
                  set of records covered by a blocking predicate

        threshold -- Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall;
                      raising it will increase precision.

        """
        candidate_records = itertools.chain.from_iterable(
            self._blockedPairs(blocks))

        matches = core.scoreDuplicates(candidate_records,
                                       self.data_model,
                                       self.classifier,
                                       self.num_cores,
                                       threshold=0)

        logger.debug("matching done, begin clustering")

        for cluster in self._cluster(matches, threshold, *args, **kwargs):
            yield cluster

        try:
            match_file = matches.filename
            del matches
            os.remove(match_file)
        except AttributeError:
            pass
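Unlike the earlier versions, this one is a generator: clusters stream out
as they are found instead of being collected into a list, and scoring runs
with threshold=0 so that all thresholding happens in the clustering step.
A hedged usage sketch (`deduper` and `my_blocks` are assumed to exist as
in the sketch after Example #1):

# Hypothetical sketch: iterate lazily; note that the temp-file cleanup
# after the yield loop only runs once the generator is fully exhausted.
for cluster in deduper.matchBlocks(my_blocks, threshold=0.5):
    print(cluster)  # a tuple of record ids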
Example #11
    def thresholdBlocks(self, blocks, recall_weight=1.5):
        """
        Returns the threshold that maximizes the expected F score,
        a weighted average of precision and recall for a sample of
        blocked data. 

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a blocking
                         predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. For example, if you care twice as
                         much about recall as about precision, set
                         recall_weight to 2.
        """

        probability = core.scoreDuplicates(self._blockedPairs(blocks), self.data_model, self.num_processes)["score"]

        probability.sort()
        probability = probability[::-1]

        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        score = recall * precision / (recall + recall_weight ** 2 * precision)

        i = numpy.argmax(score)

        logger.info("Maximum expected recall and precision")
        logger.info("recall: %2.3f", recall[i])
        logger.info("precision: %2.3f", precision[i])
        logger.info("With threshold: %2.3f", probability[i])

        return probability[i]
Example #12
    def matchBlocks(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than
                      the threshold.

                      Lowering the number will increase recall; raising it
                      will increase precision.
        """
        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        candidate_records = self._blockedPairs(blocks)
        
        self.matches = core.scoreDuplicates(candidate_records,
                                            self.data_model,
                                            self.num_processes,
                                            threshold)

        logger.info("matching done, begin clustering")

        clusters = self._cluster(self.matches, 
                                 cluster_threshold)
        
        return clusters
Example #13
# Module-level worker for multiprocessing.Pool.imap (see Example #4):
# scores one batch of candidate pairs using the module globals set by
# the caller before the pool was started.
def _mapScoreDuplicates(candidates):
    return core.scoreDuplicates(candidates, globalDataModel, globalThreshold)

print len(candidates),
print "comparisons."

print "Learned Weights"
for k1, v1 in data_model.items():
    try:
        for k2, v2 in v1.items():
            print (k2, v2['weight'])
    except:
        print (k1, v1)

print ""

print "finding duplicates ..."
print ""
dupes = core.scoreDuplicates(candidates, data_d, data_model, .5)
clustered_dupes = clustering.cluster(dupes, estimated_dupe_fraction=0.4)

print "# duplicate sets"
print len(clustered_dupes)

orig_data = {}
with open(inputFile) as f:
    reader = csv.reader(f)
    reader.next()
    for row_id, row in enumerate(reader):
        orig_data[row_id] = row

with open("output/TL_dupes_list_" + str(time.time()) + ".csv", "w") as f:
    writer = csv.writer(f)