Example #1
    def duplicateClusters(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than the
                      threshold.

                      Lowering the number will increase recall, raising it
                      will increase precision
                              

        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        candidates = (pair for block in blocks
                      for pair in itertools.combinations(block, 2))
        self.dupes = core.scoreDuplicates(candidates, self.data_model, threshold)
        clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
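A minimal usage sketch for this variant; `deduper` and `blocked_data` are placeholder names, and only the (blocks, threshold) call shape comes from the method above:

# Hypothetical usage: `deduper` stands for an object exposing the method
# above, and `blocked_data` for the output of a blocking step.
clusters = deduper.duplicateClusters(blocked_data, threshold=0.5)

# A lower threshold trades precision for recall; a higher one does the
# opposite, as the docstring notes.
for cluster in clusters:
    print(cluster)    # each cluster is a tuple of record ids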
Example #2
    def duplicateClusters(self, blocks, threshold=0.5, parallel=False):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than the
                      threshold.

                      Lowering the number will increase recall, raising it
                      will increase precision
                              

        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7
        candidates = (pair for block in blocks
                      for pair in itertools.combinations(block, 2))

        if parallel:
            # Stash the threshold and data model in module-level globals so
            # the worker function _mapScoreDuplicates can read them.
            global globalThreshold
            globalThreshold = threshold
            global globalDataModel
            globalDataModel = self.data_model

            pool = Pool(processes=self.processes)

            start = time.time()
            self.dupes = itertools.chain.from_iterable(
                pool.imap(_mapScoreDuplicates, self._splitEvery(100, candidates))
            )
            elapsed = time.time() - start
            print "Parallel scoreDuplicates with", self.processes, "processes takes :", elapsed
        else:
            start = time.time()
            self.dupes = core.scoreDuplicates(candidates, self.data_model, threshold)
            elapsed = time.time() - start
            print "Serial scoreDuplicates takes : ", elapsed

        clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
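The parallel branch calls two helpers that are not shown in the snippet: `self._splitEvery`, which chunks the candidate stream, and a module-level `_mapScoreDuplicates` worker. The sketch below is inferred only from how they are used above; the bodies are assumptions, not the library's code (`core.scoreDuplicates`, `globalThreshold`, and `globalDataModel` are the names that appear in the snippet):

import itertools

def _splitEvery(self, n, iterable):
    # Method on the same class as duplicateClusters. Yields successive
    # lists of at most n candidate pairs, so each chunk becomes one task
    # for a worker process.
    iterator = iter(iterable)
    chunk = list(itertools.islice(iterator, n))
    while chunk:
        yield chunk
        chunk = list(itertools.islice(iterator, n))

def _mapScoreDuplicates(chunk):
    # Module-level so multiprocessing can pickle it; reads the threshold
    # and data model from the globals set in duplicateClusters, and
    # materializes the result so it can be sent back to the parent process.
    return list(core.scoreDuplicates(chunk, globalDataModel, globalThreshold))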
Example #3
    def duplicateClusters(self,
                          blocks,
                          data,
                          constrained_matching=False,
                          threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than the
                      threshold.

                      Lowering the number will increase recall, raising it
                      will increase precision
                              

        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        blocked_keys, blocked_records = core.split(
            (block.keys(), block.values()) for block in blocks)

        candidate_keys = core.blockedPairs(blocked_keys, constrained_matching,
                                           data)
        candidate_records = core.blockedPairs(blocked_records,
                                              constrained_matching)

        self.dupes = core.scoreDuplicates(candidate_keys, candidate_records,
                                          self.data_model, threshold)

        if constrained_matching:
            clusters = clustering.clusterConstrained(self.dupes,
                                                     cluster_threshold)
        else:
            clusters = clustering.cluster(self.dupes, cluster_threshold)

        return clusters
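`core.split` is used above to unzip a stream of (keys, values) pairs into two parallel streams. Its implementation is not shown; the sketch below is one common way to write such a helper with itertools.tee, offered as an assumption about its behavior rather than the library's actual code:

import itertools

def split(pairs):
    # Duplicate the stream, then take the first element of each pair from
    # one copy and the second element from the other.
    firsts, seconds = itertools.tee(pairs, 2)
    return ((key for key, _ in firsts),
            (value for _, value in seconds))

With a helper like that, blocked_keys would yield each block's record ids and blocked_records the corresponding records, in matching order.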
Example #4
    def duplicateClusters(self, blocks, threshold=.5):
        """
        Partitions blocked data and returns a list of clusters, where
        each cluster is a tuple of record ids

        Keyword arguments:
        blocks --     Sequence of tuples of records, where each
                      tuple is a set of records covered by a blocking
                      predicate
                                          
        threshold --  Number between 0 and 1 (default is .5). We will
                      only consider record pairs as duplicates if their
                      estimated duplicate likelihood is greater than the
                      threshold.

                      Lowering the number will increase recall, raising it
                      will increase precision
                              

        """

        # Setting the cluster threshold this way is not principled,
        # but seems to reliably help performance
        cluster_threshold = threshold * 0.7

        
        blocked_keys, blocked_records = core.split((block.keys(),
                                                    block.values())
                                                   for block in blocks)


        candidate_keys = core.blockedPairs(blocked_keys)
        candidate_records = core.blockedPairs(blocked_records)

        candidate_keys, ids = itertools.tee(candidate_keys)
        peek = ids.next()
        id_type = type(peek[0])
        ids = itertools.chain([peek], ids)
        
        self.dupes = core.scoreDuplicates(candidate_keys,
                                          candidate_records,
                                          id_type,
                                          self.data_model,
                                          threshold)
        clusters = clustering.cluster(self.dupes, id_type, cluster_threshold)

        return clusters
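The tee/peek sequence in this variant exists only to learn the type of the record ids without consuming the candidate generator. The same pattern in isolation, as a small self-contained illustration with made-up id pairs:

import itertools

candidate_keys = iter([(1, 2), (1, 3), (2, 3)])   # stand-in id pairs

# Fork the generator, look at the first pair on one branch to learn the id
# type, then stitch that pair back on so no candidates are lost.
candidate_keys, ids = itertools.tee(candidate_keys)
peek = next(ids)                   # ids.next() in the Python 2 code above
id_type = type(peek[0])
ids = itertools.chain([peek], ids)

assert id_type is int
assert list(ids) == [(1, 2), (1, 3), (2, 3)]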
print "comparisons."

print "Learned Weights"
for k1, v1 in data_model.items() :
  try:
    for k2, v2 in v1.items() :
      print (k2, v2['weight'])
  except :
    print (k1, v1)

print ""

print "finding duplicates ..."
print ""
dupes = core.scoreDuplicates(candidates, data_d, data_model, .5)
clustered_dupes = clustering.cluster(dupes, estimated_dupe_fraction = 0.4)

print "# duplicate sets"
print len(clustered_dupes)

orig_data = {}
with open(inputFile) as f :
  reader = csv.reader(f)
  reader.next()
  for row_id, row in enumerate(reader) :
    orig_data[row_id] = row
    

with open("output/TL_dupes_list_" + str(time.time()) + ".csv","w") as f :
  writer = csv.writer(f)
  heading_row = header
print "Learned Weights"
for k1, v1 in data_model.items() :
  try:
    for k2, v2 in v1.items() :
      print (k2, v2['weight'])
  except :
    print (k1, v1)

print ""

print "finding duplicates ..."
print ""

dupes = scoreDuplicates(candidates, data_d, data_model)
clustered_dupes = cluster(dupes, .2) 

# dupe_ids = set([frozenset(dupe_pair[0]) for dupe_pair in dupes])
# true_positives = dupe_ids & duplicates_s
# false_positives = dupe_ids - duplicates_s
# uncovered_dupes = duplicates_s - dupe_ids
# 
# print "False negatives" 
# for pair in uncovered_dupes :
#        print ""
#        for instance in tuple(pair) :
#          print data_d[instance].values()
# 
# print "____________________________________________"
# print "False positives" 
#
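The commented-out block above derives true positives, false positives, and uncovered duplicates from set arithmetic on `dupe_ids` and `duplicates_s`. A short sketch of how precision and recall follow from those same sets, assuming both are non-empty (variable names taken from the comments, the rest is illustrative):

true_positives = dupe_ids & duplicates_s
false_positives = dupe_ids - duplicates_s
uncovered_dupes = duplicates_s - dupe_ids

# float() keeps the division exact under Python 2 integer semantics.
precision = float(len(true_positives)) / len(dupe_ids)
recall = float(len(true_positives)) / len(duplicates_s)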