def cluster(self, threshold):
    """Group the tokens of ``self.clusterDict`` into clusters of similar names.

    Every pair of tokens is compared case-insensitively with
    ``similarity()``.  Pairs scoring at least ``threshold`` are then
    processed best match first and folded into ``self.clusterBins``
    (cluster id -> list of token ids), while ``self.idClusterIndex`` maps
    each token id to the id of the cluster holding it.  New cluster ids are
    taken from ``self.clusterCount``, which is advanced as clusters are
    created.

    :param threshold: minimum similarity score for two tokens to be linked.
    """

    def cluster_of(token_id):
        # -1 marks "not assigned to any cluster yet".  Only a failed lookup
        # is treated that way; unrelated errors are allowed to propagate.
        try:
            return self.idClusterIndex[token_id]
        except LookupError:
            return -1

    # Min-heap keyed on (1.0 - score) so the best-scoring pair pops first.
    heap = []
    lowered = []  # lower-cased tokens of the ids visited so far
    for y in range(self.clusterDict.getSize()):
        token_y = self.clusterDict.getToken(y).lower()
        for x in range(y):
            c = similarity(lowered[x], token_y)
            if c >= threshold:
                heappush(heap, ((1.0 - c), [x, y]))
        lowered.append(token_y)
        # NOTE(review): placement reconstructed from flattened source --
        # assumed to run once per outer iteration; confirm.
        QtCore.QCoreApplication.processEvents()

    # Every id with a non-empty word and a count above one starts out as a
    # singleton cluster.
    for i in range(self.clusterDict.getSize()):
        word, count = self.clusterDict.getWordAndCount(i)
        if word and count > 1:
            self.clusterBins[self.clusterCount] = [i]
            self.idClusterIndex[i] = self.clusterCount
            self.clusterCount = self.clusterCount + 1

    while heap:
        _, pair = heappop(heap)
        match0 = cluster_of(pair[0])
        match1 = cluster_of(pair[1])

        if match0 == -1 and match1 == -1:
            # Neither item is in a cluster: make a new cluster for both.
            self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
            self.idClusterIndex[pair[0]] = self.clusterCount
            self.idClusterIndex[pair[1]] = self.clusterCount
            self.clusterCount = self.clusterCount + 1
        elif match0 >= 0 and match1 < 0:
            # Only the first item is clustered: pull the second one in.
            self.clusterBins[match0].append(pair[1])
            self.idClusterIndex[pair[1]] = match0
        elif match1 >= 0 and match0 < 0:
            # Only the second item is clustered: pull the first one in.
            self.clusterBins[match1].append(pair[0])
            self.idClusterIndex[pair[0]] = match1
        elif match1 != match0:
            # Both items sit in different clusters: merge the second
            # cluster into the first and drop it.
            absorbed = self.clusterBins[match1]
            self.clusterBins[match0].extend(absorbed)
            for match in absorbed:
                self.idClusterIndex[match] = match0
            del self.clusterBins[match1]
def test_correct(self):
    # Identical strings must score exactly 1.0.
    same_score = similarity(u"K!", u"K!")
    self.assertEqual(same_score, 1.0)

    # Strings sharing no characters must score exactly 0.0.
    disjoint_score = similarity(u"BBB", u"AAA")
    self.assertEqual(disjoint_score, 0.0)

    # One differing character out of three: compared to 0.7 after rounding
    # to one decimal place.
    near_score = similarity(u"ABC", u"ABB")
    self.assertAlmostEqual(near_score, 0.7, places=1)
def cluster(self, threshold):
    """Group the tokens of ``self.clusterDict`` into clusters of similar names.

    Every pair of tokens is compared case-insensitively with
    ``similarity()``.  Pairs scoring at least ``threshold`` are then
    processed best match first and folded into ``self.clusterBins``
    (cluster id -> list of token ids), while ``self.idClusterIndex`` maps
    each token id to the id of the cluster holding it.  New cluster ids are
    taken from ``self.clusterCount``, which is advanced as clusters are
    created.

    :param threshold: minimum similarity score for two tokens to be linked.
    """

    def cluster_of(token_id):
        # -1 marks "not assigned to any cluster yet".  Only a failed lookup
        # is treated that way; unrelated errors are allowed to propagate.
        try:
            return self.idClusterIndex[token_id]
        except LookupError:
            return -1

    # Min-heap keyed on (1.0 - score) so the best-scoring pair pops first.
    heap = []
    lowered = []  # lower-cased tokens of the ids visited so far
    for y in range(self.clusterDict.getSize()):
        token_y = self.clusterDict.getToken(y).lower()
        for x in range(y):
            c = similarity(lowered[x], token_y)
            if c >= threshold:
                heappush(heap, ((1.0 - c), [x, y]))
        lowered.append(token_y)
        # NOTE(review): placement reconstructed from flattened source --
        # assumed to run once per outer iteration; confirm.
        QtCore.QCoreApplication.processEvents()

    # Every id with a non-empty word and a count above one starts out as a
    # singleton cluster.
    for i in range(self.clusterDict.getSize()):
        word, count = self.clusterDict.getWordAndCount(i)
        if word and count > 1:
            self.clusterBins[self.clusterCount] = [i]
            self.idClusterIndex[i] = self.clusterCount
            self.clusterCount = self.clusterCount + 1

    while heap:
        _, pair = heappop(heap)
        match0 = cluster_of(pair[0])
        match1 = cluster_of(pair[1])

        if match0 == -1 and match1 == -1:
            # Neither item is in a cluster: make a new cluster for both.
            self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
            self.idClusterIndex[pair[0]] = self.clusterCount
            self.idClusterIndex[pair[1]] = self.clusterCount
            self.clusterCount = self.clusterCount + 1
        elif match0 >= 0 and match1 < 0:
            # Only the first item is clustered: pull the second one in.
            self.clusterBins[match0].append(pair[1])
            self.idClusterIndex[pair[1]] = match0
        elif match1 >= 0 and match0 < 0:
            # Only the second item is clustered: pull the first one in.
            self.clusterBins[match1].append(pair[0])
            self.idClusterIndex[pair[0]] = match1
        elif match1 != match0:
            # Both items sit in different clusters: merge the second
            # cluster into the first and drop it.
            absorbed = self.clusterBins[match1]
            self.clusterBins[match0].extend(absorbed)
            for match in absorbed:
                self.idClusterIndex[match] = match0
            del self.clusterBins[match1]
def cluster(self, threshold, tagger=None):
    """Group the tokens of ``self.cluster_dict`` into clusters of similar names.

    Every pair of tokens is compared case-insensitively with
    ``similarity()``.  Pairs scoring at least ``threshold`` are then
    processed best match first and folded into ``self.cluster_bins``
    (cluster id -> list of token ids), while ``self.index_id_cluster`` maps
    each token id to the id of the cluster holding it.  New cluster ids are
    taken from ``self.cluster_count``, which is advanced as clusters are
    created.

    :param threshold: minimum similarity score for two tokens to be linked.
    :param tagger: optional object; when given, its
        ``window.set_statusbar_message`` is called with a progress message
        at each checkpoint of the comparison pass.
    """

    def cluster_of(token_id):
        # -1 marks "not assigned to any cluster yet".  Only a failed lookup
        # is treated that way; unrelated errors are allowed to propagate.
        try:
            return self.index_id_cluster[token_id]
        except LookupError:
            return -1

    # Min-heap keyed on (1.0 - score) so the best-scoring pair pops first.
    heap = []
    num_files = self.cluster_dict.get_size()

    # 20 evenly spaced indexes of files being clustered, used as
    # checkpoints for every 5% progress
    status_update_steps = ProgressCheckpoints(num_files, 20)

    lowered = []  # lower-cased tokens of the ids visited so far
    for y in process_events_iter(range(num_files)):
        token_y = self.cluster_dict.get_token(y).lower()
        for x in range(y):
            c = similarity(lowered[x], token_y)
            if c >= threshold:
                heappush(heap, ((1.0 - c), [x, y]))
        lowered.append(token_y)

        # An id with a non-empty word and a count above one starts out as
        # a singleton cluster.
        word, count = self.cluster_dict.get_word_and_count(y)
        if word and count > 1:
            self.cluster_bins[self.cluster_count] = [y]
            self.index_id_cluster[y] = self.cluster_count
            self.cluster_count = self.cluster_count + 1

        if tagger and status_update_steps.is_checkpoint(y):
            statusmsg = N_(
                "Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)"
            )
            mparams = {
                'step': self.cluster_type.value,
                'cluster_type': _(self._cluster_type_label()),
                'update': status_update_steps.progress(y),
            }
            tagger.window.set_statusbar_message(statusmsg, mparams)

    while heap:
        _, pair = heappop(heap)
        match0 = cluster_of(pair[0])
        match1 = cluster_of(pair[1])

        if match0 == -1 and match1 == -1:
            # Neither item is in a cluster: make a new cluster for both.
            self.cluster_bins[self.cluster_count] = [pair[0], pair[1]]
            self.index_id_cluster[pair[0]] = self.cluster_count
            self.index_id_cluster[pair[1]] = self.cluster_count
            self.cluster_count = self.cluster_count + 1
        elif match0 >= 0 and match1 < 0:
            # Only the first item is clustered: pull the second one in.
            self.cluster_bins[match0].append(pair[1])
            self.index_id_cluster[pair[1]] = match0
        elif match1 >= 0 and match0 < 0:
            # Only the second item is clustered: pull the first one in.
            self.cluster_bins[match1].append(pair[0])
            self.index_id_cluster[pair[0]] = match1
        elif match1 != match0:
            # Both items sit in different clusters: merge the second
            # cluster into the first and drop it.
            absorbed = self.cluster_bins[match1]
            self.cluster_bins[match0].extend(absorbed)
            for match in absorbed:
                self.index_id_cluster[match] = match0
            del self.cluster_bins[match1]
def cluster(self, threshold):
    """Group the tokens of ``self.clusterDict`` into clusters of similar names.

    Every pair of tokens is compared case-insensitively with
    ``similarity()``.  Pairs scoring at least ``threshold`` are then
    processed best match first and folded into ``self.clusterBins``
    (cluster id -> list of token ids), while ``self.idClusterIndex`` maps
    each token id to the id of the cluster holding it.  New cluster ids are
    taken from ``self.clusterCount``, which is advanced as clusters are
    created.

    :param threshold: minimum similarity score for two tokens to be linked.
    :return: ``self.clusterBins`` after all qualifying pairs were processed.
    """

    def cluster_of(token_id):
        # -1 marks "not assigned to any cluster yet".  Only a failed lookup
        # is treated that way; unrelated errors are allowed to propagate.
        try:
            return self.idClusterIndex[token_id]
        except LookupError:
            return -1

    # Min-heap keyed on (1.0 - score) so the best-scoring pair pops first.
    heap = []
    lowered = []  # lower-cased tokens of the ids visited so far
    for y in range(self.clusterDict.getSize()):
        token_y = self.clusterDict.getToken(y).lower()
        for x in range(y):
            c = similarity(lowered[x], token_y)
            if c >= threshold:
                heappush(heap, ((1.0 - c), [x, y]))
        lowered.append(token_y)
        # NOTE(review): placement reconstructed from flattened source --
        # assumed to run once per outer iteration; confirm.
        QtCore.QCoreApplication.processEvents()

    # Every id with a non-empty word and a count above one starts out as a
    # singleton cluster.
    for i in range(self.clusterDict.getSize()):
        word, count = self.clusterDict.getWordAndCount(i)
        if word and count > 1:
            self.clusterBins[self.clusterCount] = [i]
            self.idClusterIndex[i] = self.clusterCount
            self.clusterCount = self.clusterCount + 1

    while heap:
        _, pair = heappop(heap)
        match0 = cluster_of(pair[0])
        match1 = cluster_of(pair[1])

        if match0 == -1 and match1 == -1:
            # Neither item is in a cluster: make a new cluster for both.
            self.clusterBins[self.clusterCount] = [pair[0], pair[1]]
            self.idClusterIndex[pair[0]] = self.clusterCount
            self.idClusterIndex[pair[1]] = self.clusterCount
            self.clusterCount = self.clusterCount + 1
        elif match0 >= 0 and match1 < 0:
            # Only the first item is clustered: pull the second one in.
            self.clusterBins[match0].append(pair[1])
            self.idClusterIndex[pair[1]] = match0
        elif match1 >= 0 and match0 < 0:
            # Only the second item is clustered: pull the first one in.
            self.clusterBins[match1].append(pair[0])
            self.idClusterIndex[pair[0]] = match1
        elif match1 != match0:
            # Both items sit in different clusters: merge the second
            # cluster into the first and drop it.
            absorbed = self.clusterBins[match1]
            self.clusterBins[match0].extend(absorbed)
            for match in absorbed:
                self.idClusterIndex[match] = match0
            del self.clusterBins[match1]

    return self.clusterBins
def test_correct(self):
    # Uses the assert* method names; the failUnless* spellings are
    # deprecated aliases that current unittest releases no longer provide.

    # Identical strings must score exactly 1.0.
    self.assertEqual(similarity(u"K!", u"K!"), 1.0)
    # Strings sharing no characters must score exactly 0.0.
    self.assertEqual(similarity(u"BBB", u"AAA"), 0.0)
    # Third positional argument is `places`: the score is compared to 0.7
    # after rounding to one decimal place.
    self.assertAlmostEqual(similarity(u"ABC", u"ABB"), 0.7, 1)