import collections
import math
import random
import sys
from operator import itemgetter

# NOTE: this module also depends on an Index class (defined elsewhere in the
# project) that exposes .documents, .getIDF(term), and document postings
# objects with getTerm() and getTF().


class Cluster(object):
    documents = []
    dVectors = []
    numTrials = 50
    debug = True
    distanceMetric = 'cosine'  # either 'cosine' or 'euclidean'

    def testing__init__(self):
        ''' quick sanity check of the distance function on two dummy documents '''
        doc1 = {"music": 2, "katyperry": 2, "artpop": 1, "ladygaga": 1}
        doc2 = {"music": 1, "johnny": 1, "football": 3, "basketball": 2, "aggies": 4}
        print self.distanceBetween(doc1, doc2)

    def __init__(self, tweets):
        ''' send the tweets into the index, then pull the documents back out '''
        self.index = Index(tweets)
        self.documents = self.index.documents
        #print self.documents
        self.dVectors = [None] * len(self.documents)
        '''
        we want a slightly easier way to deal with the documents, so we're
        translating them from what we had before (document objects) into
        dictionaries mapping words -> tfidf values, so we don't have to
        recompute the tfidf values over and over
        '''
        for d in sorted(self.documents.keys()):
            doc = self.documents[d]
            vecs = {}
            for posting in doc.getPostingsList().values():
                vecs[posting.getTerm()] = self.getTFIDF(posting)
            '''
            now we have vectors of all words in the document...
            but maybe we just want the top x
            '''
            #vecs = self.reduceDimensionality(vecs)
            self.dVectors[d] = vecs

        ''' just for testing '''
        #self.dVectors = createDummyData()
        #self.documents = None

        ''' sanity check: the distance from a document to itself should be ~0 '''
        print self.cosineScore(self.dVectors[1], self.dVectors[1])

        ''' run kmeans for k = 2, 4, 6, and 8 '''
        statsFor2 = self.runKMeans(2)
        statsFor4 = self.runKMeans(4)
        statsFor6 = self.runKMeans(6)
        statsFor8 = self.runKMeans(8)

        ''' start printing stats: first the degenerate k = 1 case, where the
            single center is the origin '''
        print "-------------------------------------------------------------------"
        distanceFromOrigin = 0
        for document in self.dVectors:
            if self.distanceMetric == 'euclidean':
                distanceFromOrigin += self.distanceBetween({}, document)
            elif self.distanceMetric == 'cosine':
                distanceFromOrigin += self.cosineScore({}, document)
            else:
                print 'you have not defined a distance metric'
        print 'STATISTICS REPORT FOR K = 1, WHERE THE CENTER IS THE ORIGIN:'
        print "  RSS:", distanceFromOrigin

        self.printStats(statsFor2)
        self.printStats(statsFor4)
        self.printStats(statsFor6)
        self.printStats(statsFor8)

    def printStats(self, stats):
        print "-------------------------------------------------------------------"
        print 'STATISTICS REPORT FOR K =', stats['k']
        print "  AVERAGE # ITERATIONS:", stats['avgIterations']
        print "  BEST RSS (RESTART #" + str(stats['bestRestartID']) + "):", stats['bestRSS']
        print "  BEST PURITY (RESTART #" + str(stats['bestRestartID']) + "):", stats['bestPurity']
        print "  BEST ORIGINAL RANDOM MEANS (RESTART #" + str(stats['bestRestartID']) + "):", stats['bestRandomCenters']
        print "  BEST CLUSTERING (RESTART #" + str(stats['bestRestartID']) + "):"
        for clusterID, cluster in enumerate(stats['bestClustering']):
            print "   ", clusterID, cluster

    def calculatePurity(self, clustering):
        ''' purity: for each cluster, count the documents belonging to its most
            popular gold-standard class, then divide by the total number of
            documents (document.cluster holds the gold label) '''
        purity = 0.
        numDocuments = 0.
        for cluster in clustering:
            if not cluster:
                continue  # an empty cluster contributes nothing
            classesInCluster = []
            for documentID in cluster:
                numDocuments += 1
                classesInCluster += [self.documents[documentID].cluster]
            mostPopularClass = max(set(classesInCluster), key=classesInCluster.count)
            purity += classesInCluster.count(mostPopularClass)
        return purity / numDocuments
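    # Worked example (illustrative numbers, not from the original file): for
    # the clustering [[0, 1], [2]] where documents 0 and 2 carry gold label
    # 'a' and document 1 carries 'b', each cluster's majority class covers
    # one document, so calculatePurity returns (1 + 1) / 3 ~= 0.67.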
    def runKMeans(self, k, numRestarts=1000, maxIterations=10):
        '''
        there are lots of stats we want to keep track of here, so we store
        them in a dictionary for easy returning. we only want to return the
        residual sum of squares value for the best clustering we find, so we
        keep track of it here too.
        '''
        stats = {}
        stats['k'] = k
        stats['bestRSS'] = float('inf')
        totalIterationCount = 0
        ''' a single random initialization isn't reliable, so we restart many times! '''
        for restart in range(numRestarts):
            if self.debug:
                print "-------------------------------------------------------------------"
                print "k =", k
                print "RANDOM RESTART #" + str(restart)
            ''' find k random centers '''
            randomCenters, randomNumbers = self.getKRandomCenters(k)
            centers = randomCenters
            docsInCluster = []
            oldDocsInCluster = []
            oldRSS = float('inf')
            iterationCount = 0
            converged = False
            while not converged and iterationCount < maxIterations:
                iterationCount += 1
                ''' create a data structure to track which documents belong to
                    which cluster: a list of k lists '''
                docsInCluster = [[] for x in range(k)]
                ''' for each document, find the closest center '''
                for documentID, document in enumerate(self.dVectors):
                    closestCluster = self.findClosestCenterToDocument(document, centers)
                    docsInCluster[closestCluster] += [documentID]
                ''' calculate the residual sum of squares '''
                rss = self.residualSumOfSquares(centers, docsInCluster)
                if rss < stats['bestRSS']:
                    stats['bestRSS'] = rss
                    stats['bestClustering'] = docsInCluster
                    stats['bestRandomCenters'] = randomNumbers
                    stats['bestRestartID'] = restart
                ''' for each cluster, move the cluster center '''
                newcenters = []
                for docs in docsInCluster:
                    newcenters += [self.findNewClusterCenter(docs)]
                ''' update centers to be the new list we just created '''
                centers = list(newcenters)
                ''' print cluster assignments (for debugging) '''
                if self.debug:
                    print "FOR ITERATION #" + str(iterationCount) + ", the RSS is:", rss, "and the distribution is:"
                    for clusterID, cluster in enumerate(docsInCluster):
                        print "   ", clusterID, cluster
                converged = self.converged(docsInCluster, oldDocsInCluster)
                if rss > oldRSS:
                    if self.debug:
                        print 'RSS INCREASED THIS ITERATION, ROLLING BACK TO OLD RSS'
                    converged = True
                print "Center that's closest to zero:", self.findClosestCenterToDocument({}, centers)
                oldDocsInCluster = docsInCluster
                oldRSS = rss
            totalIterationCount += iterationCount
        stats['avgIterations'] = float(totalIterationCount) / numRestarts
        stats['bestPurity'] = self.calculatePurity(stats['bestClustering'])
        return stats

    def residualSumOfSquares(self, centers, clusterMembers):
        ''' sum, over every cluster, the squared distance between each member
            document and its cluster center '''
        rss = 0.
        for clusterID in range(len(centers)):
            for documentID in clusterMembers[clusterID]:
                if self.distanceMetric == 'euclidean':
                    rss += self.distanceBetween(self.dVectors[documentID], centers[clusterID]) ** 2
                elif self.distanceMetric == 'cosine':
                    rss += self.cosineScore(self.dVectors[documentID], centers[clusterID]) ** 2
                else:
                    print 'you have not defined a distance metric'
        return rss

    def findNewClusterCenter(self, clusterMembers):
        ''' the new center is the mean of the cluster's member vectors '''
        newcenter = {}
        #newcenterCounts = {}  # for the commented-out averaging variant below
        ''' sum each term's value across every member of the cluster '''
        for documentID in clusterMembers:
            document = self.dVectors[documentID]
            for term in document:
                newcenter[term] = newcenter.get(term, 0) + document[term]
                #newcenterCounts[term] = newcenterCounts.get(term, 0.) + 1
                #newcenter[term] = min(newcenter.get(term, float('inf')), document[term])
        ''' normalize each term by the number of documents in the cluster '''
        for term in newcenter:
            newcenter[term] = float(newcenter[term]) / len(clusterMembers)
            #newcenter[term] = float(newcenter[term]) / newcenterCounts[term]
        return newcenter
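    # Worked example (illustrative, not from the original file): if a cluster
    # holds the two vectors {"a": 2.0, "b": 1.0} and {"a": 1.0},
    # findNewClusterCenter sums term-by-term to {"a": 3.0, "b": 1.0} and
    # divides by the 2 members, giving the centroid {"a": 1.5, "b": 0.5}.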
    def converged(self, clustering1, clustering2):
        ''' two clusterings are the same when every cluster holds the same
            set of documents '''
        if len(clustering1) != len(clustering2):
            return False
        for clusterID in range(len(clustering2)):
            if set(clustering1[clusterID]) != set(clustering2[clusterID]):
                return False
        return True

    def findClosestCenterToDocument(self, document, centers):
        ''' calculate the k distances, then return the index of the smallest '''
        distances = []
        for center in centers:
            if self.distanceMetric == 'euclidean':
                distances += [self.distanceBetween(document, center)]
            elif self.distanceMetric == 'cosine':
                distances += [self.cosineScore(document, center)]
            else:
                print 'you have not defined a distance metric'
        return distances.index(min(distances))

    def getKRandomCenters(self, k):
        ''' draw k unique document indexes at random '''
        kRandomNumbers = []
        while len(kRandomNumbers) < k:
            randomNumber = random.randint(0, len(self.dVectors) - 1)
            if randomNumber not in kRandomNumbers:
                kRandomNumbers += [randomNumber]
        if self.debug:
            print 'ORIGINAL RANDOM MEANS:', kRandomNumbers
        ''' collect those k documents as the initial centers
            (in increasing index order) '''
        kRandomCenters = []
        for index in sorted(kRandomNumbers):
            kRandomCenters += [self.dVectors[index]]
        return kRandomCenters, kRandomNumbers

    def distanceBetween(self, postings1, postings2):
        ''' squared euclidean distance between two sparse vectors '''
        distance = 0.
        for term in set(postings1.keys() + postings2.keys()):
            value1 = postings1.get(term, 0.)
            value2 = postings2.get(term, 0.)
            distance += (value1 - value2) ** 2
        return distance

    def cosineScore(self, document1, document2):
        ''' cosine distance: 1 - cosine similarity, so identical vectors
            score ~0 and orthogonal vectors score 1 '''
        dotProduct = self.getDotProduct(document1, document2)
        magnitudes = self.calculateMagnitudeOfVector(document1) * self.calculateMagnitudeOfVector(document2)
        if magnitudes == 0:
            # avoid a divide-by-zero error when either vector is empty
            magnitudes = sys.float_info.epsilon
        return 1 - (dotProduct / magnitudes)

    def calculateMagnitudeOfVector(self, vector):
        mag = 0.
        for term in vector:
            mag += vector[term] ** 2
        return math.sqrt(mag)

    def getDotProduct(self, document1, document2):
        ''' only terms that appear in both documents contribute '''
        dotProduct = 0.
        for term in set(document1) & set(document2):
            dotProduct += document1[term] * document2[term]
        return dotProduct
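    # Worked example (illustrative, not from the original file): for
    # doc1 = {"a": 1., "b": 1.} and doc2 = {"a": 1.}, the dot product is 1,
    # the magnitudes are sqrt(2) and 1, so cosine similarity is
    # 1/sqrt(2) ~= 0.707 and cosineScore returns 1 - 0.707 ~= 0.293.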
    def getTFIDF(self, posting):
        tf = posting.getTF()
        idf = self.index.getIDF(posting.getTerm())
        '''
        if self.debug:
            print 'tfidf for', posting.getTerm()
            print 'TF:   ', tf
            print 'DF:   ', self.index.getTerm(posting.getTerm()).getDocumentFrequency()
            print 'IDF:  ', idf
            print 'TFIDF:', tf * idf
        '''
        #return tf * idf  # * 10  multiplying by some constant because values < 1
        #                 # square differently... just to exaggerate the distances
        return tf
        #return random.randint(0, 50)

    def sortDictionary(self, dictionary):
        ''' return an OrderedDict of the dictionary's items sorted by value,
            smallest first (so popitem() pops the largest remaining item) '''
        return collections.OrderedDict(sorted(dictionary.items(), key=itemgetter(1)))

    def reduceDimensionality(self, dictionary, numItemsToKeep=20):
        ''' keep only the numItemsToKeep terms with the highest tfidf values '''
        sorteddict = self.sortDictionary(dictionary)
        newdict = {}
        for i in range(min(numItemsToKeep, len(sorteddict))):
            term, value = sorteddict.popitem()  # pops the largest remaining value
            newdict[term] = value
        return newdict
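# ---------------------------------------------------------------------------
# Minimal usage sketch (an addition for illustration, not part of the
# original pipeline): exercise the two distance functions on hand-built term
# vectors without constructing an Index. Cluster.__new__ skips __init__,
# which would otherwise require real tweets and the Index class.
if __name__ == '__main__':
    c = Cluster.__new__(Cluster)
    doc1 = {"music": 2., "katyperry": 2., "artpop": 1., "ladygaga": 1.}
    doc2 = {"music": 1., "johnny": 1., "football": 3., "basketball": 2., "aggies": 4.}
    print c.distanceBetween(doc1, doc2)  # squared euclidean distance
    print c.cosineScore(doc1, doc2)      # 1 - cosine similarity
    print c.cosineScore(doc1, doc1)      # a vector against itself: ~0.0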