def kmean(self, k, cutoff):
    """Group ``self.files`` into ``k`` clusters by iterative reassignment.

    ``k`` entries of ``self.files`` are picked at random and each seeds one
    ``Cluster``. Every pass assigns each file to the cluster whose centroid
    is nearest (by ``utils.getDistance`` on the file's ``vector``), then
    hands each cluster its new members through ``addfiles``. The value
    returned by ``addfiles`` is treated as that cluster's centroid shift;
    passes repeat until the largest shift is below ``cutoff``.

    Before returning, each cluster's centroid is recomputed with
    ``calculateCentroid`` and its radius refreshed with ``calculateRadius``.

    Args:
        k: Number of clusters (and of random seed files) to create.
        cutoff: Convergence threshold for the largest centroid shift.

    Returns:
        The list of ``Cluster`` objects.
    """
    seeds = random.sample(self.files, k)
    clusters = [Cluster(self.id, files=[seed]) for seed in seeds]

    while True:
        # One bucket of member files per cluster, rebuilt on every pass.
        buckets = [[] for _ in clusters]

        for item in self.files:
            # Start from the first cluster, then look for a strictly closer
            # one; on ties the earliest cluster wins.
            best_index = 0
            best_distance = utils.getDistance(item.vector, clusters[0].centroid)
            for position, candidate in enumerate(clusters[1:], 1):
                candidate_distance = utils.getDistance(item.vector,
                                                       candidate.centroid)
                if candidate_distance < best_distance:
                    best_distance = candidate_distance
                    best_index = position
            buckets[best_index].append(item)

        # Feed each cluster its bucket and track the largest reported shift.
        largest_shift = 0.0
        for cluster, members in zip(clusters, buckets):
            largest_shift = max(largest_shift, cluster.addfiles(members))

        if largest_shift < cutoff:
            break

    for cluster in clusters:
        cluster.centroid = cluster.calculateCentroid()
        cluster.calculateRadius()
    return clusters
def calculateRadius(self):
    """Compute, store and return the radius.

    The radius is the largest ``utils.getDistance`` value between
    ``self.centroid`` and the ``vector`` of any entry in ``self.files``.
    If ``self.files`` is empty, the sentinel ``-1`` is stored and returned.

    Returns:
        The value that was assigned to ``self.radius``.
    """
    radius = -1  # sentinel that survives when there are no files
    for member in self.files:
        # Built-in max() replaces the running value only on a strict ">",
        # so an equal distance keeps the earlier value.
        radius = max(radius, utils.getDistance(self.centroid, member.vector))
    self.radius = radius
    return radius
def calculateRadius(self):
    """Compute, store and return the radius.

    The radius is the largest ``utils.getDistance`` value between
    ``self.centroid`` and the ``vector`` of any entry in ``self.files``.
    If ``self.files`` is empty, the sentinel ``-1`` is stored and returned.

    Returns:
        The value that was assigned to ``self.radius``.
    """
    radius = -1  # sentinel that survives when there are no files
    for member in self.files:
        # Built-in max() replaces the running value only on a strict ">",
        # so an equal distance keeps the earlier value.
        radius = max(radius, utils.getDistance(self.centroid, member.vector))
    self.radius = radius
    return radius
def getScore(self, vector, centroid, clusterdata, alldata):
    """Score ``vector`` against a centroid.

    The score is ``clusterdata`` divided by the product of the
    ``utils.getDistance`` between ``vector`` and ``centroid`` and ``alldata``.

    NOTE(review): the division is unguarded. With plain Python numbers a
    zero distance or a zero ``alldata`` raises ``ZeroDivisionError``;
    confirm whether callers can pass a vector equal to the centroid.

    Args:
        vector: Vector being scored.
        centroid: Centroid to measure the distance to.
        clusterdata: Numerator of the score.
        alldata: Factor multiplied with the distance in the denominator.

    Returns:
        ``clusterdata / (distance * alldata)``.
    """
    distance = utils.getDistance(vector, centroid)
    denominator = distance * alldata
    return clusterdata / denominator