def cluster(self, name='kmeans', min_clusters=2, max_clusters=None, initial='k-means++'):
    """Choose the cluster count with the best silhouette score and store it.

    Every cluster count from ``min_clusters`` to ``max_clusters``
    (inclusive) is tried with the selected algorithm. The labelling with
    the highest silhouette score is kept; on ties the smaller count wins
    because only a strictly better score replaces the current best.

    Args:
        name: 'kmeans' or 'kmedoids'. Any other value prints a notice and
            falls back to 'kmeans'.
        min_clusters: Smallest cluster count to try. Values below 2 print
            a notice and are raised to 2.
        max_clusters: Largest cluster count to try. When None, half the
            number of entries in ``self.data`` (rounded down) is used.
        initial: Forwarded to ``KMeans`` as ``init``; not used by the
            k-medoids branch.

    Side effects:
        Sets ``self.sil`` (best score, or -2 when no count was tried),
        ``self.clusters`` (best count, or 0) and ``self.labels`` (best
        labelling, or an empty list). Nothing is returned.
    """
    if min_clusters < 2:
        print('Must have at least 2 clusters')
        min_clusters = 2
    if name not in ('kmeans', 'kmedoids'):
        print('Invalid clustering algorithm name. Defaulting to k-means')
        name = 'kmeans'
    if max_clusters is None:
        # Floor division keeps the bound an integer; a float bound would
        # make the range() call below raise TypeError.
        max_clusters = len(self.data) // 2

    # -2 sits below the lowest possible silhouette score (-1), so the first
    # evaluated candidate always replaces it.
    best_score = -2
    best_count = 0
    best_labels = []

    for num_clusters in range(min_clusters, max_clusters + 1):
        if name == 'kmedoids':
            result = kmedoids(self.points, num_clusters)
            # result[2] is walked as a mapping from a key to the indices of
            # the points in that cluster; clusters are numbered in
            # iteration order.
            labels = [0] * len(self.data)
            for cluster_index, key in enumerate(result[2]):
                for member in result[2][key]:
                    labels[member] = cluster_index
            score = metrics.silhouette_score(numpy.array(self.points),
                                             numpy.array(labels))
        else:
            model = KMeans(num_clusters, random_state=8, init=initial)
            model.fit(self.points)
            labels = model.labels_
            score = metrics.silhouette_score(numpy.array(self.points), labels)

        if score > best_score:
            best_score = score
            best_count = num_clusters
            best_labels = labels

    self.sil = best_score
    self.clusters = best_count
    self.labels = best_labels
def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
    """Choose the cluster count with the best silhouette score and return its labels.

    The requested range is first clamped against the number of entries in
    ``self.points``, then every cluster count from ``min_clusters`` to
    ``max_clusters`` (inclusive) is tried. The labelling with the highest
    silhouette score is kept; on ties the smaller count wins because only
    a strictly better score replaces the current best.

    Args:
        name: 'kmeans' or 'kmedoids'. Any other value is logged at debug
            level and replaced by 'kmeans'.
        min_clusters: Smallest cluster count to try. Raised to 2 when
            lower; set to ``len(self.points) - 1`` when it exceeds the
            number of points.
        max_clusters: Largest cluster count to try. Defaults to, and is
            capped at, ``len(self.points) - 1``.

    Returns:
        The winning labels as a list, or ``[]`` when ``self.data`` is empty.

    Raises:
        ValueError: If ``max_clusters`` ends up below ``min_clusters``
            after clamping.

    Side effects:
        Sets ``self.sil``, ``self.clusters`` and ``self.labels``. For
        empty data these are ``None``, 0 and ``[]``.
    """
    import warnings

    n_points = len(self.points)
    logging.debug("min_clusters = %s, max_clusters = %s, len(self.points) = %s",
                  min_clusters, max_clusters, n_points)

    if min_clusters < 2:
        logging.debug("min_clusters < 2, setting min_clusters = 2")
        min_clusters = 2
    if min_clusters > n_points:
        sys.stderr.write('Maximum number of clusters is the number of data points.\n')
        min_clusters = n_points - 1
    if max_clusters is None:
        logging.debug("max_clusters is None, setting max_clusters = %d", n_points - 1)
        max_clusters = n_points - 1
    if max_clusters < 2:
        sys.stderr.write('Must have at least 2 clusters\n')
        max_clusters = 2
    if max_clusters >= n_points:
        logging.debug("max_clusters >= len(self.points), setting max_clusters = %d",
                      n_points - 1)
        max_clusters = n_points - 1
    if max_clusters < min_clusters:
        raise ValueError('Please provide a valid range of cluster sizes')
    if name not in ('kmeans', 'kmedoids'):
        logging.debug('Invalid clustering algorithm name. Defaulting to k-means')
        name = 'kmeans'

    if not self.data:
        self.sil = None
        self.clusters = 0
        self.labels = []
        return []

    # -2 sits below the lowest possible silhouette score (-1), so the first
    # evaluated candidate always replaces it.
    best_score = -2
    best_count = 0
    best_labels = []

    for num_clusters in range(min_clusters, max_clusters + 1):
        if name == 'kmedoids':
            logging.debug('testing %s clusters', num_clusters)
            result = kmedoids(self.points, num_clusters)
            # result[2] is walked as a mapping from a key to the indices of
            # the points in that cluster; clusters are numbered in
            # iteration order.
            labels = [0] * len(self.data)
            for cluster_index, key in enumerate(result[2]):
                for member in result[2][key]:
                    labels[member] = cluster_index
            score = metrics.silhouette_score(numpy.array(self.points),
                                             numpy.array(labels))
        else:
            # Silence library warnings only for the duration of the fit and
            # scoring; the previous filter state is restored on exit, so the
            # rest of the process keeps its warnings.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model = KMeans(num_clusters, random_state=8)
                model.fit(self.points)
                labels = model.labels_
                score = metrics.silhouette_score(numpy.array(self.points), labels)

        if score > best_score:
            best_score = score
            best_count = num_clusters
            best_labels = labels

    self.sil = best_score
    self.clusters = best_count
    # The k-means labels arrive as an array; callers always get a plain list.
    self.labels = list(best_labels)
    return self.labels
def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
    """Choose the cluster count with the best silhouette score and return its labels.

    The requested range is first clamped against the number of entries in
    ``self.points``, then every cluster count from ``min_clusters`` to
    ``max_clusters`` (inclusive) is tried. The labelling with the highest
    silhouette score is kept; on ties the smaller count wins because only
    a strictly better score replaces the current best.

    Args:
        name: 'kmeans' or 'kmedoids'. Any other value is logged at debug
            level and replaced by 'kmeans'.
        min_clusters: Smallest cluster count to try. Raised to 2 when
            lower; set to ``len(self.points) - 1`` when it exceeds the
            number of points.
        max_clusters: Largest cluster count to try. Defaults to, and is
            capped at, ``len(self.points) - 1``.

    Returns:
        The winning labels as a list. ``[]`` when ``self.data`` is empty
        or when no candidate count could be evaluated.

    Raises:
        ValueError: If ``max_clusters`` ends up below ``min_clusters``
            after clamping.

    Side effects:
        Sets ``self.sil``, ``self.clusters`` and ``self.labels``. For
        empty data these are ``None``, 0 and ``[]``; when every candidate
        was skipped they are -2, 0 and ``[]``.
    """
    import warnings

    n_points = len(self.points)
    logging.debug("min_clusters = %s, max_clusters = %s, len(self.points) = %s",
                  min_clusters, max_clusters, n_points)

    if min_clusters < 2:
        logging.debug("min_clusters < 2, setting min_clusters = 2")
        min_clusters = 2
    if min_clusters > n_points:
        sys.stderr.write(
            'Minimum number of clusters %d is greater than the number of data points %d.\n'
            % (min_clusters, n_points))
        min_clusters = n_points - 1
    if max_clusters is None:
        logging.debug("max_clusters is None, setting max_clusters = %d", n_points - 1)
        max_clusters = n_points - 1
    if max_clusters < 2:
        sys.stderr.write('Must have at least 2 clusters\n')
        max_clusters = 2
    if max_clusters >= n_points:
        logging.debug("max_clusters >= len(self.points), setting max_clusters = %d",
                      n_points - 1)
        max_clusters = n_points - 1
    if max_clusters < min_clusters:
        raise ValueError('Please provide a valid range of cluster sizes')
    if name not in ('kmeans', 'kmedoids'):
        logging.debug('Invalid clustering algorithm name. Defaulting to k-means')
        name = 'kmeans'

    if not self.data:
        self.sil = None
        self.clusters = 0
        self.labels = []
        return []

    # -2 sits below the lowest possible silhouette score (-1), so the first
    # evaluated candidate always replaces it.
    best_score = -2
    best_count = 0
    best_labels = []

    for num_clusters in range(min_clusters, max_clusters + 1):
        # With a single point the clamping above leaves a candidate count of
        # zero; skip it for both algorithms instead of only for k-means.
        if num_clusters == 0:
            continue

        if name == 'kmedoids':
            logging.debug('testing %s clusters', num_clusters)
            result = kmedoids(self.points, num_clusters)
            # result[2] is walked as a mapping from a key to the indices of
            # the points in that cluster; clusters are numbered in
            # iteration order.
            labels = [0] * len(self.data)
            for cluster_index, key in enumerate(result[2]):
                for member in result[2][key]:
                    labels[member] = cluster_index
            score = metrics.silhouette_score(numpy.array(self.points),
                                             numpy.array(labels))
        else:
            # Silence library warnings only for the duration of the fit and
            # scoring; the previous filter state is restored on exit, so the
            # rest of the process keeps its warnings.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model = KMeans(num_clusters, random_state=8)
                model.fit(self.points)
                labels = model.labels_
                score = metrics.silhouette_score(numpy.array(self.points), labels)

        if score > best_score:
            best_score = score
            best_count = num_clusters
            best_labels = labels

    self.sil = best_score
    self.clusters = best_count
    # The k-means labels arrive as an array; callers always get a plain list.
    self.labels = list(best_labels)
    return self.labels