def cluster(self, name='kmeans', min_clusters=2, max_clusters=None, initial='k-means++'):
    """Cluster ``self.points`` and keep the cluster count with the best silhouette.

    Every cluster count from ``min_clusters`` to ``max_clusters`` (inclusive)
    is tried with the selected algorithm, and the labelling with the highest
    silhouette score is kept.

    Args:
        name: 'kmeans' or 'kmedoids'. Any other value prints a notice and
            falls back to 'kmeans'.
        min_clusters: smallest cluster count to try. Values below 2 print a
            notice and are raised to 2.
        max_clusters: largest cluster count to try. ``None`` means half the
            number of items in ``self.data``, rounded down.
        initial: forwarded to ``KMeans`` as ``init``; not used by k-medoids.

    Side effects:
        Sets ``self.sil`` (best silhouette score), ``self.clusters`` (the
        cluster count that produced it) and ``self.labels`` (the labels for
        that count). If the range of cluster counts is empty these are left
        as ``-2``, ``0`` and ``[]`` respectively.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if min_clusters < 2:
        print('Must have at least 2 clusters')
        min_clusters = 2
    if name not in ('kmeans', 'kmedoids'):
        print('Invalid clustering algorithm name. Defaulting to k-means')
        name = 'kmeans'
    if max_clusters is None:
        # Floor division keeps the bound an int; a float here would make
        # the candidate range below raise TypeError.
        max_clusters = len(self.data) // 2

    # Sentinel below the silhouette range [-1, 1] so the first real score wins.
    best_sil = -2
    best_k = 0
    best_labels = []
    for num_clusters in range(min_clusters, max_clusters + 1):
        if name == 'kmedoids':
            cl = kmedoids(self.points, num_clusters)
            labels = [0] * len(self.data)
            # NOTE(review): cl[2] is presumably a mapping from each medoid to
            # the indices of its member points -- confirm against kmedoids().
            for cluster_index, key in enumerate(cl[2]):
                for j in cl[2][key]:
                    labels[j] = cluster_index
            sil = metrics.silhouette_score(numpy.array(self.points), numpy.array(labels))
        else:
            cl = KMeans(num_clusters, random_state=8, init=initial)
            cl.fit(self.points)
            labels = cl.labels_
            sil = metrics.silhouette_score(numpy.array(self.points), labels)
        if sil > best_sil:
            best_sil = sil
            best_k = num_clusters
            best_labels = labels

    self.sil = best_sil
    self.clusters = best_k
    self.labels = best_labels
 def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
     """Cluster ``self.points`` and keep the cluster count with the best silhouette.

     The requested range of cluster counts is first clamped against the
     number of points, then every count in the range is tried and the
     labelling with the highest silhouette score is kept.

     Args:
         name: 'kmeans' or 'kmedoids'; any other value falls back to 'kmeans'.
         min_clusters: smallest cluster count to try. Raised to 2 if lower,
             and lowered to ``len(self.points) - 1`` if it exceeds the
             number of points.
         max_clusters: largest cluster count to try. ``None`` means
             ``len(self.points) - 1``; larger values are clamped to that.

     Returns:
         The winning labels as a list (also stored in ``self.labels``), or
         ``[]`` when ``self.data`` is empty.

     Raises:
         ValueError: if, after clamping, ``max_clusters < min_clusters``.

     Side effects:
         Sets ``self.sil``, ``self.clusters`` and ``self.labels``. With empty
         ``self.data`` they become ``None``, ``0`` and ``[]``.
     """
     import warnings

     n_points = len(self.points)
     logging.debug("min_clusters = %s, max_clusters = %s, len(self.points) = %s",
                   min_clusters, max_clusters, n_points)
     if min_clusters < 2:
         logging.debug("min_clusters < 2, setting min_clusters = 2")
         min_clusters = 2
     if min_clusters > n_points:
         sys.stderr.write('Maximum number of clusters is the number of data points.\n')
         min_clusters = n_points - 1
     if max_clusters is None:
         logging.debug("max_clusters is None, setting max_clusters = %d", n_points - 1)
         max_clusters = n_points - 1
     if max_clusters < 2:
         sys.stderr.write('Must have at least 2 clusters\n')
         max_clusters = 2
     if max_clusters >= n_points:
         logging.debug("max_clusters >= len(self.points), setting max_clusters = %d",
                       n_points - 1)
         max_clusters = n_points - 1
     if max_clusters < min_clusters:
         raise ValueError('Please provide a valid range of cluster sizes')
     if name not in ('kmeans', 'kmedoids'):
         logging.debug('Invalid clustering algorithm name. Defaulting to k-means')
         name = 'kmeans'
     if not self.data:
         self.sil = None
         self.clusters = 0
         self.labels = []
         return []

     # Sentinel below the silhouette range [-1, 1] so the first real score wins.
     best_sil = -2
     best_k = 0
     best_labels = []
     candidates = range(min_clusters, max_clusters + 1)
     if name == 'kmedoids':
         for num_clusters in candidates:
             logging.debug('testing %s clusters', num_clusters)
             cl = kmedoids(self.points, num_clusters)
             labels = [0] * len(self.data)
             # NOTE(review): cl[2] is presumably a mapping from each medoid to
             # the indices of its member points -- confirm against kmedoids().
             for cluster_index, key in enumerate(cl[2]):
                 for j in cl[2][key]:
                     labels[j] = cluster_index
             sil = metrics.silhouette_score(numpy.array(self.points), numpy.array(labels))
             if sil > best_sil:
                 best_sil = sil
                 best_k = num_clusters
                 best_labels = labels
     else:
         # Silence warnings only for the duration of the search;
         # catch_warnings restores the caller's filters on exit instead of
         # leaving every warning in the process ignored.
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             for num_clusters in candidates:
                 cl = KMeans(num_clusters, random_state=8)
                 cl.fit(self.points)
                 labels = cl.labels_
                 sil = metrics.silhouette_score(numpy.array(self.points), labels)
                 if sil > best_sil:
                     best_sil = sil
                     best_k = num_clusters
                     best_labels = labels

     self.sil = best_sil
     self.clusters = best_k
     self.labels = list(best_labels)
     return self.labels
Beispiel #3
0
 def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
     """Pick the cluster count for ``self.points`` with the best silhouette score.

     The requested range of cluster counts is clamped against the number of
     points, each count in the range is tried with the chosen algorithm, and
     the labelling with the highest silhouette score wins.

     Args:
         name: 'kmeans' or 'kmedoids'; any other value falls back to 'kmeans'.
         min_clusters: smallest cluster count to try. Raised to 2 if lower,
             and lowered to ``len(self.points) - 1`` if it exceeds the
             number of points.
         max_clusters: largest cluster count to try. ``None`` means
             ``len(self.points) - 1``; larger values are clamped to that.

     Returns:
         The winning labels as a list (also stored in ``self.labels``), or
         ``[]`` when ``self.data`` is empty.

     Raises:
         ValueError: if, after clamping, ``max_clusters < min_clusters``.

     Side effects:
         Sets ``self.sil``, ``self.clusters`` and ``self.labels``. With empty
         ``self.data`` they become ``None``, ``0`` and ``[]``. If k-means
         skips every candidate they are left as ``-2``, ``0`` and ``[]``.
     """
     import warnings

     n_points = len(self.points)
     logging.debug(
         "min_clusters = %s, max_clusters = %s, len(self.points) = %s",
         min_clusters, max_clusters, n_points)
     if min_clusters < 2:
         logging.debug("min_clusters < 2, setting min_clusters = 2")
         min_clusters = 2
     if min_clusters > n_points:
         sys.stderr.write(
             'Minimum number of clusters %d is greater than the number of data points %d.\n'
             % (min_clusters, n_points))
         min_clusters = n_points - 1
     if max_clusters is None:
         logging.debug("max_clusters is None, setting max_clusters = %d",
                       n_points - 1)
         max_clusters = n_points - 1
     if max_clusters < 2:
         sys.stderr.write('Must have at least 2 clusters\n')
         max_clusters = 2
     if max_clusters >= n_points:
         logging.debug(
             "max_clusters >= len(self.points), setting max_clusters = %d",
             n_points - 1)
         max_clusters = n_points - 1
     if max_clusters < min_clusters:
         raise ValueError('Please provide a valid range of cluster sizes')
     if name not in ('kmeans', 'kmedoids'):
         logging.debug(
             'Invalid clustering algorithm name. Defaulting to k-means')
         name = 'kmeans'
     if not self.data:
         self.sil = None
         self.clusters = 0
         self.labels = []
         return []

     # Sentinel below the silhouette range [-1, 1] so the first real score wins.
     best_sil = -2
     best_k = 0
     best_labels = []
     candidates = range(min_clusters, max_clusters + 1)
     if name == 'kmedoids':
         for num_clusters in candidates:
             logging.debug('testing %s clusters', num_clusters)
             cl = kmedoids(self.points, num_clusters)
             labels = [0] * len(self.data)
             # NOTE(review): cl[2] is presumably a mapping from each medoid to
             # the indices of its member points -- confirm against kmedoids().
             for cluster_index, key in enumerate(cl[2]):
                 for j in cl[2][key]:
                     labels[j] = cluster_index
             sil = metrics.silhouette_score(numpy.array(self.points),
                                            numpy.array(labels))
             if sil > best_sil:
                 best_sil = sil
                 best_k = num_clusters
                 best_labels = labels
     else:
         # Silence warnings only for the duration of the search;
         # catch_warnings restores the caller's filters on exit instead of
         # leaving every warning in the process ignored.
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             for num_clusters in candidates:
                 # The clamping above can leave 0 as a candidate (e.g. when
                 # self.points has a single element); skip it.
                 if num_clusters == 0:
                     continue
                 cl = KMeans(num_clusters, random_state=8)
                 cl.fit(self.points)
                 labels = cl.labels_
                 sil = metrics.silhouette_score(numpy.array(self.points),
                                                labels)
                 if sil > best_sil:
                     best_sil = sil
                     best_k = num_clusters
                     best_labels = labels

     self.sil = best_sil
     self.clusters = best_k
     self.labels = list(best_labels)
     return self.labels