def __call__(self):
    """
    Do an iteration of the Lloyd algorithm.
    """
    centers = np.array(list(self.gs.init_strategy(
        self.gs.points, self.gs.nclusters)))
    sqdists = kmeans.get_point_center_sqdists(self.gs.points, centers)
    labels = kmeans.get_labels(sqdists)
    wcss, labels = kmeans.lloyd(self.gs.points, labels)
    if (self.best_wcss is None) or (wcss < self.best_wcss):
        self.best_wcss = wcss
        self.best_labels = labels
    # do not stop
    return False
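For context, here is a minimal NumPy-only sketch of what one Lloyd iteration computes. The helper functions used above (kmeans.get_point_center_sqdists, kmeans.get_labels, kmeans.lloyd) are assumed to behave roughly like this; the function name below is a hypothetical stand-in, not part of the module's API.

import numpy as np

def lloyd_iteration(points, centers):
    # squared distances from every point to every center, shape (npoints, ncenters)
    sqdists = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    # assign each point to its nearest center
    labels = sqdists.argmin(axis=1)
    # recompute each center as the mean of its assigned points
    # (assumes every cluster keeps at least one point)
    new_centers = np.array([points[labels == i].mean(axis=0)
            for i in range(len(centers))])
    # within-cluster sum of squares under the updated centers
    wcss = ((points - new_centers[labels]) ** 2).sum()
    return wcss, labels, new_centers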
def get_response_content(fs):
    # define constants
    nrestarts = 10
    # read the input
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    points = get_rtable_info(rtable, "annotation", fs.axes)
    # do the clustering
    codebook, distortion = cluster.vq.kmeans(
            points, fs.k, iter=nrestarts, thresh=1e-9)
    sqdists = kmeans.get_point_center_sqdists(points, codebook)
    labels = kmeans.get_labels_without_cluster_removal(sqdists)
    wgss = kmeans.get_wcss(sqdists, labels)
    norms = [np.linalg.norm(p - codebook[g]) for p, g in zip(points, labels)]
    redistortion = np.mean(norms)
    # create the response
    out = StringIO()
    print >> out, "scipy distortion:", distortion
    print >> out, "recomputed distortion:", redistortion
    print >> out, "wgss:", wgss
    # return the response
    return out.getvalue()
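The comparison above relies on the fact that the distortion returned by scipy.cluster.vq.kmeans is the mean (non-squared) Euclidean distance from each observation to its nearest centroid, while WGSS is the sum of the squared distances. A short NumPy sketch of the two quantities, assuming points, codebook, and labels have the shapes used above:

import numpy as np

def distortion_and_wgss(points, codebook, labels):
    # displacement from each point to its assigned centroid
    diffs = points - codebook[labels]
    # Euclidean distances, one per point
    dists = np.sqrt((diffs ** 2).sum(axis=1))
    # scipy-style distortion: mean distance to the nearest centroid
    distortion = dists.mean()
    # within-group sum of squares: sum of the squared distances
    wgss = (dists ** 2).sum()
    return distortion, wgss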
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, distortion = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        if k == n - 1:
            break
        k += 1
    max_k = k
    best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    best_calinski = -best_neg_calinski
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    print >> out, 'searched 2 <= k <= %d clusters' % max_k
    print >> out, '%.2f seconds' % (time.time() - tm)
    if fs.verbose:
        print >> out
        print >> out, '(k_unique, wgss, calinski):'
        for wgss, neg_calinski_k_pair in zip(wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            calinski = -neg_calinski
            row = [k_unique, wgss, calinski]
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
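The search loop above relies on the decomposition TSS = BGSS + WGSS (hence bgss = allmeandist - wgss) and ranks candidate cluster counts by the Calinski-Harabasz variance-ratio criterion. A hedged sketch of the index that kmeans.get_calinski_index is assumed to compute:

def calinski_index(bgss, wgss, k, n):
    # Calinski-Harabasz variance-ratio criterion:
    #   CH(k) = (BGSS / (k - 1)) / (WGSS / (n - k))
    # Larger values indicate tighter, better-separated clusters.
    return (bgss / float(k - 1)) / (wgss / float(n - k))

Negating the index before appending, as in the loop, lets min pick the largest Calinski value while breaking ties in favor of the smaller unique cluster count.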