def get_nclusters_logw_pairs(extents, npoints):
    """
    Get sample statistics by agglomeratively clustering random points.
    These sample statistics will be used for a null distribution.
    Each coordinate of each point is drawn uniformly between zero
    and the extent of the corresponding axis.
    After every merge the within group sum of squares is recorded
    together with the number of clusters that remain,
    and merging stops once only two clusters are left.
    NOTE(review): despite the name of this function, the recorded value
    is the raw within group sum of squares; no logarithm is taken here.
    Presumably the caller applies the log -- confirm against the caller.
    @param extents: the length of each axis of the hypercube
    @param npoints: sample this many points at a time
    @return: (nclusters, wgss) pairs for a single sampling of points
    """
    # sample the points, one uniform coordinate per axis
    points = np.array([
        [random.uniform(0, extent) for extent in extents]
        for _ in range(npoints)])
    # initialize the agglomerative clustering state
    cluster_map = agglom.get_initial_cluster_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    # do the clustering, recording the within group sum of squares
    nclusters_wgss_pairs = []
    while len(cluster_map) > 2:
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # each cluster contributes its w_ssd entry divided by its size
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map.keys())
        nclusters_wgss_pairs.append((len(cluster_map), wgss))
    return nclusters_wgss_pairs
def get_response_content(fs):
    """
    Report the k-means cluster count with the best Calinski index.
    Cluster counts k = 2, 3, ... are tried in order until either
    a fixed time budget is used up or k reaches one less than
    the number of points.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), and verbose (a truthy value adds
    a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, or if an axis column is not numeric
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting:
    # the minimum pair has the largest calinski index,
    # with ties going to the smallest number of unique labels
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, _ = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        # NOTE(review): the index is computed with the requested k
        # but is reported against the number of distinct labels.
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        # Use >= rather than == so that the search also stops when
        # there are so few points that k starts at or beyond n - 1.
        if k >= n - 1:
            break
        k += 1
    max_k = k
    best_k = min(neg_calinski_k_pairs)[1]
    # create the response
    lines = [
            'best cluster count: k = %d' % best_k,
            'searched 2 <= k <= %d clusters' % max_k,
            '%.2f seconds' % (time.time() - tm),
            ]
    if fs.verbose:
        lines.append('')
        lines.append('(k_unique, wgss, calinski):')
        for wgss, neg_calinski_k_pair in zip(
                wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            row = [k_unique, wgss, -neg_calinski]
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the k-means cluster count with the best Calinski index.
    Cluster counts k = 2, 3, ... are tried in order until either
    a fixed time budget is used up or k reaches one less than
    the number of points.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), and verbose (a truthy value adds
    a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, or if an axis column is not numeric
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting:
    # the minimum pair has the largest calinski index,
    # with ties going to the smallest number of unique labels
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, _ = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        # NOTE(review): the index is computed with the requested k
        # but is reported against the number of distinct labels.
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        # Use >= rather than == so that the search also stops when
        # there are so few points that k starts at or beyond n-1.
        if k >= n-1:
            break
        k += 1
    max_k = k
    best_k = min(neg_calinski_k_pairs)[1]
    # create the response
    lines = [
            'best cluster count: k = %d' % best_k,
            'searched 2 <= k <= %d clusters' % max_k,
            '%.2f seconds' % (time.time() - tm),
            ]
    if fs.verbose:
        lines.append('')
        lines.append('(k_unique, wgss, calinski):')
        for wgss, neg_calinski_k_pair in zip(
                wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            row = [k_unique, wgss, -neg_calinski]
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the cluster count chosen by the gap statistic.
    The points are clustered agglomeratively and the log of the
    within group sum of squares is recorded after each merge.
    These observed values are compared with the expectations and
    thresholds returned by do_sampling for the bounding box of the points.
    The chosen k is the first one in nclusters_list for which
    gap(k) - gap(next k) + threshold(next k) is positive.
    If no k qualifies then the last k in nclusters_list is reported.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), nsamples (passed to do_sampling),
    and verbose (a truthy value adds a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, if an axis column is not numeric,
    or if no cluster count could be evaluated
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # do the clustering while computing the wgss at each merge
    wgss_values = []
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map.keys())
        wgss_values.append(wgss)
    # compute the log wgss values and
    # reverse them so that they are by increasing cluster size
    wlogs = list(reversed(np.log(wgss_values)))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
            extents, len(points), fs.nsamples)
    # get the gaps
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0 and best_i is None:
            best_i = i
    if best_i is None:
        # No cluster count met the criterion.
        # Fall back to the last cluster count that was searched
        # instead of failing on an index of None.
        if len(nclusters_list) == 0:
            raise ValueError('too few points to compare cluster counts')
        best_i = len(nclusters_list) - 1
    best_k = nclusters_list[best_i]
    # create the response
    lines = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        lines.append('')
        lines.append('(k, expected, observed, gap, threshold, criterion):')
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            # the last row has no successor to compare against
            if i < n - 1:
                row += [criteria[i]]
            else:
                row += ['-']
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the agglomerative cluster count with the best Calinski index.
    The points are clustered agglomeratively, and after each merge
    the Calinski index is computed as
    (bgss / (k - 1)) / (wgss / (n - k))
    where k is the number of remaining clusters and n is the number
    of points.  Merging stops once only two clusters are left.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), and verbose (a truthy value adds
    a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, if an axis column is not numeric,
    or if no merge step could be made
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # the number of points does not change during the clustering
    n = len(points)
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        k = len(cluster_map)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map.keys())
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Without any merge step there is nothing to compare,
    # so fail with a clear message instead of an error from min().
    if not cluster_counts:
        raise ValueError('too few points to compare cluster counts')
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    best_k = min(zip(neg_calinskis, cluster_counts))[1]
    # create the response
    lines = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        lines.append('')
        lines.append('(k, wgss, calinski):')
        for k, wgss, neg_calinski in zip(
                cluster_counts, wgss_values, neg_calinskis):
            row = (k, wgss, -neg_calinski)
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the cluster count chosen by the gap statistic.
    The points are clustered agglomeratively and the log of the
    within group sum of squares is recorded after each merge.
    These observed values are compared with the expectations and
    thresholds returned by do_sampling for the bounding box of the points.
    The chosen k is the first one in nclusters_list for which
    gap(k) - gap(next k) + threshold(next k) is positive.
    If no k qualifies then the last k in nclusters_list is reported.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), nsamples (passed to do_sampling),
    and verbose (a truthy value adds a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, if an axis column is not numeric,
    or if no cluster count could be evaluated
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # do the clustering while computing the wgss at each merge
    wgss_values = []
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map.keys())
        wgss_values.append(wgss)
    # compute the log wgss values and
    # reverse them so that they are by increasing cluster size
    wlogs = list(reversed(np.log(wgss_values)))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
            extents, len(points), fs.nsamples)
    # get the gaps
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0 and best_i is None:
            best_i = i
    if best_i is None:
        # No cluster count met the criterion.
        # Fall back to the last cluster count that was searched
        # instead of failing on an index of None.
        if len(nclusters_list) == 0:
            raise ValueError('too few points to compare cluster counts')
        best_i = len(nclusters_list) - 1
    best_k = nclusters_list[best_i]
    # create the response
    lines = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        lines.append('')
        lines.append('(k, expected, observed, gap, threshold, criterion):')
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            # the last row has no successor to compare against
            if i < n-1:
                row += [criteria[i]]
            else:
                row += ['-']
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the agglomerative cluster count with the best Calinski index.
    The points are clustered agglomeratively, and after each merge
    the Calinski index is computed as
    (bgss / (k - 1)) / (wgss / (n - k))
    where k is the number of remaining clusters and n is the number
    of points.  Merging stops once only two clusters are left.
    @param fs: an object with attributes table (text that is split into
    lines and parsed by RUtil.RTable), axes (the headers of the columns
    to use as Euclidean axes), and verbose (a truthy value adds
    a per-k table to the response)
    @return: the response text
    @raise ValueError: if no axes are given, if an axis is not a header
    of the table, if an axis column is not numeric,
    or if no merge step could be made
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the column index is offset by one from the header
    # position; presumably column 0 of a data row is the row label --
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one row per point
    points = np.array(list(zip(*axis_lists)))
    # the number of points does not change during the clustering
    n = len(points)
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        k = len(cluster_map)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map.keys())
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Without any merge step there is nothing to compare,
    # so fail with a clear message instead of an error from min().
    if not cluster_counts:
        raise ValueError('too few points to compare cluster counts')
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    best_k = min(zip(neg_calinskis, cluster_counts))[1]
    # create the response
    lines = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        lines.append('')
        lines.append('(k, wgss, calinski):')
        for k, wgss, neg_calinski in zip(
                cluster_counts, wgss_values, neg_calinskis):
            row = (k, wgss, -neg_calinski)
            lines.append('\t'.join(str(x) for x in row))
    # return the response, one newline-terminated line per entry
    return '\n'.join(lines) + '\n'