def main(args):
    """
    Cluster the points of an R table file and print the outcome.
    @param args: from argparse
    """
    # gather the state that stays fixed across k-means restarts
    with open(args.table_filename) as table_file:
        table = RUtil.RTable(table_file)
    point_data = get_rtable_info(table, args.annotation, args.axes)
    init_function = kmeans.InitStrategy().string_to_function(
            args.kmeans_init)
    global_state = GlobalState(
            table, point_data, args.annotation, args.k, init_function)
    # iterate until combobreaker stops; it is handed the nseconds and
    # nrestarts limits from the command line
    outcome = combobreaker.run_callable(
            ClusterState(global_state), args.nseconds, args.nrestarts)
    # a single parenthesized argument prints exactly like the statement form
    print(outcome)
def get_form():
    """
    Describe the inputs needed to annotate a table with k-means clusters.
    @return: the body of a form
    """
    table_field = Form.MultiLine('table', 'R table', g_default)
    axes_field = Form.Sequence(
            'axes', 'column labels of Euclidean axes',
            ('pc1', 'pc2', 'pc3'))
    k_field = Form.Integer('k', 'maximum number of clusters', 2, low=2)
    annotation_field = Form.SingleLine(
            'annotation', 'header of added column', 'cluster')
    init_field = kmeans.InitStrategy()
    return [
            table_field,
            axes_field,
            k_field,
            annotation_field,
            init_field,
            ]
def get_form():
    """
    Describe the inputs needed to search for a cluster count.
    @return: the body of a form
    """
    table_field = Form.MultiLine('table', 'R table', g_default)
    axes_field = Form.Sequence(
            'axes', 'column labels of Euclidean axes',
            ('pc1', 'pc2', 'pc3'))
    init_field = kmeans.InitStrategy()
    # a single checkbox, ticked by default, inside its own option group
    verbose_item = Form.CheckItem(
            'verbose', 'show calinski index values', True)
    options_field = Form.CheckGroup(
            'options', 'more options', [verbose_item])
    return [
            table_field,
            axes_field,
            init_field,
            options_field,
            ]
def get_response_content(fs):
    """
    Cluster the table rows with k-means and add the labels as a column.
    @param fs: form fields; reads table, annotation, axes, k, kmeans_init
    @return: the tab separated table text with the extra column appended
    """
    table = RUtil.RTable(fs.table.splitlines())
    headers = table.headers
    rows = table.data
    points = get_rtable_info(table, fs.annotation, fs.axes)
    # do the clustering; only the labels are needed for the response
    nrestarts = 10
    init_function = kmeans.InitStrategy().string_to_function(fs.kmeans_init)
    _wcss, labels = kmeans.lloyd_with_restarts(
            points, fs.k, nrestarts, init_function)
    # the header line comes first, then every row extended by its label
    out_lines = ['\t'.join(headers + [fs.annotation])]
    out_lines.extend(
            '\t'.join(row + [str(label)])
            for label, row in zip(labels, rows))
    # return the response
    return '\n'.join(out_lines) + '\n'
@param args: from argparse """ # get some state that will not change between k-means restarts with open(args.table_filename) as fin: rtable = RUtil.RTable(fin) points = get_rtable_info(rtable, args.annotation, args.axes) init_strategy = kmeans.InitStrategy().string_to_function(args.kmeans_init) gs = GlobalState(rtable, points, args.annotation, args.k, init_strategy) # go until iteration is stopped for some reason print combobreaker.run_callable( ClusterState(gs), args.nseconds, args.nrestarts) if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--table_filename', required=True, help='R table filename') parser.add_argument('--axes', required=True, type=moretypes.whitespace_separated_sequence, help='column labels of Euclidean axes') parser.add_argument('--k', type=moretypes.int_ge(2), required=True, help='target number of clusters') parser.add_argument('--annotation', default='cluster', help='header of added column') parser.add_argument('--nrestarts', type=moretypes.positive_integer, help='restart the k-means iterative refinement this many times') parser.add_argument('--nseconds', type=moretypes.positive_float, help='run for this many seconds') kmeans.InitStrategy().add_argument(parser) main(parser.parse_args())
def get_response_content(fs):
    """
    Search for the cluster count that gives the best Calinski index.
    Restarted k-means is run for k = 2, 3, ... until a two second
    budget is used up or k reaches one less than the number of points.
    @param fs: form fields; reads table, axes, kmeans_init and verbose
    @return: a plain text report as a string
    @raise ValueError: when no axes are given, when an axis is not one of
    the table headers, or when an axis column is not numeric
    """
    # get the initialization strategy
    init_strategy = kmeans.InitStrategy().string_to_function(fs.kmeans_init)
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points;
    # data columns are looked up one position past the header position
    # (presumably because each data row starts with a row name -- confirm)
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        # sorted so that the message does not depend on set ordering
        raise ValueError('invalid axes: ' + ', '.join(sorted(bad_axes)))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    # materialize the zip so that numpy always receives a sequence of rows
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting:
    # the minimum pair has the largest index, ties going to the smaller k
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        wgss, labels = kmeans.lloyd_with_restarts(
                points, k, nrestarts, init_strategy)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        # record the number of distinct labels actually produced
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        # use >= rather than == so that a table with fewer than three
        # points cannot push k past the number of points
        if k >= n - 1:
            break
        k += 1
    max_k = k
    _best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    # create the response
    report = [
            'best cluster count: k = %d' % best_k,
            'searched 2 <= k <= %d clusters' % max_k,
            '%.2f seconds' % (time.time() - tm),
            ]
    if fs.verbose:
        report.append('')
        report.append('(k_unique, wgss, calinski):')
        for wgss, (neg_calinski, k_unique) in zip(
                wgss_list, neg_calinski_k_pairs):
            row = [k_unique, wgss, -neg_calinski]
            report.append('\t'.join(str(x) for x in row))
    # return the response
    return '\n'.join(report) + '\n'