Esempio n. 1
0
def main(args):
    """
    Read an R table from a file, run restarted k-means on its points,
    and print whatever the restart loop returns.
    @param args: from argparse
    """
    # everything built here is shared by all of the k-means restarts
    with open(args.table_filename) as table_file:
        rtable = RUtil.RTable(table_file)
    points = get_rtable_info(rtable, args.annotation, args.axes)
    strategy_chooser = kmeans.InitStrategy()
    init_strategy = strategy_chooser.string_to_function(args.kmeans_init)
    global_state = GlobalState(
            rtable, points, args.annotation, args.k, init_strategy)
    # iterate until the run is stopped for some reason
    outcome = combobreaker.run_callable(
            ClusterState(global_state), args.nseconds, args.nrestarts)
    print(outcome)
Esempio n. 2
0
def get_form():
    """
    Describe the inputs of the form: the table, the axis columns,
    the cluster count, the annotation header and the k-means
    initialization strategy.
    @return: the body of a form
    """
    table_field = Form.MultiLine('table', 'R table', g_default)
    axes_field = Form.Sequence(
            'axes', 'column labels of Euclidean axes',
            ('pc1', 'pc2', 'pc3'))
    k_field = Form.Integer(
            'k', 'maximum number of clusters', 2, low=2)
    annotation_field = Form.SingleLine(
            'annotation', 'header of added column', 'cluster')
    init_field = kmeans.InitStrategy()
    return [table_field, axes_field, k_field, annotation_field, init_field]
Esempio n. 3
0
def get_form():
    """
    Describe the inputs of the form: the table, the axis columns,
    the k-means initialization strategy and a verbosity check box.
    @return: the body of a form
    """
    table_field = Form.MultiLine('table', 'R table', g_default)
    axes_field = Form.Sequence(
        'axes', 'column labels of Euclidean axes', ('pc1', 'pc2', 'pc3'))
    init_field = kmeans.InitStrategy()
    # the only extra option toggles the per-k listing in the response
    verbose_item = Form.CheckItem(
        'verbose', 'show calinski index values', True)
    options_field = Form.CheckGroup('options', 'more options', [verbose_item])
    return [table_field, axes_field, init_field, options_field]
Esempio n. 4
0
def get_response_content(fs):
    """
    Run k-means with restarts on the submitted table and return the
    table with the cluster label appended to every data row.
    @param fs: form submission; table, annotation, axes, k and
    kmeans_init are read from it
    @return: tab separated lines, each terminated by a newline
    """
    rtable = RUtil.RTable(fs.table.splitlines())
    headers = rtable.headers
    rows = rtable.data
    points = get_rtable_info(rtable, fs.annotation, fs.axes)
    # cluster the points using a fixed number of restarts
    nrestarts = 10
    strategy_chooser = kmeans.InitStrategy()
    init_strategy = strategy_chooser.string_to_function(fs.kmeans_init)
    wcss, labels = kmeans.lloyd_with_restarts(
            points, fs.k, nrestarts, init_strategy)
    # the header gains the annotation column, each row gains its label
    header_line = '\t'.join(headers + [fs.annotation])
    labeled_lines = [
            '\t'.join(row + [str(label)])
            for label, row in zip(labels, rows)]
    return '\n'.join([header_line] + labeled_lines) + '\n'
Esempio n. 5
0
    @param args: from argparse
    """
    # get some state that will not change between k-means restarts
    with open(args.table_filename) as fin:
        rtable = RUtil.RTable(fin)
    points = get_rtable_info(rtable, args.annotation, args.axes)
    init_strategy = kmeans.InitStrategy().string_to_function(args.kmeans_init)
    gs = GlobalState(rtable, points, args.annotation, args.k, init_strategy)
    # go until iteration is stopped for some reason
    print combobreaker.run_callable(
            ClusterState(gs), args.nseconds, args.nrestarts)

if __name__ == '__main__':
    # build the command line interface and hand the parsed options to main
    parser = argparse.ArgumentParser(description=__doc__)
    add_option = parser.add_argument
    add_option(
            '--table_filename', required=True,
            help='R table filename')
    add_option(
            '--axes', required=True,
            type=moretypes.whitespace_separated_sequence,
            help='column labels of Euclidean axes')
    add_option(
            '--k', type=moretypes.int_ge(2), required=True,
            help='target number of clusters')
    add_option(
            '--annotation', default='cluster',
            help='header of added column')
    # the two stopping criteria are optional and have no explicit default
    add_option(
            '--nrestarts', type=moretypes.positive_integer,
            help='restart the k-means iterative refinement this many times')
    add_option(
            '--nseconds', type=moretypes.positive_float,
            help='run for this many seconds')
    # the initialization strategy registers its own option
    kmeans.InitStrategy().add_argument(parser)
    parsed_args = parser.parse_args()
    main(parsed_args)

Esempio n. 6
0
def get_response_content(fs):
    """
    Search for the cluster count with the best Calinski index.
    Starting at k = 2, k-means with restarts is run for increasing k
    until a small time budget has been spent or k has reached n - 1,
    where n is the number of points.
    The reported cluster count is the number of distinct labels that
    the best run actually produced, not the k that was requested.
    @param fs: form submission; kmeans_init, table, axes and verbose
    are read from it
    @return: a multi-line text report
    @raise ValueError: if no axes were given, if an axis is not one of
    the table headers, or if an axis column is not numeric
    """
    # get the initialization strategy
    init_strategy = kmeans.InitStrategy().string_to_function(fs.kmeans_init)
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    # NOTE(review): the + 1 presumably skips a leading row-label entry
    # in each data row -- confirm against RUtil.RTable
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        # sorted so that the message does not depend on set ordering
        raise ValueError('invalid axes: ' + ', '.join(sorted(bad_axes)))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    # transpose the per-axis columns into one coordinate tuple per point;
    # the explicit list keeps this correct when zip returns an iterator
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting:
    # the minimum pair has the largest index and, among ties, the smallest k
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        wgss, labels = kmeans.lloyd_with_restarts(
                points, k, nrestarts, init_strategy)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        # >= rather than == so that a table with fewer than three points
        # cannot slip past the bound and keep growing k beyond n
        if k >= n - 1:
            break
        k += 1
    max_k = k
    best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    # create the response
    out = StringIO()
    out.write('best cluster count: k = %d\n' % best_k)
    out.write('searched 2 <= k <= %d clusters\n' % max_k)
    out.write('%.2f seconds\n' % (time.time() - tm))
    if fs.verbose:
        out.write('\n')
        out.write('(k_unique, wgss, calinski):\n')
        for wgss, neg_calinski_k_pair in zip(wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            calinski = -neg_calinski
            row = [k_unique, wgss, calinski]
            out.write('\t'.join(str(x) for x in row) + '\n')
    # return the response
    return out.getvalue()