Example #1
def get_nclusters_logw_pairs(extents, npoints):
    """
    Get sample statistics by agglomeratively clustering random points.
    These sample statistics will be used for a null distribution.
    @param extents: the length of each axis of the hypercube
    @param npoints: sample this many points at a time
    @return: (nclusters, logw) pairs for a single sampling of points
    """
    # sample the points
    pointlist = []
    for i in range(npoints):
        p = [random.uniform(0, x) for x in extents]
        pointlist.append(p)
    points = np.array(pointlist)
    # do the clustering, recording the within group sum of squares
    nclusters_wgss_pairs = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        nclusters_wgss_pairs.append((len(cluster_map), wgss))
    # the docstring promises (nclusters, logw) pairs,
    # so take the log of each within group sum of squares
    return [(nclusters, np.log(wgss))
            for nclusters, wgss in nclusters_wgss_pairs]
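This routine builds one reference sample for a gap-statistic null distribution: points are drawn uniformly over a hypercube with the given edge lengths and then clustered agglomeratively, the same way the observed data would be. A minimal usage sketch, assuming the surrounding module already has random, numpy as np, and the project's kmeans and agglom helpers in scope; the extents and sample size below are purely illustrative:

# hypothetical driver code, not part of the original module
extents = [1.0, 2.0, 0.5]    # edge lengths of the bounding hypercube
pairs = get_nclusters_logw_pairs(extents, 50)
for nclusters, logw in pairs:
    print nclusters, logw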
Example #2
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # negate the calinski index so that min() over (neg, k) pairs
    # picks the largest index and breaks ties toward fewer clusters
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, distortion = cluster.vq.kmeans(points,
                                                 k,
                                                 iter=nrestarts,
                                                 thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        if k == n - 1:
            break
        k += 1
    max_k = k
    best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    best_calinski = -best_neg_calinski
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    print >> out, 'searched 2 <= k <= %d clusters' % max_k
    print >> out, '%.2f seconds' % (time.time() - tm)
    if fs.verbose:
        print >> out
        print >> out, '(k_unique, wgss, calinski):'
        for wgss, neg_calinski_k_pair in zip(wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            calinski = -neg_calinski
            row = [k_unique, wgss, calinski]
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
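The quantity maximized in this search is the Calinski-Harabasz index, which kmeans.get_calinski_index presumably computes as the ratio of between-group to within-group dispersion, each scaled by its degrees of freedom; this matches the inline computation in Example #4 below. A standalone sketch of that formula, with calinski_harabasz as a hypothetical helper name:

def calinski_harabasz(bgss, wgss, k, n):
    # hypothetical standalone version of the index;
    # bgss and wgss are the between- and within-group
    # sums of squares for k clusters of n points
    return (bgss / float(k - 1)) / (wgss / float(n - k))

Larger values indicate better-separated clusters, which is why the search keeps the k whose index is largest.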
Example #3
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError('expected the axis column %s '
                             'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the wgss at each merge
    cluster_counts = []
    wgss_values = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # append to the lists
        cluster_counts.append(len(cluster_map))
        wgss_values.append(wgss)
    # compute the log wgss values
    wlogs = np.log(wgss_values)
    # reverse the log values so that they are by increasing cluster size
    wlogs = list(reversed(wlogs))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
        extents, len(points), fs.nsamples)
    # get the gaps
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        k, kp1 = nclusters_list[i], nclusters_list[ip1]
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0 and best_i is None:
            best_i = i
    if best_i is None:
        # guard against the case where no criterion was positive,
        # which would otherwise crash on nclusters_list[None]
        raise ValueError('the gap statistic did not select a cluster count')
    best_k = nclusters_list[best_i]
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, expected, observed, gap, threshold, criterion):'
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            if i < n - 1:
                row += [criteria[i]]
            else:
                row += ['-']
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
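The selection loop above implements Tibshirani's gap rule: choose the smallest k with gap(k) >= gap(k+1) - s(k+1), where s(k+1) is the threshold derived from the spread of the null samples. A compact sketch of just that decision, with pick_k_by_gap as a hypothetical helper name:

def pick_k_by_gap(ks, gaps, thresholds):
    # return the smallest k whose gap beats the next gap
    # minus its simulation threshold, or None if none does
    for i in range(len(ks) - 1):
        if gaps[i] - gaps[i + 1] + thresholds[i + 1] > 0:
            return ks[i]
    return None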
Example #4
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        n = len(points)
        k = len(cluster_map)
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    neg_calinski, best_k = min(zip(neg_calinskis, cluster_counts))
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, wgss, calinski):'
        for k, wgss, neg_calinski in zip(
                cluster_counts, wgss_values, neg_calinskis):
            row = (k, wgss, -neg_calinski)
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
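The negation trick works because Python compares tuples lexicographically: taking the min over (-calinski, k) pairs selects the largest index and, among equal indices, the smallest cluster count. A tiny illustration with made-up values:

pairs = [(-10.0, 4), (-12.5, 5), (-12.5, 3)]
neg_calinski, best_k = min(pairs)
print best_k    # prints 3: the largest index, fewest clusters on a tie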