Code example #1
    def test_random_guesses(self):
        """Smoke-test the three kmeans entry points with randomly guessed
        initial centroids; none of the calls should raise.
        """
        nclusters = 3
        data = g_mnemstudio_example_data

        #XXX copypasted
        # Stage the data in an hdf file, then reopen it read-only
        # so that a dataset handle is available.
        hdf_path = get_tmp_filename()
        hdf_file = h5py.File(hdf_path)
        hdf_file.create_dataset('testset', data=data)
        hdf_file.close()
        hdf_file = h5py.File(hdf_path, 'r')
        dset = hdf_file['testset']

        #XXX copypasted
        # Stage the data as a tabular text file and open a read handle on it.
        text_path = get_tmp_filename()
        np.savetxt(text_path, data)
        text_file = open(text_path)

        # The simplest invocation of each backend should run cleanly.
        bigkmeans.kmeans(data, nclusters=nclusters)
        bigkmeans.kmeans_hdf(dset, nclusters=nclusters)
        bigkmeans.kmeans_ooc(text_file, nclusters=nclusters)

        # release the file handles
        hdf_file.close()
        text_file.close()
Code example #2
    def test_fisher_iris_empty_cluster_guess(self):
        #FIXME: this test is skipped because
        #       scipy seems to use a different way to deal with the clusters.
        #       perhaps it returns the current labeling when
        #       an empty cluster is detected in the next labeling?

        # define the data
        data = np.loadtxt(
                StringIO(g_fisher),
                dtype=float,
                skiprows=1,
                usecols=(1,2,3,4),
                )

        # this guess of initial centroids is known to lead to cluster loss
        guess = g_fisher_bad_guess

        # use a few iterations
        maxiters = 10

        # get the scipy kmeans results
        vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
                data, guess, maxiters)

        # get the bigkmeans numpy results
        np_init_clust, np_final_clust, np_labels = bigkmeans.kmeans(
                data, centroids=guess, maxiters=maxiters,
                on_cluster_loss=bigkmeans.return_on_cluster_loss)

        print 'scipy labels:'
        print vq_labels
        print
        print 'bigkmeans labels:'
        print np_labels
        print

        # Do some unsupervised clustering on this data set,
        # using more than the three putative clusters.
        self.helper(
                data, maxiters=maxiters, guess=guess,
                on_cluster_loss=bigkmeans.return_on_cluster_loss)
Code example #3
    def test_cluster_loss_errors(self):
        """Cluster loss should raise ClusterLossError exactly when the
        error_on_cluster_loss policy is requested, and never otherwise.
        """
        # data and an initial guess for which a cluster is eventually lost
        data = g_pathological_data
        guess = g_pathological_guess

        #XXX copypasted
        # Stage the data in an hdf file, then reopen it read-only
        # so that a dataset handle is available.
        hdf_path = get_tmp_filename()
        hdf_file = h5py.File(hdf_path)
        hdf_file.create_dataset('testset', data=data)
        hdf_file.close()
        hdf_file = h5py.File(hdf_path, 'r')
        dset = hdf_file['testset']

        #XXX copypasted
        # Stage the data as a tabular text file and open a read handle on it.
        text_path = get_tmp_filename()
        np.savetxt(text_path, data)
        f_stream = open(text_path)

        # Each backend must raise under the error_on_cluster_loss policy.
        # Order matters only in that it mirrors the numpy/hdf/ooc convention.
        for kmeans_fn, source in (
                (bigkmeans.kmeans, data),
                (bigkmeans.kmeans_hdf, dset),
                (bigkmeans.kmeans_ooc, f_stream),
                ):
            testing.assert_raises(
                    bigkmeans.ClusterLossError,
                    kmeans_fn,
                    source,
                    centroids=guess,
                    on_cluster_loss=bigkmeans.error_on_cluster_loss,
                    )

        # None of the benign policies should raise through these calls,
        # although retry_after_cluster_loss may restart many times.
        for policy in (
                None,
                bigkmeans.ignore_cluster_loss,
                bigkmeans.return_on_cluster_loss,
                bigkmeans.retry_after_cluster_loss,
                ):
            bigkmeans.kmeans(data, centroids=guess, on_cluster_loss=policy)
            bigkmeans.kmeans_hdf(dset, centroids=guess, on_cluster_loss=policy)
            bigkmeans.kmeans_ooc(
                    f_stream, centroids=guess, on_cluster_loss=policy)

        # release the file handles
        hdf_file.close()
        f_stream.close()
Code example #4
    def helper(
            self, data,
            maxiters=None, guess=None, nclusters=None,
            on_cluster_loss=None,
            ):
        """Run every kmeans backend on the data and cross-check with scipy.

        If guess is None, nclusters distinct rows of data are sampled
        uniformly at random to serve as the initial centroids.
        """
        # sample initial centroids when the caller did not supply a guess
        if guess is None:
            nrows = data.shape[0]
            row_indices = sorted(random.sample(xrange(nrows), nclusters))
            guess = data[row_indices, :]

        # Stage the data in an hdf file, then reopen it read-only
        # so that a dataset handle is available.
        hdf_path = get_tmp_filename()
        hdf_file = h5py.File(hdf_path)
        hdf_file.create_dataset('testset', data=data)
        hdf_file.close()
        hdf_file = h5py.File(hdf_path, 'r')
        dset = hdf_file['testset']

        # Stage the data as a tabular text file and open a read handle on it.
        text_path = get_tmp_filename()
        np.savetxt(text_path, data)
        f_stream = open(text_path)

        # Exercise every vector quantization inner loop implementation.
        for fn_block_update in (
                bigkmeans.lloyd.update_block_pyvqcore,
                bigkmeans.lloyd.update_block_scipy,
                bigkmeans.lloyd.update_block_python,
                ):

            # reference clustering from scipy
            vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
                    data, guess, maxiters)

            # keyword arguments shared by all three bigkmeans backends
            shared_kwargs = dict(
                    centroids=guess, maxiters=maxiters,
                    on_cluster_loss=on_cluster_loss,
                    fn_block_update=fn_block_update,
                    )

            # numpy, hdf, and out-of-core text results, in that order
            np_results = bigkmeans.kmeans(data, **shared_kwargs)
            hdf_results = bigkmeans.kmeans_hdf(dset, **shared_kwargs)
            ooc_results = bigkmeans.kmeans_ooc(f_stream, **shared_kwargs)

            # every backend must agree with the scipy reference
            for init_clust, final_clust, labels in (
                    np_results, hdf_results, ooc_results):
                testing.assert_allclose(vq_final_clust, final_clust)
                testing.assert_allclose(vq_labels, labels)

        # release the file handles
        hdf_file.close()
        f_stream.close()