Example #1
    def test_random_guesses(self):

        data = g_mnemstudio_example_data
        nclusters = 3

        #XXX copypasted
        # write an hdf file and create an associated data set
        name_hdf = get_tmp_filename()
        f_hdf = h5py.File(name_hdf, 'w')
        dset = f_hdf.create_dataset('testset', data=data)
        f_hdf.close()
        f_hdf = h5py.File(name_hdf, 'r')
        dset = f_hdf['testset']

        #XXX copypasted
        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)

        # check for errors in the simplest invocations
        bigkmeans.kmeans(data, nclusters=nclusters)
        bigkmeans.kmeans_hdf(dset, nclusters=nclusters)
        bigkmeans.kmeans_ooc(f_stream, nclusters=nclusters)

        #XXX copypasted
        # close the hdf file and the tabular data file
        f_hdf.close()
        f_stream.close()
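
The XXX copypasted markers above flag the same HDF and text-file fixture setup repeated across these tests. As a minimal sketch (not part of the source), the duplication could be factored into a context manager; it assumes the same get_tmp_filename() helper the tests already use:

import contextlib

import h5py
import numpy as np

@contextlib.contextmanager
def open_test_inputs(data):
    # write an hdf file and re-open it read-only
    name_hdf = get_tmp_filename()
    with h5py.File(name_hdf, 'w') as f_create:
        f_create.create_dataset('testset', data=data)
    f_hdf = h5py.File(name_hdf, 'r')
    # write a tabular text file and re-open it
    name_stream = get_tmp_filename()
    np.savetxt(name_stream, data)
    f_stream = open(name_stream)
    try:
        yield f_hdf['testset'], f_stream
    finally:
        f_hdf.close()
        f_stream.close()

With such a helper, each test body would reduce to a single with-statement around its assertions.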
Example #2
    def test_cluster_loss_errors(self):

        # data and an initial guess for which a cluster is eventually lost
        data = g_pathological_data
        guess = g_pathological_guess

        #XXX copypasted
        # write an hdf file and create an associated data set
        name_hdf = get_tmp_filename()
        f_hdf = h5py.File(name_hdf, 'w')
        dset = f_hdf.create_dataset('testset', data=data)
        f_hdf.close()
        f_hdf = h5py.File(name_hdf, 'r')
        dset = f_hdf['testset']

        #XXX copypasted
        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)

        # Each of the following three blocks should raise ClusterLossError.

        # numpy
        testing.assert_raises(
                bigkmeans.ClusterLossError,
                bigkmeans.kmeans,
                data,
                centroids=guess,
                on_cluster_loss=bigkmeans.error_on_cluster_loss,
                )

        # hdf
        testing.assert_raises(
                bigkmeans.ClusterLossError,
                bigkmeans.kmeans_hdf,
                dset,
                centroids=guess,
                on_cluster_loss=bigkmeans.error_on_cluster_loss,
                )

        # out-of-core stream
        testing.assert_raises(
                bigkmeans.ClusterLossError,
                bigkmeans.kmeans_ooc,
                f_stream,
                centroids=guess,
                on_cluster_loss=bigkmeans.error_on_cluster_loss,
                )

        # Check that no errors are raised through these calls,
        # although retry_after_cluster_loss may cause many restarts.
        benign_action_args = (
                None,
                bigkmeans.ignore_cluster_loss,
                bigkmeans.return_on_cluster_loss,
                bigkmeans.retry_after_cluster_loss,
                )
        for fn in benign_action_args:
            bigkmeans.kmeans(data, centroids=guess, on_cluster_loss=fn)
            bigkmeans.kmeans_hdf(dset, centroids=guess, on_cluster_loss=fn)
            bigkmeans.kmeans_ooc(f_stream, centroids=guess, on_cluster_loss=fn)

        # close the hdf file and the tabular data file
        f_hdf.close()
        f_stream.close()
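
Example #2 relies on the cluster-loss handlers exported by bigkmeans. The handler names appear in the source, but their signatures and bodies are not shown there; the sketch below is only a plausible shape for the contract the test exercises, where error_on_cluster_loss raises and the benign handlers let the run complete:

class ClusterLossError(Exception):
    """Raised when a centroid loses all of its assigned points."""

def error_on_cluster_loss():
    # the only handler the tests above expect to raise
    raise ClusterLossError('a cluster has lost all of its points')

def ignore_cluster_loss():
    # benign: keep iterating with the empty cluster left as-is
    pass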
Example #3
    def helper(
            self, data,
            maxiters=None, guess=None, nclusters=None,
            on_cluster_loss=None,
            ):

        # if no guess has been provided then sample random rows as the guess
        if guess is None:
            M, N = data.shape
            indices = sorted(random.sample(range(M), nclusters))
            guess = data[indices, :]

        # write an hdf file and create an associated data set
        name_hdf = get_tmp_filename()
        f_hdf = h5py.File(name_hdf, 'w')
        dset = f_hdf.create_dataset('testset', data=data)
        f_hdf.close()
        f_hdf = h5py.File(name_hdf, 'r')
        dset = f_hdf['testset']

        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)

        # check results for various vector quantization inner loops
        for fn_block_update in (
                bigkmeans.lloyd.update_block_pyvqcore,
                bigkmeans.lloyd.update_block_scipy,
                bigkmeans.lloyd.update_block_python,
                ):

            # get the scipy kmeans results
            vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
                    data, guess, maxiters)

            # get the bigkmeans numpy results
            results = bigkmeans.kmeans(
                    data, centroids=guess, maxiters=maxiters,
                    on_cluster_loss=on_cluster_loss,
                    fn_block_update=fn_block_update,
                    )
            np_init_clust, np_final_clust, np_labels = results

            # get the bigkmeans hdf results
            results = bigkmeans.kmeans_hdf(
                    dset, centroids=guess, maxiters=maxiters,
                    on_cluster_loss=on_cluster_loss,
                    fn_block_update=fn_block_update,
                    )
            hdf_init_clust, hdf_final_clust, hdf_labels = results

            # get the bigkmeans tabular text-based out-of-core results
            results = bigkmeans.kmeans_ooc(
                    f_stream, centroids=guess, maxiters=maxiters,
                    on_cluster_loss=on_cluster_loss,
                    fn_block_update=fn_block_update,
                    )
            ooc_init_clust, ooc_final_clust, ooc_labels = results

            # check that the outputs are the same for all methods
            for labels, final_clust in (
                    (np_labels, np_final_clust),
                    (hdf_labels, hdf_final_clust),
                    (ooc_labels, ooc_final_clust),
                    ):
                testing.assert_allclose(vq_final_clust, final_clust)
                testing.assert_allclose(vq_labels, labels)

        # close the hdf file and the tabular data file
        f_hdf.close()
        f_stream.close()
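
A hypothetical call site for the helper above (the method name and keyword values are illustrative assumptions; the data globals echo those used in Example #2):

    def test_pathological_guess(self):
        self.helper(
                g_pathological_data,
                guess=g_pathological_guess,
                maxiters=5,
                on_cluster_loss=bigkmeans.ignore_cluster_loss,
                )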
def main(args):

    # Optionally read the initial centroids.
    guess = None
    if args.initial_centroids:
        guess = np.loadtxt(args.initial_centroids, dtype=float, ndmin=2)

    # Optionally specify an inner loop implementation choice.
    fn_block_update = None
    if args.inner_loop:
        inner_loop_dict = {
                'pyvqcore' : bigkmeans.lloyd.update_block_pyvqcore,
                'scipy' : bigkmeans.lloyd.update_block_scipy,
                'python' : bigkmeans.lloyd.update_block_python,
                }
        fn_block_update = inner_loop_dict[args.inner_loop]

    # Open the data file and do the kmeans clustering.
    # Note that we deliberately disallow using stdin
    # because we require that the stream can be restarted
    # so that we can do one pass through the open file per iteration.
    if args.tabular_data_file:
        with open(args.tabular_data_file) as data_stream:
            guess, centroids, labels = bigkmeans.kmeans_ooc(
                    data_stream,
                    centroids=guess,
                    nclusters=args.nclusters,
                    on_cluster_loss=args.on_cluster_loss,
                    maxiters=args.maxiters,
                    maxrestarts=args.maxrestarts,
                    fn_block_update=fn_block_update,
                    verbose=args.verbose,
                    )
    elif args.hdf_data_file:
        if not h5py:
            raise ImportError(
                    'the python package h5py is required '
                    'to read hdf5 data files')
        if not args.hdf_dataset_name:
            raise Exception(
                    'If the data is in hdf format, '
                    'then a dataset name (--hdf-dataset-name) '
                    'must be specified in addition to the hdf file name.  '
                    'If you do not know the dataset name, '
                    'you can try the hdfview program to browse '
                    'the datasets in your hdf file.')
        f = h5py.File(args.hdf_data_file, 'r')
        dset = f[args.hdf_dataset_name]
        guess, centroids, labels = bigkmeans.kmeans_hdf(
                dset,
                centroids=guess,
                nclusters=args.nclusters,
                on_cluster_loss=args.on_cluster_loss,
                maxiters=args.maxiters,
                maxrestarts=args.maxrestarts,
                fn_block_update=fn_block_update,
                verbose=args.verbose,
                )
        f.close()

    # write the labels to stdout or to a user-specified file
    if args.labels_out == '-':
        np.savetxt(sys.stdout, labels, '%d')
    elif args.labels_out:
        np.savetxt(args.labels_out, labels, '%d')

    # optionally write the centroids
    if args.centroids_out == '-':
        np.savetxt(sys.stdout, centroids)
    elif args.centroids_out:
        np.savetxt(args.centroids_out, centroids)
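
The comment in main about disallowing stdin reflects the one-pass-per-iteration access pattern of kmeans_ooc. The loop below is only a sketch of that pattern, not the library's confirmed implementation (update_block is a hypothetical callback, and numpy is assumed imported as np, as elsewhere in this file); it shows why a seekable file is required:

def one_pass_per_iteration(f_stream, niters, update_block):
    # a rough sketch: each Lloyd iteration rewinds the stream
    # and consumes the whole file once; sys.stdin cannot seek
    for _ in range(niters):
        f_stream.seek(0)
        for line in f_stream:
            row = np.array(line.split(), dtype=float)
            update_block(row)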
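
For completeness, main() could be wired up roughly as follows; the attribute names match what main() reads, but the exact flags, defaults, and choices are assumptions:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='kmeans for large data sets')
    parser.add_argument('--tabular-data-file',
            help='whitespace-separated numeric data, one row per line')
    parser.add_argument('--hdf-data-file')
    parser.add_argument('--hdf-dataset-name')
    parser.add_argument('--initial-centroids',
            help='text file with one initial centroid per row')
    parser.add_argument('--nclusters', type=int)
    parser.add_argument('--maxiters', type=int)
    parser.add_argument('--maxrestarts', type=int)
    parser.add_argument('--on-cluster-loss')
    parser.add_argument('--inner-loop',
            choices=('pyvqcore', 'scipy', 'python'))
    parser.add_argument('--labels-out', default='-')
    parser.add_argument('--centroids-out')
    parser.add_argument('--verbose', action='store_true')
    main(parser.parse_args())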