def test_random_guesses(self):
    """Smoke-test the simplest invocation of each kmeans backend.

    Runs kmeans with internally generated random initial centroids on
    the small example data set via the numpy, hdf, and out-of-core
    code paths, checking only that no exception is raised.
    """
    data = g_mnemstudio_example_data
    nclusters = 3
    # write an hdf file and create an associated data set,
    # then re-open the file read-only for the tests
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf)
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    try:
        dset = f_hdf['testset']
        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)
        try:
            # check for errors in the simplest invocations
            bigkmeans.kmeans(data, nclusters=nclusters)
            bigkmeans.kmeans_hdf(dset, nclusters=nclusters)
            bigkmeans.kmeans_ooc(f_stream, nclusters=nclusters)
        finally:
            f_stream.close()
    finally:
        # close the hdf file even when a kmeans call raises,
        # so a failing test does not leak the open handle
        f_hdf.close()
def test_fisher_iris_empty_cluster_guess(self):
    # Compare scipy and bigkmeans labelings on the Fisher iris data,
    # starting from an initial centroid guess known to lose a cluster.
    #FIXME: this test is skipped because
    # scipy seems to use a different way to deal with the clusters.
    # perhaps it returns the current labeling when
    # an empty cluster is detected in the next labeling?
    # NOTE(review): no skip decorator or SkipTest raise is visible in
    # this block, so the body appears to actually run -- confirm intent.
    # define the data
    data = np.loadtxt(
            StringIO(g_fisher),
            dtype=float,
            skiprows=1,
            usecols=(1,2,3,4),
            )
    # this guess of initial centroids is known to lead to cluster loss
    guess = g_fisher_bad_guess
    # use a few iterations
    maxiters = 10
    # get the scipy kmeans results
    vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
            data, guess, maxiters)
    # get the bigkmeans numpy results, asking for an early return
    # (rather than an error) when a cluster is lost
    np_init_clust, np_final_clust, np_labels = bigkmeans.kmeans(
            data, centroids=guess, maxiters=maxiters,
            on_cluster_loss=bigkmeans.return_on_cluster_loss)
    # print both labelings for manual inspection of the discrepancy
    print 'scipy labels:'
    print vq_labels
    print
    print 'bigkmeans labels:'
    print np_labels
    print
    # Do some unsupervised clustering on this data set,
    # using more than the three putative clusters.
    self.helper(
            data,
            maxiters=maxiters,
            guess=guess,
            on_cluster_loss=bigkmeans.return_on_cluster_loss)
def test_cluster_loss_errors(self):
    """Check the on_cluster_loss actions for all three kmeans backends.

    Uses data and an initial centroid guess known to eventually lose a
    cluster, checking that the error action raises ClusterLossError on
    every backend and that the benign actions do not raise.
    """
    # this is a data and guess where a cluster is lost eventually
    data = g_pathological_data
    guess = g_pathological_guess
    # write an hdf file and create an associated data set,
    # then re-open the file read-only for the tests
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf)
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    try:
        dset = f_hdf['testset']
        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)
        try:
            # Each backend should raise a ClusterLossError
            # when the error action is requested.
            for fn_kmeans, kmeans_data in (
                    (bigkmeans.kmeans, data),
                    (bigkmeans.kmeans_hdf, dset),
                    (bigkmeans.kmeans_ooc, f_stream),
                    ):
                testing.assert_raises(
                        bigkmeans.ClusterLossError,
                        fn_kmeans,
                        kmeans_data,
                        centroids=guess,
                        on_cluster_loss=bigkmeans.error_on_cluster_loss,
                        )
            # Check that errors are not raised through these calls.
            # Although a large number of restarts may occur...
            benign_action_args = (
                    None,
                    bigkmeans.ignore_cluster_loss,
                    bigkmeans.return_on_cluster_loss,
                    bigkmeans.retry_after_cluster_loss,
                    )
            for fn in benign_action_args:
                bigkmeans.kmeans(
                        data, centroids=guess, on_cluster_loss=fn)
                bigkmeans.kmeans_hdf(
                        dset, centroids=guess, on_cluster_loss=fn)
                bigkmeans.kmeans_ooc(
                        f_stream, centroids=guess, on_cluster_loss=fn)
        finally:
            f_stream.close()
    finally:
        # close the hdf file even when an assertion fails,
        # so a failing test does not leak the open handle
        f_hdf.close()
def helper(
        self,
        data,
        maxiters=None,
        guess=None,
        nclusters=None,
        on_cluster_loss=None,
        ):
    """Check that all bigkmeans backends agree with scipy kmeans2.

    Runs each vector-quantization inner loop through the numpy, hdf,
    and out-of-core backends and asserts that the final centroids and
    labels of every combination match the scipy reference results.

    @param data: 2d array of observations, one row per observation
    @param maxiters: maximum number of kmeans iterations, or None
    @param guess: initial centroids, or None to sample rows of data
    @param nclusters: number of clusters; used only when guess is None
    @param on_cluster_loss: bigkmeans cluster-loss action, or None
    """
    # if no guess has been provided then we make a guess
    # by sampling nclusters distinct rows of the data
    if guess is None:
        M, N = data.shape
        indices = sorted(random.sample(xrange(M), nclusters))
        guess = data[indices, :]
    # write an hdf file and create an associated data set,
    # then re-open the file read-only for the tests
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf)
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    try:
        dset = f_hdf['testset']
        # write a tabular text file and re-open the file
        name_stream = get_tmp_filename()
        np.savetxt(name_stream, data)
        f_stream = open(name_stream)
        try:
            # check results for various vector quantization inner loops
            for fn_block_update in (
                    bigkmeans.lloyd.update_block_pyvqcore,
                    bigkmeans.lloyd.update_block_scipy,
                    bigkmeans.lloyd.update_block_python,
                    ):
                # get the scipy kmeans results as the reference
                vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
                        data, guess, maxiters)
                # get the bigkmeans numpy results
                results = bigkmeans.kmeans(
                        data,
                        centroids=guess,
                        maxiters=maxiters,
                        on_cluster_loss=on_cluster_loss,
                        fn_block_update=fn_block_update,
                        )
                np_init_clust, np_final_clust, np_labels = results
                # get the bigkmeans hdf results
                results = bigkmeans.kmeans_hdf(
                        dset,
                        centroids=guess,
                        maxiters=maxiters,
                        on_cluster_loss=on_cluster_loss,
                        fn_block_update=fn_block_update,
                        )
                hdf_init_clust, hdf_final_clust, hdf_labels = results
                # get the bigkmeans tabular text-based out-of-core results
                results = bigkmeans.kmeans_ooc(
                        f_stream,
                        centroids=guess,
                        maxiters=maxiters,
                        on_cluster_loss=on_cluster_loss,
                        fn_block_update=fn_block_update,
                        )
                ooc_init_clust, ooc_final_clust, ooc_labels = results
                # check that the outputs are the same for all methods
                for labels, final_clust in (
                        (np_labels, np_final_clust),
                        (hdf_labels, hdf_final_clust),
                        (ooc_labels, ooc_final_clust),
                        ):
                    testing.assert_allclose(vq_final_clust, final_clust)
                    testing.assert_allclose(vq_labels, labels)
        finally:
            f_stream.close()
    finally:
        # close the hdf file even when an assertion fails,
        # so a failing test does not leak the open handle
        f_hdf.close()