# Baseline: k-means (KM) and spectral clustering (SC) on the Pendigits dataset.
import gzip
import cPickle
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.manifold import SpectralEmbedding
from cluster_acc import acc

filename = 'pendigits.pkl.gz'
path = '/home/bo/Data/Pendigits/'
dataset = path + filename
# K (number of clusters) and trials (number of random restarts) are assumed
# to be set earlier in the script.

# perform KM
with gzip.open(dataset, 'rb') as f:
    train_x, train_y = cPickle.load(f)

km_model = KMeans(n_clusters=K, n_init=1)
results_KM = np.zeros((trials, 3))
for i in range(trials):
    ypred = km_model.fit_predict(train_x)
    # note: the adjusted variant of mutual information is used here
    nmi = metrics.adjusted_mutual_info_score(train_y, ypred)
    ari = metrics.adjusted_rand_score(train_y, ypred)
    ac = acc(ypred, train_y)
    results_KM[i] = np.array([nmi, ari, ac])

KM_mean = np.mean(results_KM, axis=0)
KM_std = np.std(results_KM, axis=0)

# Perform SC: spectral embedding followed by k-means on the embedded points
print('SC started...')
results_SC = np.zeros((trials, 3))
se_model = SpectralEmbedding(n_components=K, affinity='rbf', gamma=0.1)
se_vec = se_model.fit_transform(train_x)
for i in range(trials):
    ypred = km_model.fit_predict(se_vec)
    nmi = metrics.adjusted_mutual_info_score(train_y, ypred)
    ari = metrics.adjusted_rand_score(train_y, ypred)
    ac = acc(ypred, train_y)
    results_SC[i] = np.array([nmi, ari, ac])
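# The acc() helper imported from cluster_acc above is not shown in this
# section. Below is a minimal sketch of a Hungarian-matching clustering
# accuracy with the same (ypred, y_true) interface; the name hungarian_acc
# and the use of scipy.optimize.linear_sum_assignment are illustrative
# assumptions, not the repository's actual implementation.
from scipy.optimize import linear_sum_assignment

def hungarian_acc(ypred, y_true):
    """Best-map accuracy: match predicted clusters to true labels, then score."""
    ypred = np.asarray(ypred, dtype=np.int64)
    y_true = np.asarray(y_true, dtype=np.int64)
    D = max(ypred.max(), y_true.max()) + 1
    # w[i, j] counts samples placed in predicted cluster i with true label j
    w = np.zeros((D, D), dtype=np.int64)
    for p, t in zip(ypred, y_true):
        w[p, t] += 1
    # maximizing the matched count == minimizing the negated contingency matrix
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / float(ypred.size)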
# Evaluate the labels produced by a fitted DP model (my_dp.z) against the
# ground truth (train_label).
import numpy as np
from sklearn import metrics
from cluster_acc import acc

nmi = metrics.normalized_mutual_info_score(train_label, my_dp.z)
ari = metrics.adjusted_rand_score(train_label, my_dp.z)

# acc() assumes the number of predicted clusters does not exceed the number
# of ground-truth classes; otherwise report 0.
label_uni = len(np.unique(my_dp.z))
label_true_uni = len(np.unique(train_label))
if label_uni <= label_true_uni:
    acc_value = acc(my_dp.z, train_label)
else:
    acc_value = 0
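# A hypothetical convenience wrapper around the checks above; the name
# evaluate_clustering is not part of the repository and is shown only to
# illustrate how the NMI/ARI/ACC triple is computed consistently throughout
# this section.
def evaluate_clustering(pred_labels, true_labels):
    """Return (NMI, ARI, ACC); ACC falls back to 0 when the predicted
    cluster count exceeds the number of ground-truth classes."""
    nmi = metrics.normalized_mutual_info_score(true_labels, pred_labels)
    ari = metrics.adjusted_rand_score(true_labels, pred_labels)
    if len(np.unique(pred_labels)) <= len(np.unique(true_labels)):
        acc_value = acc(pred_labels, true_labels)
    else:
        acc_value = 0
    return nmi, ari, acc_value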
# Standard-library and third-party imports needed by test_SdC; SdC,
# load_data_shared, batch_km and acc are assumed to be defined or imported
# elsewhere in this module.
import os
import sys
import gzip
import cPickle
import timeit
import numpy
import theano
import theano.tensor as T
from sklearn.cluster import KMeans
from sklearn import metrics


def test_SdC(Init='', lbd=.01, output_dir='MNIST_results', save_file='', beta=1,
             finetune_lr=.005, mu=0.9, pretraining_epochs=50, pretrain_lr=.001,
             training_epochs=150, dataset='toy.pkl.gz', batch_size=20, nClass=4,
             hidden_dim=[100, 50, 2], diminishing=True):
    """
    :type Init: string
    :param Init: filename of a saved network; the saved network can be loaded
                 to initialize the model. Leave this parameter as an empty
                 string if no saved network is available. If the specified
                 file cannot be found, the program initializes the network
                 randomly.

    :type lbd: float
    :param lbd: tuning parameter multiplied on the reconstruction error, i.e.
                the larger lbd, the larger the weight on minimizing the
                reconstruction error.

    :type output_dir: string
    :param output_dir: location where the trained network is saved

    :type save_file: string
    :param save_file: filename for saving the trained network

    :type beta: float
    :param beta: parameter for the clustering term; set to 0 if a pure SAE
                 (without clustering regularization) is intended.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetuning stage (factor for
                        the stochastic gradient)

    :type mu: float
    :param mu: extrapolation parameter used for Nesterov-type acceleration

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs of pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate used during pretraining

    :type training_epochs: int
    :param training_epochs: number of epochs of finetuning

    :type dataset: string
    :param dataset: path of the pickled dataset

    :type batch_size: int
    :param batch_size: number of data samples in one minibatch

    :type nClass: int
    :param nClass: number of clusters

    :type hidden_dim: array
    :param hidden_dim: number of neurons in each hidden layer of the forward
                       network; the reconstruction part has a mirror-image
                       structure

    :type diminishing: boolean
    :param diminishing: whether to reduce the learning rate during
                        optimization; if True, the learning rate is halved
                        every 10 epochs.
    """
    datasets = load_data_shared(dataset, batch_size)
    working_dir = os.getcwd()

    train_set_x, train_set_y = datasets[0]
    inDim = train_set_x.get_value().shape[1]
    label_true = numpy.squeeze(numpy.int32(train_set_y.get_value(borrow=True)))

    index = T.lscalar()
    x = T.matrix('x')

    # compute the number of minibatches for training
    n_train_samples = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = n_train_samples // batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    # numpy_rng = numpy.random.RandomState()

    print '... building the model'
    try:
        os.chdir(output_dir)
    except OSError:
        os.mkdir(output_dir)
        os.chdir(output_dir)

    # construct the stacked denoising autoencoder class
    if Init == '':
        sdc = SdC(
            numpy_rng=numpy_rng,
            n_ins=inDim,
            lbd=lbd,
            beta=beta,
            input=x,
            hidden_layers_sizes=hidden_dim
        )
    else:
        try:
            with gzip.open(Init, 'rb') as f:
                saved_params = cPickle.load(f)['network']
            sdc = SdC(
                numpy_rng=numpy_rng,
                n_ins=inDim,
                lbd=lbd,
                beta=beta,
                input=x,
                hidden_layers_sizes=hidden_dim,
                Param_init=saved_params
            )
            print '... loading saved network succeeded'
        except IOError:
            print >> sys.stderr, ('Cannot find the specified saved network, '
                                  'using random initializations.')
            sdc = SdC(
                numpy_rng=numpy_rng,
                n_ins=inDim,
                lbd=lbd,
                beta=beta,
                input=x,
                hidden_layers_sizes=hidden_dim
            )

    #########################
    # PRETRAINING THE MODEL #
    #########################
    if pretraining_epochs == 0 or Init != '':
        print '... skipping pretraining'
    else:
        print '... getting the pretraining functions'
        pretraining_fns = sdc.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size,
                                                    mu=mu)

        print '... pre-training the model'
        start_time = timeit.default_timer()
        # Pre-train layer-wise
        corruption_levels = 0 * numpy.ones(len(hidden_dim), dtype=numpy.float32)
        pretrain_lr_shared = theano.shared(numpy.asarray(pretrain_lr, dtype='float32'),
                                           borrow=True)
        for i in xrange(sdc.n_layers):
            # go through pretraining epochs
            iter = 0
            for epoch in xrange(pretraining_epochs):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    iter = (epoch) * n_train_batches + batch_index
                    pretrain_lr_shared.set_value(numpy.float32(pretrain_lr))
                    cost = pretraining_fns[i](index=batch_index,
                                              corruption=corruption_levels[i],
                                              lr=pretrain_lr_shared.get_value())
                    c.append(cost)
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)

        end_time = timeit.default_timer()
        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))

        network = [param.get_value() for param in sdc.params]
        package = {'network': network}
        with gzip.open('deepclus_' + str(nClass) + '_pretrain.pkl.gz', 'wb') as f:
            cPickle.dump(package, f, protocol=cPickle.HIGHEST_PROTOCOL)

    ########################
    # FINETUNING THE MODEL #
    ########################
    km = KMeans(n_clusters=nClass)

    out = sdc.get_output()
    out_sdc = theano.function(
        [index],
        outputs=out,
        givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]}
    )
    hidden_val = []
    for batch_index in xrange(n_train_batches):
        hidden_val.append(out_sdc(batch_index))

    hidden_array = numpy.asarray(hidden_val)
    hidden_size = hidden_array.shape
    hidden_array = numpy.reshape(hidden_array,
                                 (hidden_size[0] * hidden_size[1], hidden_size[2]))

    # diagnostic: count exact zeros per hidden unit
    hidden_zero = numpy.zeros_like(hidden_array)
    zeros_count = numpy.sum(numpy.equal(hidden_array, hidden_zero), axis=0)

    # Do a k-means clustering on the hidden representation to get the initial centers
    km_idx = km.fit_predict(hidden_array)
    centers = km.cluster_centers_.astype(numpy.float32)
    center_shared = theano.shared(numpy.zeros((batch_size, hidden_dim[-1]),
                                              dtype='float32'),
                                  borrow=True)

    nmi = metrics.normalized_mutual_info_score(label_true, km_idx)
    print >> sys.stderr, ('Initial NMI for deep clustering: %.2f' % (nmi))
    ari = metrics.adjusted_rand_score(label_true, km_idx)
    print >> sys.stderr, ('ARI for deep clustering: %.2f' % (ari))
    try:
        ac = acc(km_idx, label_true)
    except AssertionError:
        ac = 0
        print('Number of predicted clusters does not match the ground truth.')
    print >> sys.stderr, ('ACC for deep clustering: %.2f' % (ac))

    lr_shared = theano.shared(numpy.asarray(finetune_lr, dtype='float32'),
                              borrow=True)

    print '... getting the finetuning functions'
    train_fn = sdc.build_finetune_functions(
        datasets=datasets,
        centers=center_shared,
        batch_size=batch_size,
        mu=mu,
        learning_rate=lr_shared
    )
    print '... finetuning the model'
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    res_metrics = numpy.zeros((training_epochs // 5 + 1, 3), dtype=numpy.float32)
    res_metrics[0] = numpy.array([nmi, ari, ac])

    count = 100 * numpy.ones(nClass, dtype=numpy.int)

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1

        c = []  # total cost
        d = []  # cost of reconstruction
        e = []  # cost of clustering
        f = []  # learning rate
        g = []  # number of assigned data samples
        # used for random re-initialization of a centroid if an empty cluster occurs
        count_samples = numpy.zeros((nClass))

        for minibatch_index in xrange(n_train_batches):
            # calculate the stepsize
            iter = (epoch - 1) * n_train_batches + minibatch_index
            lr_shared.set_value(numpy.float32(finetune_lr))
            center_shared.set_value(
                centers[km_idx[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]])
            # lr_shared.set_value(numpy.float32(finetune_lr / numpy.sqrt(epoch)))
            cost = train_fn(minibatch_index)
            # get the hidden values, to update the k-means centers
            hidden_val = out_sdc(minibatch_index)

            # perform a mini-batch k-means update
            temp_idx, centers, count = batch_km(hidden_val, centers, count)
            # for i in range(nClass):
            #     count_samples[i] += temp_idx.shape[0] - numpy.count_nonzero(temp_idx - i)
            # center_shared.set_value(numpy.float32(temp_center))
            km_idx[minibatch_index * batch_size: (minibatch_index + 1) * batch_size] = temp_idx
            c.append(cost[0])
            d.append(cost[1])
            e.append(cost[2])
            f.append(cost[3])
            g.append(cost[4])

        # check if an empty cluster occurs; if so, randomly re-initialize it
        # for i in range(nClass):
        #     if count_samples[i] == 0:
        #         rand_idx = numpy.random.randint(low=0, high=n_train_samples)
        #         # modify the centroid
        #         centers[i] = out_single(rand_idx)

        print 'Fine-tuning epoch %d ++++ \n' % (epoch),
        print ('Total cost: %.5f, ' % (numpy.mean(c)) +
               'Reconstruction: %.5f, ' % (numpy.mean(d)) +
               'Clustering: %.5f, ' % (numpy.mean(e)))
        # print 'Learning rate: %.6f' % numpy.mean(f)

        # halve the learning rate every 10 epochs
        if epoch % 10 == 0 and diminishing == True:
            finetune_lr /= 2

        # evaluate the clustering performance every 5 epochs
        if epoch % 5 == 0:
            nmi = metrics.normalized_mutual_info_score(label_true, km_idx)
            ari = metrics.adjusted_rand_score(label_true, km_idx)
            try:
                ac = acc(km_idx, label_true)
            except AssertionError:
                ac = 0
                print('Number of predicted clusters does not match the ground truth.')
            res_metrics[epoch // 5] = numpy.array([nmi, ari, ac])

    # get the hidden values, to make a plot
    hidden_val = []
    for batch_index in xrange(n_train_batches):
        hidden_val.append(out_sdc(batch_index))

    hidden_array = numpy.asarray(hidden_val)
    hidden_size = hidden_array.shape
    hidden_array = numpy.reshape(hidden_array,
                                 (hidden_size[0] * hidden_size[1], hidden_size[2]))

    err = numpy.mean(d)
    print >> sys.stderr, ('Average squared 2-D reconstruction error: %.4f' % err)
    end_time = timeit.default_timer()
    ypred = km_idx

    nmi = metrics.normalized_mutual_info_score(label_true, ypred)
    print >> sys.stderr, ('NMI for deep clustering: %.2f' % (nmi))
    ari = metrics.adjusted_rand_score(label_true, ypred)
    print >> sys.stderr, ('ARI for deep clustering: %.2f' % (ari))
    try:
        ac = acc(ypred, label_true)
    except AssertionError:
        ac = 0
        print('Number of predicted clusters does not match the ground truth.')
    print >> sys.stderr, ('ACC for deep clustering: %.2f' % (ac))

    config = {'lbd': lbd,
              'beta': beta,
              'pretraining_epochs': pretraining_epochs,
              'pretrain_lr': pretrain_lr,
              'mu': mu,
              'finetune_lr': finetune_lr,
              'training_epochs': training_epochs,
              'dataset': dataset,
              'batch_size': batch_size,
              'nClass': nClass,
              'hidden_dim': hidden_dim}
    results = {'result': res_metrics}
    network = [param.get_value() for param in sdc.params]
    package = {'config': config, 'results': results, 'network': network}
    with gzip.open(save_file, 'wb') as f:
        cPickle.dump(package, f, protocol=cPickle.HIGHEST_PROTOCOL)

    os.chdir(working_dir)
    print >> sys.stderr, ('The training code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return res_metrics
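# The batch_km() routine used in the fine-tuning loop above is defined
# elsewhere in the repository. The following is only a sketch of a
# Sculley-style mini-batch k-means update with the same
# (data, centers, counts) -> (assignments, centers, counts) interface;
# the name batch_km_sketch and the per-center step size 1/count are
# illustrative assumptions, not the repository's actual implementation.
# Keeping a running count per center gives each centroid a diminishing
# step size, so later minibatches do not overwrite earlier assignments.
def batch_km_sketch(data, centers, counts):
    N = data.shape[0]
    idx = numpy.zeros(N, dtype=numpy.int32)
    # assignment step: nearest center (squared Euclidean distance) per sample
    for i in range(N):
        dist = numpy.sum((centers - data[i]) ** 2, axis=1)
        idx[i] = numpy.argmin(dist)
    # update step: per-center counts yield diminishing step sizes
    for i in range(N):
        j = idx[i]
        counts[j] += 1
        eta = 1.0 / counts[j]
        centers[j] = (1.0 - eta) * centers[j] + eta * data[i]
    return idx, centers, counts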