def print_stats(self, x, y, x_test, y_test, loss, epoch, logwriter, prefix, stats_path=None):
    """Log clustering metrics for one epoch and report constraint satisfaction.

    Predicts cluster assignments for ``x`` with ``self.model``, writes one
    row (iter/acc/nmi/ari/L/Lc/Lr) to ``logwriter``, computes the fraction
    of satisfied pairwise constraints, prints a summary line and optionally
    appends a ``;``-separated record to ``stats_path``.

    Args:
        x: Training inputs passed to ``self.model.predict``.
        y: Ground-truth labels for ``x``.
        x_test: Optional held-out inputs; test metrics are only computed
            when both ``x_test`` and ``y_test`` are given.
        y_test: Optional ground-truth labels for ``x_test``.
        loss: Sequence of at least three loss values (indexed 0..2).
        epoch: Value stored in the ``iter`` column of the log row.
        logwriter: Object with a ``writerow(dict)`` method.
        prefix: String tag printed first and written to the stats record.
        stats_path: Optional path of a text file to append the record to.

    Returns:
        The fraction of satisfied constraints (0.0 when there are none).
    """
    # NOTE(review): ml_ind1, ml_ind2, cl_ind1 and cl_ind2 are not parameters
    # of this method; they are looked up in the enclosing/module scope.
    # Confirm they are defined there before this method is called.
    q, _ = self.model.predict(x, verbose=0)

    # evaluate the clustering performance
    y_pred = q.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    loss = np.round(loss, 5)
    logwriter.writerow(dict(iter=epoch, acc=acc, nmi=nmi, ari=ari,
                            L=loss[0], Lc=loss[1], Lr=loss[2]))

    # compute constraints satisfaction
    sat = 0.0
    if ml_ind1 is not None and cl_ind1 is not None:
        # Use the same lengths for the guard and the denominator.
        n_constraints = len(ml_ind1) + len(cl_ind1)
        if n_constraints > 0:
            # must-link pairs are satisfied when both ends share a cluster
            sat += sum(1.0 for a, b in zip(ml_ind1, ml_ind2)
                       if y_pred[a] == y_pred[b])
            # cannot-link pairs are satisfied when the ends differ
            sat += sum(1.0 for a, b in zip(cl_ind1, cl_ind2)
                       if y_pred[a] != y_pred[b])
            sat /= float(n_constraints)

    # Default the test metrics so the summary below never hits an unbound
    # name when no test set is supplied.
    acc_test = nmi_test = ari_test = None
    if x_test is not None and y_test is not None:
        q_test, _ = self.model.predict(x_test, verbose=0)
        y_pred_test = q_test.argmax(1)
        acc_test = np.round(cluster_acc(y_test, y_pred_test), 5)
        nmi_test = np.round(
            metrics.normalized_mutual_info_score(y_test, y_pred_test), 5)
        ari_test = np.round(
            metrics.adjusted_rand_score(y_test, y_pred_test), 5)

    print(prefix, ' sat: ', sat, 'ari:', ari, 'acc:', acc, 'nmi:', nmi,
          '  ### ari_test:', ari_test, 'acc_test:', acc_test,
          'nmi_test:', nmi_test)

    if stats_path is not None:
        fields = [self.dataset_name, prefix, self.save_suffix, str(sat),
                  str(ari), str(acc), str(nmi), str(ari_test),
                  str(acc_test), str(nmi_test)]
        with open(stats_path, "a+") as stats_file:
            stats_file.write(';'.join(fields) + '\n')
    return sat
def train_dec(x, y, n_clusters, save_dir):
    """Load a trained DEC model from ``save_dir`` or train one, then print accuracy.

    A DEC model is built for the feature width of ``x`` and initialised from
    the autoencoder weights at ``'../DEC-keras/ae_weights_snh.h5'``. If
    ``save_dir + '/DEC_model_final.h5'`` can be loaded, it is used to predict
    clusters directly; if loading raises ``IOError`` the model is trained
    with ``clustering`` and the elapsed time is printed.

    Args:
        x: Input samples; ``x.shape[-1]`` is used as the input dimension.
        y: Ground-truth labels, used for logging and the final accuracy.
        n_clusters: Number of clusters for the DEC model.
        save_dir: Directory holding (or receiving) the trained weights.
    """
    batch_size = 256
    lr = 0.01
    momentum = 0.9
    tol = 0.001
    maxiter = 3e4
    update_interval = 1e3

    dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10],
              n_clusters=n_clusters, batch_size=batch_size)
    dec.initialize_model(optimizer=SGD(lr=lr, momentum=momentum),
                         ae_weights='../DEC-keras/ae_weights_snh.h5', x=x)

    # The model built above is ``dec``; the calls below previously went to
    # an undefined ``dec_snh``.
    try:
        dec.load_weights(save_dir + '/DEC_model_final.h5')
        y_pred = dec.predict_clusters(x)
    except IOError:
        # no saved weights available: train from the pretrained autoencoder
        t0 = time()
        y_pred = dec.clustering(x, y=y, tol=tol, maxiter=maxiter,
                                update_interval=update_interval,
                                save_dir=save_dir)
        print('clustering time: ', (time() - t0))
    print('acc:', cluster_acc(y, y_pred))
def idec(dataset="mnist", gamma=0.1, maxiter=2e4, update_interval=20, tol=0.00001, batch_size=256):
    """Run IDEC clustering on one of the supported datasets and print accuracy.

    Args:
        dataset: One of ``'mnist'``, ``'usps'``, ``'stl'`` or ``'cifar_10'``.
        gamma: Passed to ``IDEC.initialize_model``.
        maxiter: Maximum number of training iterations.
        update_interval: Overridden by a per-dataset value for every
            supported dataset (140, 30, 20 and 140 respectively), so the
            argument currently has no effect.
        tol: Stopping tolerance passed to ``IDEC.clustering``.
        batch_size: Mini-batch size; forced to 120 for ``'cifar_10'``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    ae_weights = ("ae_weights/" + dataset + "_ae_weights/" + dataset +
                  "_ae_weights.h5")
    optimizer = SGD(lr=0.01, momentum=0.9)

    from datasets import load_mnist, load_usps, load_stl, load_cifar

    if dataset == 'mnist':  # recommends: n_clusters=10, update_interval=140
        x, y = load_mnist('./data/mnist/mnist.npz')
        update_interval = 140
    elif dataset == 'usps':  # recommends: n_clusters=10, update_interval=30
        x, y = load_usps('data/usps')
        update_interval = 30
    elif dataset == "stl":
        x, y = load_stl()
        update_interval = 20
    elif dataset == "cifar_10":
        x, y = load_cifar()
        update_interval = 140
        batch_size = 120
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError("unsupported dataset: %r" % (dataset,))

    # Single-argument print calls produce the same text on Python 2 and 3.
    print('%s %s' % (gamma, dataset))

    # The number of clusters is the number of distinct labels. Counter needs
    # hashable items, so a 2-D label array (rows are arrays) raises
    # TypeError; fall back to its first column in that case.
    try:
        count = Counter(y)
    except TypeError:
        count = Counter(y[:, 0])
    n_clusters = len(count)

    save_dir = 'results/idec_dataset:' + dataset + " gamma:" + str(gamma)

    # prepare the IDEC model (named ``model`` so it does not shadow this
    # function's own name)
    model = IDEC(dims=[x.shape[-1], 500, 500, 2000, 10],
                 n_clusters=n_clusters, batch_size=batch_size)
    model.initialize_model(ae_weights=ae_weights, gamma=gamma,
                           optimizer=optimizer)
    plot_model(model.model, to_file='idec_model.png', show_shapes=True)
    model.model.summary()

    # begin clustering, time not include pretraining part.
    t0 = time()
    y_pred = model.clustering(x, y=y, tol=tol, maxiter=maxiter,
                              update_interval=update_interval,
                              save_dir=save_dir)
    print('acc: %s' % (cluster_acc(y, y_pred),))
    print('clustering time:  %s' % (time() - t0,))
def main():
    """Train DEC on growing prefixes of MNIST and print time and accuracy.

    One DEC model object is created and, for each training-set size, is
    re-initialised from the matching ``./ae_weights_m<size>.h5`` file and
    clustered on the first ``size`` samples. Results for each size are
    saved under ``./results/dec/<size>``.
    """
    # hyper-parameters
    n_clusters = 10   # chosen based on prior knowledge of classes in the data set
    batch_size = 256
    lr = 0.01         # learning rate
    momentum = 0.9
    tol = 0.001       # stop once fewer than this fraction of samples change cluster
    maxiter = 2e4
    update_interval = 140
    save_dir = './results/dec'

    x, y = load_mnist()

    # prepare the DEC model
    dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10],
              n_clusters=n_clusters, batch_size=batch_size)

    for n_samples in (500, 1000, 5000, 10000, 50000):
        x_subset = x[:n_samples]
        y_subset = y[:n_samples]
        weights_path = './ae_weights_m%d.h5' % n_samples
        sgd = SGD(lr=lr, momentum=momentum)
        dec.initialize_model(optimizer=sgd, ae_weights=weights_path,
                             x=x_subset)

        started = time()
        run_dir = '%s/%d' % (save_dir, n_samples)
        y_pred = dec.clustering(x_subset, y=y_subset, tol=tol,
                                maxiter=maxiter,
                                update_interval=update_interval,
                                save_dir=run_dir)
        elapsed = time() - started
        print('clustering time: ', elapsed)
        print('acc:', cluster_acc(y_subset, y_pred))
def clustering(self, x, y=None, tol=1e-3, update_interval=140, maxiter=2e4, save_dir='./results/idec'):
    """Train the IDEC model on ``x`` and return the final cluster labels.

    Cluster centres are initialised with k-means on ``self.encoder``
    outputs. Training then alternates between refreshing the target
    distribution every ``update_interval`` iterations and one
    ``train_on_batch`` step per iteration. Metrics are appended to
    ``<save_dir>/idec_log.csv`` when ``y`` is given, checkpoints are
    written every five epochs, and the final weights go to
    ``<save_dir>/IDEC_model_final.h5``.

    Args:
        x: Training samples, sliced along the first axis into batches.
        y: Optional ground-truth labels; only used for logging metrics.
        tol: Stop when the fraction of samples whose label changed between
            two updates falls below this value.
        update_interval: Iterations between target-distribution updates.
        maxiter: Maximum number of iterations (converted with ``int``).
        save_dir: Output directory; created if it does not exist.

    Returns:
        The cluster labels from the most recent update.
    """
    import csv
    import os
    import sys

    # Single-argument print calls produce the same text on Python 2 and 3.
    print('Update interval %s' % (update_interval,))
    # Floor division keeps the interval an integer on Python 3 as well; at
    # least one batch per epoch avoids a modulo-by-zero when the data set
    # is smaller than one batch.
    save_interval = max(1, x.shape[0] // self.batch_size) * 5  # 5 epochs
    print('Save interval %s' % (save_interval,))

    # initialize cluster centers using k-means
    print('Initializing cluster centers with k-means.')
    kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(self.encoder.predict(x))
    y_pred_last = y_pred
    self.model.get_layer(name='clustering').set_weights(
        [kmeans.cluster_centers_])

    # logging file
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # ``file()`` only exists on Python 2; ``open`` works on both. The csv
    # module wants binary mode on Python 2 and text mode on Python 3.
    log_mode = 'wb' if sys.version_info[0] < 3 else 'w'

    loss = [0, 0, 0]
    index = 0
    # The context manager closes the log on normal exit, early stop and
    # errors alike.
    with open(save_dir + '/idec_log.csv', log_mode) as logfile:
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                # update the auxiliary target distribution p
                p = self.target_distribution(q)

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if y is not None:
                    acc = np.round(cluster_acc(y, y_pred), 5)
                    nmi = np.round(
                        metrics.normalized_mutual_info_score(y, y_pred), 5)
                    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logwriter.writerow(dict(iter=ite, acc=acc, nmi=nmi,
                                            ari=ari, L=loss[0], Lc=loss[1],
                                            Lr=loss[2]))
                    print('Iter %s : Acc %s , nmi %s , ari %s ; loss= %s'
                          % (ite, acc, nmi, ari, loss))

                # check stop criterion
                if ite > 0 and delta_label < tol:
                    print('delta_label  %s < tol  %s' % (delta_label, tol))
                    print('Reached tolerance threshold. Stopping training.')
                    break

            # train on batch
            start = index * self.batch_size
            if (index + 1) * self.batch_size > x.shape[0]:
                # last (possibly short) batch of the epoch: wrap around
                loss = self.model.train_on_batch(
                    x=x[start:], y=[p[start:], x[start:]])
                index = 0
            else:
                end = start + self.batch_size
                loss = self.model.train_on_batch(
                    x=x[start:end], y=[p[start:end], x[start:end]])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                checkpoint = save_dir + '/IDEC_model_' + str(ite) + '.h5'
                print('saving model to: %s' % (checkpoint,))
                self.model.save_weights(checkpoint)

    # save the trained model
    print('saving model to: %s' % (save_dir + '/IDEC_model_final.h5',))
    self.model.save_weights(save_dir + '/IDEC_model_final.h5')
    return y_pred
# Script section: load the data set selected by ``args.dataset``, build an
# IDEC model from the ``args`` settings, run clustering and print the
# accuracy and elapsed time.
# NOTE(review): ``args`` and ``optimizer`` are defined before this fragment;
# only the 'mnist' branch reassigns ``optimizer`` (to 'adam'). If
# ``args.dataset`` matches none of the branches, ``x``/``y`` are never
# assigned here - confirm the argument parser restricts the choices.
if args.dataset == 'mnist':  # recommends: n_clusters=10, update_interval=140
    x, y = load_mnist()
    optimizer = 'adam'
elif args.dataset == 'usps':  # recommends: n_clusters=10, update_interval=30
    x, y = load_usps('data/usps')
elif args.dataset == 'reutersidf10k':  # recommends: n_clusters=4, update_interval=3
    x, y = load_reuters('data/reuters')

# prepare the IDEC model; the input width is taken from the data
idec = IDEC(dims=[x.shape[-1], 500, 500, 2000, 10], n_clusters=args.n_clusters, batch_size=args.batch_size)
idec.initialize_model(ae_weights=args.ae_weights, gamma=args.gamma, optimizer=optimizer)
# write a diagram of the model to idec_model.png and print its summary
plot_model(idec.model, to_file='idec_model.png', show_shapes=True)
idec.model.summary()

# begin clustering, time not include pretraining part.
t0 = time()
y_pred = idec.clustering(x, y=y, tol=args.tol, maxiter=args.maxiter, update_interval=args.update_interval, save_dir=args.save_dir)
print 'acc:', cluster_acc(y, y_pred)
print 'clustering time: ', (time() - t0)
def fit(self, x, y=None, batch_size=256, maxiter=2e4, tol=1e-3, update_interval=140, ae_weights=None, save_dir='./results/idec'):
    """Pretrain if needed, initialise clusters with k-means and train IDEC.

    Steps:
      1. Pretrain the autoencoder when the model is not yet pretrained and
         no ``ae_weights`` are given; otherwise load ``ae_weights`` if
         provided.
      2. Initialise the clustering layer with k-means centres computed on
         ``self.encoder`` outputs.
      3. Train with ``train_on_batch``, refreshing the target distribution
         every ``update_interval`` iterations and stopping early once the
         fraction of changed labels drops below ``tol``.

    Metrics are written to ``<save_dir>/idec_log.csv`` when ``y`` is given,
    a checkpoint is saved every 50 iterations, and the final weights are
    saved to ``<save_dir>/IDEC_model_final.h5``.

    Args:
        x: Training samples, sliced along the first axis into batches.
        y: Optional ground-truth labels; only used for logging metrics.
        batch_size: Mini-batch size.
        maxiter: Maximum number of iterations (converted with ``int``).
        tol: Early-stopping threshold on the fraction of changed labels.
        update_interval: Iterations between target-distribution updates.
        ae_weights: Optional path of autoencoder weights to load.
        save_dir: Output directory; created if it does not exist.

    Returns:
        ``self.y_pred``, the labels from the most recent update.
    """
    import csv
    import os

    print('Update interval', update_interval)
    # NOTE(review): the checkpoint interval is hard-coded. A value derived
    # from 5 epochs (int(x.shape[0] / batch_size) * 5) used to be computed
    # and then immediately overwritten by 50; only the effective value is
    # kept here.
    save_interval = 50
    print('Save interval', save_interval)

    # Step 1: pretrain
    if not self.pretrained and ae_weights is None:
        print('...pretraining autoencoders using default hyper-parameters:')
        print(' optimizer=\'adam\'; epochs=200')
        self.pretrain(x, batch_size)
        self.pretrained = True
    elif ae_weights is not None:
        self.autoencoder.load_weights(ae_weights)
        print('ae_weights is loaded successfully.')

    # Step 2: initialize cluster centers using k-means
    print('Initializing cluster centers with k-means.')
    kmeans = KMeans(n_clusters=self.n_clusters, n_init=4)
    self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
    y_pred_last = np.copy(self.y_pred)
    self.model.get_layer(name='clustering').set_weights(
        [kmeans.cluster_centers_])

    # Step 3: deep clustering
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    loss = [0, 0, 0]
    index = 0
    # The context manager closes the log on normal exit, early stop and
    # errors alike.
    with open(save_dir + '/idec_log.csv', 'w') as logfile:
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                # update the auxiliary target distribution p
                p = self.target_distribution(q)

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(cluster_acc(y, self.y_pred), 5)
                    nmi = np.round(
                        metrics.normalized_mutual_info_score(y, self.y_pred),
                        5)
                    ari = np.round(
                        metrics.adjusted_rand_score(y, self.y_pred), 5)
                    loss = np.round(loss, 5)
                    logwriter.writerow(
                        dict(iter=ite, acc=acc, nmi=nmi, ari=ari,
                             L=loss[0], Lc=loss[1], Lr=loss[2]))
                    print(
                        'Iter-%d: ACC= %.4f, NMI= %.4f, ARI= %.4f; L= %.5f, Lc= %.5f, Lr= %.5f'
                        % (ite, acc, nmi, ari, loss[0], loss[1], loss[2]))

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    break

            # train on batch
            start = index * batch_size
            if (index + 1) * batch_size > x.shape[0]:
                # last (possibly short) batch of the epoch: wrap around
                loss = self.model.train_on_batch(
                    x=x[start:], y=[p[start:], x[start:]])
                index = 0
            else:
                end = start + batch_size
                loss = self.model.train_on_batch(
                    x=x[start:end], y=[p[start:end], x[start:end]])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                checkpoint = save_dir + '/IDEC_model_' + str(ite) + '.h5'
                print('saving model to: ' + checkpoint)
                self.model.save_weights(checkpoint)

    # save the trained model
    print('saving model to: ' + save_dir + '/IDEC_model_final.h5')
    self.model.save_weights(save_dir + '/IDEC_model_final.h5')
    return self.y_pred
# Script section: compile and fit the IDEC model, print the results, then
# dump the encoder embedding of every sample to a text file.
# NOTE(review): ``idec``, ``args``, ``optimizer``, ``x``, ``y`` and ``t0``
# are defined before this fragment.
idec.compile(loss=['kld', 'mse'], loss_weights=[args.gamma, 1], optimizer=optimizer)
idec.fit(x, y=y, batch_size=args.batch_size, tol=args.tol, maxiter=args.maxiter, update_interval=args.update_interval, ae_weights=args.ae_weights, save_dir=args.save_dir)

# Show the final results
y_pred = idec.y_pred
print(y_pred)
print('acc:', cluster_acc(y, y_pred))
print('clustering time: %d seconds.' % int(time() - t0))

# Write one line per sample to "deep_embedding": a year label followed by
# ':' and the tab-separated embedding values. Labels start at 1999 and
# increase by one per row.
# NOTE(review): presumably each row of ``x`` corresponds to one consecutive
# year - confirm against the data loader.
embed = idec.encoder.predict(x)
year = 1999
emd_file = open("deep_embedding", 'w')
for embedding in embed:
    emd_file.write(str(year) + ':')
    for vals in embedding:
        emd_file.write('\t' + str(vals))
    emd_file.write('\n')
    year += 1
emd_file.close()

# Reset the year counter and open the cluster-assignment output file.
# NOTE(review): the code that writes to and closes ``cl_file`` is not
# visible here.
year = 1999
cl_file = open("deep_clustering", 'w')
def main():
    """Fine-tune a pretrained MNIST DEC model with siamese training.

    Loads MNIST and a pretrained DEC model, uses the first 1% of the
    non-held-out samples as a labelled training set, derives a
    cluster-to-label mapping from it, builds training pairs with
    ``get_pairs_auto`` and trains with ``train_siamese``. Afterwards the
    samples of the next five chunks that fall into the cluster with the
    lowest majority-class fraction are added to the training set and the
    siamese training is repeated for one epoch. Clustering accuracy on the
    validation slice is printed before and after each training round.
    """
    # constants
    batch_size = 256
    lr = 0.01
    momentum = 0.9
    # NOTE: tol, maxiter and update_interval are not referenced again in
    # this function.
    tol = 0.001
    maxiter = 2e4
    update_interval = 140
    n_clusters = 10
    n_classes = 10
    # lcolours and labels are only referenced by the commented-out plotting
    # and callback code below.
    lcolours = ['#D6FF79', '#B0FF92', '#A09BE7', '#5F00BA', '#56CBF9', \
                '#F3C969', '#ED254E', '#CAA8F5', '#D9F0FF', '#46351D']
    labels = [str(i) for i in range(n_clusters)]
    ae_weights = '../../../../DEC-keras/results/mnist/ae_weights.h5'
    dec_weights = '../../../../DEC-keras/results/mnist/%d/DEC_model_final.h5' % n_clusters

    # load mnist data set
    x, y = load_mnist()

    # split the data into training, validation and test sets
    # NOTE(review): the fixed indices below assume load_mnist() returns
    # 70000 samples - confirm.
    m = x.shape[0]
    m = m - 20000  # the last 20000 samples are held out for validation/test
    sample_frac = 0.01
    split = int(sample_frac * m)
    print(split)
    x_train = x[:split]
    y_train = y[:split]
    x_valid = x[50000:60000]
    y_valid = y[50000:60000]
    # x_test / y_test are sliced here but not used below in this function
    x_test = x[60000:]
    y_test = y[60000:]

    # load pretrained DEC model
    dec = load_mnist_dec(x, ae_weights, dec_weights, n_clusters, \
                         batch_size, lr, momentum)

    # predict training set cluster assignments
    y_pred = dec.predict_clusters(x_train)

    # inspect the clustering and simulate volunteer labelling of random sample (the training set)
    cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
        get_cluster_to_label_mapping(y_train, y_pred, n_classes, n_clusters)

    # baseline accuracy on the training and validation slices
    print(cluster_acc(y_train, y_pred))
    y_valid_pred = dec.predict_clusters(x_valid)
    print(cluster_acc(y_valid, y_valid_pred))

    # extract the cluster centres
    cluster_centres = get_cluster_centres(dec)

    # determine current unlabelled samples (marked with -1); y_plot is only
    # referenced by the commented-out plotting and callback code
    y_plot = np.array(y[:m], dtype='int')
    y_plot[split:] = -1

    # reduce embedding to 2D and plot labelled and unlabelled training set samples
    #pca_plot(dec.encoder, x[:m], cluster_centres, y=y_plot, labels=labels, \
    #         lcolours=lcolours)

    # get siamese training pairs
    im, cc, ls, cluster_to_label_mapping = \
        get_pairs_auto(dec, x_train, y_train, cluster_centres, \
                       cluster_to_label_mapping, majority_class_fractions, n_clusters)

    #im, cc, ls, cluster_to_label_mapping = \
    #  get_pairs_auto_with_noise(dec, x_train, y_train, cluster_centres, \
    #    cluster_to_label_mapping, majority_class_fractions, n_clusters)

    # The string literal below is disabled callback set-up kept as a no-op
    # expression statement.
    """
    mcheckpointer = ModelCheckpoint(filepath='saved_models/weights.best..hdf5', \
                                    verbose=1, save_best_only=True)

    base_network = Model(dec.model.input, \
      dec.model.get_layer('encoder_%d' % (dec.n_stacks - 1)).output)
    fcheckpointer = FrameDumpCallback(base_network, x, cluster_centres, \
      './video', y=y_plot, labels=labels, lcolours=lcolours)
    """
    #callbacks = [mcheckpointer, fcheckpointer]
    callbacks = []

    # first round of siamese training on the initial labelled sample
    model, base_network = train_siamese(dec, cluster_centres, im, cc, ls, \
      epochs=5, split_frac=0.75, callbacks=callbacks)

    #model, base_network = train_siamese_online(dec, x, cluster_centres, im, cc, ls, \
    #  epochs=1, split_frac=0.75, callbacks=[])

    # re-evaluate on the validation slice and refresh the mapping from it
    y_pred = dec.predict_clusters(x_valid)
    cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
        get_cluster_to_label_mapping(y_valid, y_pred, n_classes, n_clusters)
    print(cluster_acc(y_valid, y_pred))
    #pca_plot(dec.encoder, x_valid, cluster_centres, y=y_valid, labels=labels, \
    #         lcolours=lcolours)

    # Extend the training set with samples from the next five chunks of
    # size ``split`` that are assigned to the cluster with the smallest
    # majority-class fraction.
    y_pred = dec.predict_clusters(x[:m])
    print(np.argmin(majority_class_fractions))
    # NOTE(review): the extent of this loop is reconstructed; the pair
    # generation and training that follow are assumed to run once after
    # the loop - confirm against the original layout.
    for j in range(1, 6):
        selection = np.where(
            y_pred[j * split:(j + 1) * split] == np.argmin(majority_class_fractions))
        x_train = np.concatenate(
            (x_train, x[:m][j * split:(j + 1) * split][selection]))
        y_train = np.concatenate(
            (y_train, y[:m][j * split:(j + 1) * split][selection]))

    # second round: rebuild the pairs from the enlarged training set and
    # train for one more epoch
    im, cc, ls, cluster_to_label_mapping = \
        get_pairs_auto(dec, x_train, y_train, cluster_centres, \
                       cluster_to_label_mapping, majority_class_fractions, n_clusters)
    callbacks = []
    model, base_network = train_siamese(dec, cluster_centres, im, cc, ls, \
      epochs=1, split_frac=0.75, callbacks=callbacks)

    #x_train = x[:2*split]
    #y_train = y[:2*split]
    #y_pred = dec.predict_clusters(x_train)
    #cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
    #  get_cluster_to_label_mapping(y_train, y_pred, n_classes, n_clusters)

    # final evaluation on the validation slice
    y_pred = dec.predict_clusters(x_valid)
    cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
        get_cluster_to_label_mapping(y_valid, y_pred, n_classes, n_clusters)
    print(cluster_acc(y_valid, y_pred))