def update_target_distribution(self, valid_loader, tol):
    """Recompute the target distribution and report label churn.

    Runs ``self.forward`` over every batch of ``valid_loader``, stores
    ``self.target_distribution(q)`` of the concatenated soft assignments in
    ``self.prop`` and updates ``self.y_pred_last`` / ``self.convergence_iter``.

    Args:
        valid_loader: iterable yielding ``(inputs, targets, _)`` triples.
        tol: threshold on the fraction of changed labels; staying below it
            increments ``self.convergence_iter``, otherwise it is reset to 0.

    Returns:
        ``(labels_changed, acc)`` where ``labels_changed`` is the fraction of
        samples whose arg-max assignment differs from ``self.y_pred_last`` and
        ``acc`` is the first element returned by ``cluster_acc``.
    """
    on_gpu = torch.cuda.is_available()
    if on_gpu:
        self.cuda()

    q_chunks = []
    label_chunks = []
    for inputs, tar, _ in valid_loader:
        if on_gpu:
            inputs = inputs.cuda()
        _, batch_q, _ = self.forward(inputs)
        q_chunks.append(batch_q.data)
        label_chunks.append(tar.cpu().numpy())

    all_q = torch.cat(q_chunks)
    true_labels = np.concatenate(label_chunks)
    self.prop = self.target_distribution(all_q)

    # Evaluate clustering performance against the previous hard assignment.
    y_pred = all_q.cpu().numpy().argmax(1)
    n_changed = np.sum(y_pred != self.y_pred_last).astype(np.float32)
    labels_changed = n_changed / y_pred.shape[0]
    self.y_pred_last = y_pred

    self.convergence_iter = self.convergence_iter + 1 if labels_changed < tol else 0
    return labels_changed, cluster_acc(true_labels, y_pred)[0]
def __initialize_models(self, feat, labels=None):
    """Initialise the cluster layer from K-Means centres and build its optimizer.

    Args:
        feat: 2-D feature array; its first two dimensions are stored as
            ``self.data_size`` and ``self.feat_dim``.
        labels: optional ground-truth labels, only used to report the
            K-Means accuracy via ``cluster_acc``.
    """
    def _say(message):
        # Progress messages are only emitted in verbose mode.
        if self.verbose:
            print(message)

    self.data_size = feat.shape[0]
    self.feat_dim = feat.shape[1]

    _say('Pretraining Cluster Centers by KMeans')
    self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=20,
                         n_jobs=self.max_jobs, verbose=False)
    self.last_pred = self.kmeans.fit_predict(feat)
    if labels is not None:
        kmeans_acc = cluster_acc(labels, self.last_pred)
        _say('KMeans acc is {}'.format(kmeans_acc))

    _say('Building Cluster Layer')
    initial_centers = self.kmeans.cluster_centers_.astype(np.float32)
    self.cluster_layer = ClusterNet(torch.from_numpy(initial_centers))
    if self.use_cuda:
        self.cluster_layer.cuda()

    _say('Building Optimizer')
    self.optimizer = optim.Adam(self.cluster_layer.parameters(), lr=self.lr)
def clustering_creation(df_final, target_column, dataset):
    """Fit K-Means and GMM for several cluster counts and plot diagnostics.

    Three cluster counts are evaluated, evenly spaced between 2 and the number
    of feature columns. For each one both models are fitted on the features
    and scored (K-Means score, AIC/BIC, silhouette scores, ``cluster_acc`` and
    adjusted mutual information); the silhouette plot is drawn per count and
    the summary plots once at the end.

    Args:
        df_final: DataFrame holding the feature columns plus ``target_column``.
        target_column: name of the label column in ``df_final``.
        dataset: dataset identifier used as a key in the result dictionaries
            and forwarded to the plotting helpers.
    """
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]
    # The 1-D view of the labels does not depend on k, so build it once.
    y_true = Y.squeeze(1)
    clusters = np.linspace(2, len(X.columns), 3, dtype=np.int64, endpoint=True)

    SSE = defaultdict(dict)
    ll = defaultdict(lambda: defaultdict(dict))
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    SS = defaultdict(lambda: defaultdict(dict))
    SSS = defaultdict(lambda: defaultdict(dict))

    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # Predict once per fitted model and reuse the labels for every metric
        # instead of re-running inference for each score.
        cluster_labels_km = km.predict(X)
        cluster_labels_gm = gmm.predict(X)

        SSE[k][dataset] = km.score(X)
        ll[k][dataset]['AIC'] = gmm.aic(X)
        ll[k][dataset]['BIC'] = gmm.bic(X)
        SS[k][dataset]['Kmeans'] = cluster_silhouette_score(X, cluster_labels_km)
        SS[k][dataset]['GMM'] = cluster_silhouette_score(X, cluster_labels_gm)
        SSS[k][dataset]['Kmeans'] = cluster_sample_silhouette_score(
            X, cluster_labels_km)
        SSS[k][dataset]['GMM'] = cluster_sample_silhouette_score(
            X, cluster_labels_gm)
        acc[k][dataset]['Kmeans'] = cluster_acc(Y, cluster_labels_km)
        acc[k][dataset]['GMM'] = cluster_acc(Y, cluster_labels_gm)
        adjMI[k][dataset]['Kmeans'] = ami(y_true, cluster_labels_km)
        adjMI[k][dataset]['GMM'] = ami(y_true, cluster_labels_gm)
        print(k)

        plot_silhouette_score(X, SS, SSS, k, dataset,
                              cluster_labels_km, cluster_labels_gm)

    plot_cluster_accuracy(dataset, acc, clusters)
    plot_cluster_information(dataset, adjMI, clusters)
    KMeans_ELBOW(dataset, SSE, clusters)
    BICandAIC(dataset, ll, clusters)
def fit(self, feat, labels=None):
    """Train the cluster layer by matching soft assignments to the target.

    The target distribution is refreshed every ``self.update_interval``
    iterations; on those iterations accuracy is optionally reported and the
    convergence test may stop training. Every iteration performs one
    optimisation step on the next mini-batch, cycling through ``feat``.

    Args:
        feat: 2-D numpy feature array with ``self.data_size`` rows.
        labels: optional ground-truth labels used only for accuracy reports.
    """
    self.__initialize_models(feat, labels=labels)
    self.__update_target_distribute(feat)
    if self.verbose:
        print('Begin to Iterate')

    index = 0
    for ite in range(int(self.maxiter)):
        if ite % self.update_interval == (self.update_interval - 1):
            self.__update_target_distribute(feat)
            tmp_pred_cur = self.__get_label_pred(self.current_q)
            acc = None
            if labels is not None:
                acc = cluster_acc(labels, tmp_pred_cur)
                if self.logger is not None:
                    self.logger.record_acc(acc, ite)
            if self.verbose:
                if acc is not None:
                    print('Iter {} Acc {}'.format(ite, acc))
                else:
                    print('Update Target Distribution in Iter {}'.format(ite))
            if ite > 0 and self.__whether_convergence(tmp_pred_cur, self.last_pred):
                break
            self.last_pred = tmp_pred_cur

        # Select the next mini-batch. ``>=`` makes the batch that ends exactly
        # at the end of the data wrap the cursor, so no empty batch is drawn.
        if index + self.batch_size >= self.data_size:
            feat_batch = feat[index:]
            p_batch = self.current_p[index:]
            index = 0
        else:
            feat_batch = feat[index: index + self.batch_size]
            p_batch = self.current_p[index: index + self.batch_size]
            # Advance the cursor; without this every step reused batch 0.
            index += self.batch_size

        feat_batch = Variable(torch.from_numpy(feat_batch.astype(np.float32)))
        p_batch = Variable(torch.from_numpy(p_batch.astype(np.float32)))
        if self.use_cuda:
            feat_batch = feat_batch.cuda()
            p_batch = p_batch.cuda()

        self.cluster_layer.zero_grad()
        q_batch = self.cluster_layer(feat_batch)
        cluster_loss = F.binary_cross_entropy(q_batch, p_batch)
        if self.logger is not None:
            # ``.data[0]`` fails on 0-dim loss tensors; prefer ``.item()`` and
            # fall back to the legacy accessor when it is unavailable.
            if hasattr(cluster_loss, 'item'):
                loss_value = cluster_loss.item()
            else:
                loss_value = cluster_loss.data[0]
            self.logger.record_loss(loss_value, ite)
        cluster_loss.backward()
        self.optimizer.step()
def update_cluster_acc(self):
    """Refresh the stored clustering quality metrics for the training set.

    Compares ``self.corpus_loader.train_labels`` with
    ``self.current_pred_labels`` and stores the results in
    ``self.current_cluster_acc``, ``self.current_cluster_nmi`` and
    ``self.current_cluster_ari``.
    """
    from sklearn.metrics import adjusted_rand_score
    from sklearn.metrics import normalized_mutual_info_score

    # Convert the reference labels once instead of once per metric.
    true_labels = np.array(self.corpus_loader.train_labels)
    pred_labels = self.current_pred_labels

    self.current_cluster_acc = cluster_acc(true_labels, pred_labels)
    self.current_cluster_nmi = normalized_mutual_info_score(true_labels,
                                                            pred_labels)
    # The ARI attribute used to be filled with the adjusted *mutual
    # information* score; use the adjusted Rand index its name promises.
    self.current_cluster_ari = adjusted_rand_score(true_labels, pred_labels)
def run_eval(config):
    """Runs the evaluation of a deep generative model.

    Restores the latest training checkpoint, makes one pass over the
    evaluation split to accumulate the ELBO (plus clustering and forecasting
    metrics where the model supports them), writes TensorBoard summaries and
    saves qualitative plots. Exits the process with status 1 when the training
    log directory or a checkpoint is missing.

    Args:
        config: A configuration object with config values accessible as properties.
    """
    # Set the random seed for shuffling and sampling
    tf.random.set_seed(config.random_seed)

    gpus = tf.config.experimental.list_physical_devices('GPU')
    # Extract integer GPU IDs
    gpu_ids = list(map(int, config.gpu_num.split(',')))
    # Set the accessible GPUs for evaluation
    try:
        selected_gpus = [gpus[i] for i in gpu_ids]
        for gpu in selected_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # set_visible_devices replaces the visible set on every call, so pass
        # all requested GPUs at once rather than one per loop iteration
        # (which left only the last requested GPU visible).
        tf.config.experimental.set_visible_devices(selected_gpus, 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

    if config.dataset == 'moving_mnist':
        logging.info("Loading the Moving MNIST dataset...")
        dataset, train_mu = create_moving_mnist(config,
                                                split=config.split,
                                                shuffle=False)
        # Convert training set mean to logit space for bias initialisation of generative model
        gen_bias_init = -tf.math.log(
            1. / tf.clip_by_value(train_mu, 0.0001, 0.9999) - 1)
        data_type = 'binary'
    elif config.dataset == 'sprites':
        logging.info("Loading the Sprites dataset...")
        _, dataset, _ = create_lpc_sprites(config, shuffle=False)
        train_mu = tf.zeros(
            [config.patch_size, config.patch_size, config.num_channels],
            dtype=np.float32)
        gen_bias_init = 0.0
        data_type = 'real'

    logging.info("Constructing the unsupervised generative model...")
    model = create_model(config, gen_bias_init, data_type)

    # Set up log directory for loading the pre-trained model
    logdir = '{}/{}/train/{}/h{}_r{}_f{}_z{}/run{}'.format(
        config.logdir, config.dataset, config.model, config.hidden_size,
        config.rnn_size, config.latent_size, config.dynamic_latent_size,
        config.random_seed)
    if not tf.io.gfile.exists(logdir):
        logging.error("No directory {}".format(logdir))
        sys.exit(1)

    # Checkpoint management
    ckpt = tf.train.Checkpoint(model=model,
                               epoch=tf.Variable(0,
                                                 trainable=False,
                                                 dtype=tf.int64))
    manager = tf.train.CheckpointManager(ckpt, directory=logdir, max_to_keep=5)
    ckpt.restore(manager.latest_checkpoint).expect_partial()
    if manager.latest_checkpoint:
        logging.info("Successfully restored from {}".format(
            manager.latest_checkpoint))
        step = int(ckpt.epoch)
        logging.info("At epoch: {}".format(step))
    else:
        logging.error("Failed to restore the model checkpoint.")
        sys.exit(1)

    # Summary directory for evaluation results
    summary_dir = '{}/{}/{}/{}/h{}_r{}_f{}_z{}/run{}'.format(
        config.logdir, config.dataset, config.split, config.model,
        config.hidden_size, config.rnn_size, config.latent_size,
        config.dynamic_latent_size, config.random_seed)
    qualitative_dir = summary_dir + '/qualitative_results'
    if not tf.io.gfile.exists(qualitative_dir):
        tf.io.gfile.makedirs(qualitative_dir)

    # Create summary writer
    summary_writer = tf.summary.create_file_writer(summary_dir)
    summary_writer.set_as_default()

    # Boolean flags to switch between models
    is_clustering = (config.model == 'discvae') or (config.model == 'gmvae')
    logging.info("Clustering? {}".format(is_clustering))
    is_predictive = (config.model == 'discvae') or (config.model == 'vrnn')
    logging.info("Predictive? {}".format(is_predictive))

    # Evaluation metrics
    elbo = tf.keras.metrics.Mean(name='elbo', dtype=tf.float32)
    bce_loss = torch.nn.BCELoss()
    mse_loss = torch.nn.MSELoss()
    bce_results = []
    mse_results = []
    latents = []
    predictions = []
    labels = []

    # Loop over dataset for a single epoch
    for imgs, tgts, lens, labs_all in dataset:
        # Compute bound estimates
        if config.model == 'discvae':
            elbo_per_batch, infer_c, latent_per_batch, _, _ = model.run_model(
                imgs, tgts, lens, num_samples=config.num_samples)
            predictions.extend(infer_c)
            # Sample from the inferred clusters of the GMM
            prior_f, _ = model.sample_static_prior(
                infer_c, num_samples=config.num_samples)
        elif config.model == 'vrnn':
            elbo_per_batch, latent_per_batch, _, _ = model.run_model(
                imgs, tgts, lens, num_samples=config.num_samples)
        elif config.model == 'gmvae':
            elbo_per_batch, infer_c, _, _ = model.run_model(
                tgts, num_samples=config.num_samples)
            predictions.extend(infer_c)
            latent_per_batch, _ = model.sample_prior(
                infer_c, num_samples=config.num_samples)
        else:
            elbo_per_batch, latent_per_batch, _ = model.run_model(
                tgts, num_samples=config.num_samples)

        # Mean integration of MC samples
        latent = tf.reduce_mean(latent_per_batch, axis=0)
        latents.extend(latent)
        # Extend labels for all models
        labels.extend(labs_all)
        # Update elbo metric
        elbo.update_state(elbo_per_batch)

        # Future prediction evaluation if relevant
        if is_predictive:
            input_prefixes = imgs[:config.prefix_length]
            target_prefixes = tgts[:config.prefix_length]
            prefix_lengths = tf.ones_like(lens) * config.prefix_length
            sample_inputs = imgs[config.prefix_length]

            # Run model on prefix input sequences and then conditionally sample forward in time
            if config.model == 'discvae':
                _, _, prefix_f, final_state, _ = model.run_model(
                    input_prefixes,
                    target_prefixes,
                    prefix_lengths,
                    num_samples=config.num_samples)
                # (sample_length, num_samples, batch_size, patch_size, patch_size, num_channels)
                forecasts = model.sample_model(
                    sample_inputs,
                    final_state,
                    inject_f=prefix_f,
                    sample_length=config.sample_length,
                    train_mu=train_mu)
            elif config.model == 'vrnn':
                _, _, final_state, _ = model.run_model(
                    input_prefixes,
                    target_prefixes,
                    prefix_lengths,
                    num_samples=config.num_samples)
                # (sample_length, num_samples, batch_size, patch_size, patch_size, num_channels)
                forecasts = model.sample_model(
                    sample_inputs,
                    final_state,
                    sample_length=config.sample_length,
                    train_mu=train_mu)

            # (sample_length, batch_size, patch_size, patch_size, num_channels)
            forecasts = tf.reduce_mean(forecasts, axis=1)
            ground_truth = tgts[config.prefix_length:config.prefix_length +
                                config.sample_length]

            forecasts_torch = torch.from_numpy(np.array(forecasts))
            ground_truth_torch = torch.from_numpy(np.array(ground_truth))
            mse_score = mse_loss(forecasts_torch, ground_truth_torch)
            # Clamp forecasts away from 0 and 1 before the BCE is evaluated
            eps = 1e-4
            forecasts_torch[forecasts_torch < eps] = eps
            forecasts_torch[forecasts_torch > 1 - eps] = 1 - eps
            bce_score = bce_loss(forecasts_torch, ground_truth_torch)
            # Scale the mean-reduced losses by the number of values per frame
            bce_score = bce_score.item(
            ) * config.patch_size * config.patch_size * config.num_channels
            mse_score = mse_score.item(
            ) * config.patch_size * config.patch_size * config.num_channels
            bce_results.append(bce_score)
            mse_results.append(mse_score)

    latents_np = np.array(latents)
    labels_np = np.array(labels)
    primary_labels = labels_np[:, 0]

    logging.info("Plotting latent code for inferred latent variable...")
    latent_two = utils.reduce_dimensionality(latents_np)
    if config.dataset == 'moving_mnist':
        utils.tsne_visualise(qualitative_dir,
                             step,
                             latent_two,
                             primary_labels,
                             num_colours=10)
    else:
        utils.tsne_visualise(qualitative_dir,
                             step,
                             latent_two,
                             primary_labels,
                             num_colours=9)

    # Save summaries following evaluation over 'split' set
    tf.summary.scalar(config.split + '/elbo', elbo.result(), step=ckpt.epoch)

    # If a clustering model then report on the metric
    if is_clustering:
        predictions_np = np.array(predictions)
        test_acc = utils.cluster_acc(predictions_np, primary_labels)
        tf.summary.scalar(config.split + '/acc',
                          test_acc * 100,
                          step=ckpt.epoch)
        test_nmi = utils.compute_NMI(predictions_np, primary_labels)
        tf.summary.scalar(config.split + '/nmi', test_nmi, step=ckpt.epoch)

    # If a predictive model then report on relevant metrics
    if is_predictive:
        tf.summary.scalar(config.split + '/bce',
                          np.mean(bce_results),
                          step=ckpt.epoch)
        tf.summary.scalar(config.split + '/mse',
                          np.mean(mse_results),
                          step=ckpt.epoch)

    # Perform full qualitative analysis only if the model is DiSCVAE
    if config.model == 'discvae':
        logging.info(
            "Plotting density estimates of component samples from model prior..."
        )
        component_f, learnt_prior = model.sample_static_prior(num_samples=250)
        flattened_component_f = tf.reshape(component_f,
                                           [-1, config.latent_size])
        component_f_two = utils.reduce_dimensionality(flattened_component_f)
        utils.plot_density(qualitative_dir, step, component_f_two)

        # Create plots of sampled states from fixed 'f' samples
        for imgs, tgts, lens, _ in dataset.take(1):
            # Take random batch example but maintain batch dimension
            rand_batch = np.random.randint(config.batch_size)
            inputs = tf.expand_dims(imgs[:, rand_batch], axis=1)
            targets = tf.expand_dims(tgts[:, rand_batch], axis=1)

            # Run model through this single batched prefix sequence
            input_prefixes = inputs[:config.prefix_length]
            target_prefixes = targets[:config.prefix_length]
            _, infer_c, prefix_f, final_state, _ = model.run_model(
                input_prefixes, target_prefixes, [config.prefix_length])

            # Sample forward from model to obtain conditionally generated predictions
            sample_inputs = inputs[config.prefix_length]
            ground_truth = targets[config.prefix_length:config.prefix_length +
                                   config.sample_length]
            # (sample_length, 1, batch_size, patch_size, patch_size, num_channels)
            forecast_samples = model.sample_model(
                sample_inputs,
                final_state,
                inject_f=prefix_f,
                sample_length=config.sample_length,
                train_mu=train_mu)

            # Extract mean of 'num_samples' from each cluster (1, K, latent_size)
            inject_f = tf.reduce_mean(component_f[:config.num_samples],
                                      axis=0,
                                      keepdims=True)
            # Reshape to have single batch size (1, 1, K, latent_size)
            inject_f = tf.expand_dims(inject_f, axis=1)

            # Sampled states from each cluster
            inject_samples = [None] * config.mixture_components
            for k in range(config.mixture_components):
                inject_samples[k] = model.sample_model(
                    sample_inputs,
                    final_state,
                    inject_f=inject_f[:, :, k],
                    sample_length=config.sample_length,
                    train_mu=train_mu).numpy()

            logging.info(
                "Plotting sampled sequence ground truth and forecasts...")
            utils.plot_video_sequence(qualitative_dir,
                                      step,
                                      target_prefixes.numpy(),
                                      name='prefixes')
            utils.plot_video_sequence(qualitative_dir,
                                      step,
                                      ground_truth.numpy(),
                                      name='ground_truth')
            utils.plot_video_sequence(qualitative_dir,
                                      step,
                                      forecast_samples.numpy(),
                                      name='forecasts')
            logging.info(
                "Plotting forecasted states of fixed samples from each cluster..."
            )
            utils.plot_k_samples(qualitative_dir,
                                 step,
                                 ground_truth.numpy(),
                                 inject_samples,
                                 infer_c[0].numpy(),
                                 num_k_display=config.mixture_components)

            # Feature swapping and fixing for reconstruction only
            recons = model.reconstruct(imgs,
                                       tgts,
                                       lens,
                                       num_samples=config.num_samples)
            swapped_f = model.reconstruct(imgs,
                                          tgts,
                                          lens,
                                          swap_f=True,
                                          num_samples=config.num_samples)
            swapped_z = model.reconstruct(imgs,
                                          tgts,
                                          lens,
                                          swap_z=True,
                                          num_samples=config.num_samples)
            logging.info("Plotting reconstructions and swapped features...")
            utils.plot_batch_sequence(qualitative_dir,
                                      step,
                                      tgts.numpy(),
                                      name='original')
            utils.plot_batch_sequence(qualitative_dir,
                                      step,
                                      recons.numpy(),
                                      name='reconstructions')
            utils.plot_batch_sequence(qualitative_dir,
                                      step,
                                      swapped_f.numpy(),
                                      name='swapped_f')
            utils.plot_batch_sequence(qualitative_dir,
                                      step,
                                      swapped_z.numpy(),
                                      name='swapped_z')

    # Force flush the summary writer after testing
    summary_writer.flush()
def run_train(config):
    """Runs the training of a deep generative model.

    Builds the model and optimiser, restores the latest checkpoint when one
    exists, then for ``config.num_epochs`` epochs maximises the ELBO on the
    training set, evaluates on the ``config.split`` set, logs aggregate
    metrics and periodically saves checkpoints.

    Args:
        config: A configuration object with config values accessible as properties.
    """
    # Set the random seed for shuffling and sampling
    tf.random.set_seed(config.random_seed)

    gpus = tf.config.experimental.list_physical_devices('GPU')
    # Extract integer GPU IDs
    gpu_ids = list(map(int, config.gpu_num.split(',')))
    # Set the accessible GPUs for training
    try:
        selected_gpus = [gpus[i] for i in gpu_ids]
        for gpu in selected_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # set_visible_devices replaces the visible set on every call, so pass
        # all requested GPUs at once rather than one per loop iteration
        # (which left only the last requested GPU visible).
        tf.config.experimental.set_visible_devices(selected_gpus, 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

    if config.dataset == 'moving_mnist':
        logging.info("Loading the Moving MNIST dataset...")
        train_ds, train_mu = create_moving_mnist(config,
                                                 split='train',
                                                 shuffle=True)
        test_ds, _ = create_moving_mnist(config,
                                         split=config.split,
                                         shuffle=False)
        # Convert training set mean to logit space for bias initialisation of generative model
        gen_bias_init = -tf.math.log(
            1. / tf.clip_by_value(train_mu, 0.0001, 0.9999) - 1)
        data_type = 'binary'
    elif config.dataset == 'sprites':
        logging.info("Loading the Sprites dataset...")
        train_ds, test_ds, _ = create_lpc_sprites(config, shuffle=True)
        gen_bias_init = 0.0
        data_type = 'real'

    logging.info("Constructing the unsupervised generative model...")
    model = create_model(config, gen_bias_init, data_type)

    # Set up the optimiser
    opt = tf.keras.optimizers.Adam(config.learning_rate,
                                   clipnorm=config.clip_norm)

    # Set up log directory for saving checkpoints
    logdir = '{}/{}/train/{}/h{}_r{}_f{}_z{}/run{}'.format(
        config.logdir, config.dataset, config.model, config.hidden_size,
        config.rnn_size, config.latent_size, config.dynamic_latent_size,
        config.random_seed)
    if not tf.io.gfile.exists(logdir):
        logging.info("Creating log directory at {}".format(logdir))
        tf.io.gfile.makedirs(logdir)

    # Checkpoint management
    ckpt = tf.train.Checkpoint(model=model,
                               epoch=tf.Variable(0,
                                                 trainable=False,
                                                 dtype=tf.int64),
                               step=tf.Variable(0,
                                                trainable=False,
                                                dtype=tf.int64),
                               optimizer=opt)
    manager = tf.train.CheckpointManager(ckpt, directory=logdir, max_to_keep=5)
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        logging.info("Restored from {}".format(manager.latest_checkpoint))
    else:
        logging.info("Initialising from scratch...")

    # Create summary writer
    summary_writer = tf.summary.create_file_writer(logdir + '/summaries')
    summary_writer.set_as_default()

    # Boolean flags to switch between models
    is_clustering = (config.model == 'discvae') or (config.model == 'gmvae')
    logging.info("Clustering? {}".format(is_clustering))

    # Training aggregate metrics
    train_elbo = tf.keras.metrics.Mean(name='train_elbo', dtype=tf.float32)
    test_elbo = tf.keras.metrics.Mean(name=config.split + '_elbo',
                                      dtype=tf.float32)

    for i in range(config.num_epochs):
        # Reset the metrics at the start of the next epoch
        train_elbo.reset_states()
        test_elbo.reset_states()

        # Lists for predictions and labels
        train_predictions = []
        train_labels = []
        test_predictions = []
        test_labels = []

        # Loop over training set
        for imgs, tgts, lens, labs in train_ds:
            with tf.GradientTape() as tape:
                # Run the model to compute the ELBO objective and reconstructions
                if config.model == 'discvae':
                    elbo, infer_c, _, _, recons = model.run_model(
                        imgs,
                        tgts,
                        lens,
                        ckpt.step,
                        num_samples=config.num_samples)
                    train_predictions.extend(infer_c)
                    train_labels.extend(labs)
                elif config.model == 'vrnn':
                    elbo, _, _, recons = model.run_model(
                        imgs,
                        tgts,
                        lens,
                        ckpt.step,
                        num_samples=config.num_samples)
                elif config.model == 'gmvae':
                    elbo, infer_c, _, recons = model.run_model(
                        tgts, ckpt.step, num_samples=config.num_samples)
                    train_predictions.extend(infer_c)
                    train_labels.extend(labs)
                else:
                    elbo, _, recons = model.run_model(
                        tgts, ckpt.step, num_samples=config.num_samples)

            # Compute gradients of operations with respect to model variables
            grads = tape.gradient(-elbo, model.variables)
            # Maximise ELBO objective
            opt.apply_gradients(list(zip(grads, model.variables)))
            # Update metrics
            train_elbo.update_state(elbo)

            if (ckpt.step % config.summarise_every == 0):
                # Transpose for summary visualisations
                inputs_viz = tf.transpose(tgts, perm=[1, 0, 2, 3, 4])
                recons_viz = tf.transpose(recons, perm=[1, 0, 2, 3, 4])
                # Only take 4 example reconstructions
                combined = tf.concat((inputs_viz[:4], recons_viz[:4]), axis=0)
                utils.image_seq_summary(combined,
                                        'reconstructions',
                                        step=ckpt.step)

            # Increment global step
            ckpt.step.assign_add(1)

        # Loop over test set
        for imgs, tgts, lens, labs in test_ds:
            # Acquire test set metrics from computed loss tensors
            if config.model == 'discvae':
                elbo, infer_c, _, _, _ = model.run_model(
                    imgs, tgts, lens, num_samples=config.num_samples)
                test_predictions.extend(infer_c)
                test_labels.extend(labs)
            elif config.model == 'vrnn':
                elbo, _, _, _ = model.run_model(imgs,
                                                tgts,
                                                lens,
                                                num_samples=config.num_samples)
            elif config.model == 'gmvae':
                elbo, infer_c, _, _ = model.run_model(
                    tgts, num_samples=config.num_samples)
                test_predictions.extend(infer_c)
                test_labels.extend(labs)
            else:
                elbo, _, _ = model.run_model(tgts,
                                             num_samples=config.num_samples)
            test_elbo.update_state(elbo)

        # Logging phase
        if is_clustering:
            train_predictions_np = np.array(train_predictions)
            train_labels_np = np.array(train_labels)
            train_acc = utils.cluster_acc(train_predictions_np,
                                          train_labels_np[:, 0])
            test_predictions_np = np.array(test_predictions)
            test_labels_np = np.array(test_labels)
            test_acc = utils.cluster_acc(test_predictions_np,
                                         test_labels_np[:, 0])

            template = "Epoch {:d}, ELBO: {:.2f}, Test ELBO: {:.2f}, Acc: {:.2f}, Test Acc: {:.2f}"
            aggreg_results = [
                train_elbo.result(),
                test_elbo.result(), train_acc * 100, test_acc * 100
            ]
            print(
                template.format(int(ckpt.epoch), aggreg_results[0],
                                aggreg_results[1], aggreg_results[2],
                                aggreg_results[3]))
        else:
            template = "Epoch {:d}, ELBO: {:.2f}, Test ELBO: {:.2f}"
            aggreg_results = [train_elbo.result(), test_elbo.result()]
            print(
                template.format(int(ckpt.epoch), aggreg_results[0],
                                aggreg_results[1]))

        # Save aggregate summaries for logging stage
        with tf.name_scope('aggregates'):
            tf.summary.scalar('train_elbo', aggreg_results[0], step=ckpt.epoch)
            tf.summary.scalar(config.split + '_elbo',
                              aggreg_results[1],
                              step=ckpt.epoch)
            if is_clustering:
                tf.summary.scalar('train_acc',
                                  aggreg_results[2],
                                  step=ckpt.epoch)
                tf.summary.scalar(config.split + '_acc',
                                  aggreg_results[3],
                                  step=ckpt.epoch)

        # Checkpoint phase
        is_final_epoch = ((i + 1) == config.num_epochs)
        is_save_epoch = (i % config.save_every == 0)
        if is_save_epoch or is_final_epoch:
            save_path = manager.save()
            print("Saving checkpoint for step {}: {}".format(
                int(ckpt.step), save_path))

        # Increment epoch
        ckpt.epoch.assign_add(1)

        # Force flush the summary writer during training
        summary_writer.flush()
def train_idec():
    """Train an IDEC model on the module-level ``dataset``.

    Pretrains the autoencoder, initialises the cluster centres with K-Means on
    the embedded data, then alternates between refreshing the target
    distribution (every ``args.update_interval`` epochs) and optimising the
    reconstruction loss plus ``args.gamma`` times the KL clustering loss.
    Training stops early once the fraction of changed labels drops below
    ``args.tol``.
    """
    model = IDEC(
        n_enc_1=500,
        n_enc_2=500,
        n_enc_3=1000,
        n_dec_1=1000,
        n_dec_2=500,
        n_dec_3=500,
        n_input=args.n_input,
        n_z=args.n_z,
        n_clusters=args.n_clusters,
        alpha=1.0,
        pretrain_path=args.pretrain_path,
    ).to(device)
    model.pretrain()

    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(model.parameters(), lr=args.lr)

    # Initialise the cluster centres from K-Means on the embedded data.
    y = dataset.y
    data = torch.Tensor(dataset.x).to(device)
    x_bar, hidden = model.ae(data)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(hidden.data.cpu().numpy())
    print("nmi score={:.4f}".format(nmi_score(y_pred, y)))

    # Drop the references to the full-dataset tensors.
    hidden = None
    x_bar = None

    y_pred_last = y_pred
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).to(device)

    model.train()
    for epoch in range(100):
        if epoch % args.update_interval == 0:
            # Update the target distribution p from the current assignments.
            _, full_q = model(data)
            full_q = full_q.data
            p = target_distribution(full_q)

            # Evaluate clustering performance.
            y_pred = full_q.cpu().numpy().argmax(1)
            n_changed = np.sum(y_pred != y_pred_last).astype(np.float32)
            delta_label = n_changed / y_pred.shape[0]
            y_pred_last = y_pred

            acc = cluster_acc(y, y_pred)
            nmi = nmi_score(y, y_pred)
            ari = ari_score(y, y_pred)
            print('Iter {}'.format(epoch), ':Acc {:.4f}'.format(acc),
                  ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari))

            if epoch > 0 and delta_label < args.tol:
                print('delta_label {:.4f}'.format(delta_label), '< tol',
                      args.tol)
                print('Reached tolerance threshold. Stopping training.')
                break

        for x, _, idx in train_loader:
            x = x.to(device)
            idx = idx.to(device)

            x_bar, q = model(x)
            reconstr_loss = F.mse_loss(x_bar, x)
            kl_loss = F.kl_div(q.log(), p[idx])
            loss = args.gamma * kl_loss + reconstr_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L']) logwriter.writeheader() loss = 0 idx = 0 t0 = time() for ite in range(int(args.maxiter)): if ite % args.update_interval == 0: q = model.predict_generator(AE_generator, verbose=1) p = target_distribution( q) # update the auxiliary target distribution p print(p.shape) # evaluate the clustering performance y_pred = q.argmax(1) if y_true is not None: acc = np.round(cluster_acc(y_true, y_pred), 5) nmi = np.round( metrics.normalized_mutual_info_score( y_true, y_pred), 5) ari = np.round( metrics.adjusted_rand_score(y_true, y_pred), 5) loss = np.round(loss, 5) logwriter.writerow( dict(iter=ite, acc=acc, nmi=nmi, ari=ari, L=loss)) print( 'Iter-%d: ACC= %.4f, NMI= %.4f, ARI= %.4f; L= %.5f' % (ite, acc, nmi, ari, loss)) # check stop criterion # When delta_label ==0 is because y_pred = y_pred_last. (It was no improvement) delta_label = np.sum(y_pred != y_pred_last).astype(
def fit(self, trainloader, validloader, path, lr=0.001, num_epochs=10,
        anneal=False, tol=0.0005):
    """Train the model and stop once cluster assignments stabilise.

    Each epoch runs one optimisation pass over ``trainloader``, then predicts
    cluster assignments on ``validloader``, reports accuracy and saves the
    model to ``path``. Training stops early after the fraction of changed
    validation labels has stayed below ``tol`` for five consecutive epochs.

    Args:
        trainloader: loader yielding ``(inputs, _, _)`` triples.
        validloader: loader yielding ``(inputs, labels, _)`` triples.
        path: destination handed to ``self.save_model``.
        lr: base learning rate for Adam.
        num_epochs: maximum number of epochs.
        anneal: when true, the rate is adjusted every epoch through
            ``adjust_learning_rate``.
        tol: label-change threshold used by the convergence counter.
    """
    labels_changed = 1
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        self.cuda()
    print("=====Fitting the model......Patience=======")
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  self.parameters()),
                           lr=lr)
    train_error = []
    for epoch in range(num_epochs):
        # Train for one epoch.
        self.train()
        # Without annealing the rate stays at ``lr``; it must still be bound
        # because it is reported in the epoch summary below.
        epoch_lr = adjust_learning_rate(lr, optimizer, epoch) if anneal else lr
        train_loss = 0
        for inputs, _, _ in tqdm(trainloader):
            if use_cuda:
                inputs = inputs.cuda()
            optimizer.zero_grad()
            inputs = Variable(inputs)
            z, outputs, mu, logvar = self.forward(inputs)
            loss = self.loss_function(outputs, inputs, z, mu, logvar)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Predict cluster assignments on the validation set.
        self.eval()
        Y = []
        Y_pred = []
        with torch.no_grad():  # no gradients are needed for evaluation
            for inputs, labels, _ in validloader:
                if use_cuda:
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                z, outputs, mu, logvar = self.forward(inputs)
                q_c_x = self.compute_gamma(z).data.cpu().numpy()
                Y.append(labels.numpy())
                Y_pred.append(np.argmax(q_c_x, axis=1))
        Y = np.concatenate(Y)
        Y_pred = np.concatenate(Y_pred)

        # The first epoch has no previous prediction to compare against.
        if epoch != 0:
            labels_changed = np.sum(Y_pred != self.Y_pred_last).astype(
                np.float32) / Y_pred.shape[0]
        self.Y_pred_last = Y_pred
        if labels_changed < tol:
            self.convergence_iter += 1
        else:
            self.convergence_iter = 0

        acc = cluster_acc(Y_pred, Y)
        print("#Epoch %3d: lr: %.5f, Train Loss: %.5f, acc: %.5f" %
              (epoch, epoch_lr, train_loss / len(trainloader.dataset), acc[0]))
        train_error.append(train_loss / len(trainloader.dataset))
        self.save_model(path)

        if self.convergence_iter >= 5:
            print('percentage of labels changed {:.4f}'.format(labels_changed),
                  '< tol', tol)
            print('Reached Convergence threshold. Stopping training.')
            break
np.where(specs.test_indices[i] == indices)[0][0] for i in range(len(specs.test_indices)) ] # Select only the labels which are to be used in the evaluation (disjoint for validation and test) validation_target = np.asarray([target[i] for i in validation_indices]) test_target = np.asarray([target[i] for i in test_indices]) # Split the cluster assignments for validation and test sets validation_cluster_assign = np.asarray( [kmeans_model.labels_[i] for i in validation_indices]) test_cluster_assign = np.asarray( [kmeans_model.labels_[i] for i in test_indices]) # Evaluate the clustering validation performance using the ground-truth labels validation_acc = cluster_acc(validation_target, validation_cluster_assign) print("Validation ACC", validation_acc) validation_ari = adjusted_rand_score(validation_target, validation_cluster_assign) print("Validation ARI", validation_ari) validation_nmi = normalized_mutual_info_score( validation_target, validation_cluster_assign) print("Validation NMI", validation_nmi) # Evaluate the clustering test performance using the ground-truth labels test_acc = cluster_acc(test_target, test_cluster_assign) print("Test ACC", test_acc) test_ari = adjusted_rand_score(test_target, test_cluster_assign) print("Test ARI", test_ari) test_nmi = normalized_mutual_info_score(test_target, test_cluster_assign)
# Flatten every training sample into a single feature vector.
X_train = X_train.reshape((X_train.shape[0], -1))

# Build the autoencoder-backed Deep K-means model and a plain K-means
# baseline with the same number of clusters and seed.
ae = AutoEncoder(AE_NET, EMBEDDING_SIZE, SEED)
dkmeans = DeepKMeans(ae, K, seed=SEED)
kmeans = KMeans(n_clusters=K, init="k-means++", random_state=SEED)

# Make a TensorFlow summary writer the default one, writing under logs/.
logdir = "logs/"
file_writer = tf.summary.create_file_writer(logdir, flush_millis=10000)
file_writer.set_as_default()

# Fit both models on the flattened training data.
# NOTE(review): LEARNING_RATE is passed twice -- presumably once for the
# pretraining phase and once for finetuning; confirm against DeepKMeans.fit.
dkmeans.fit(X_train, BATCH_SIZE, PRETRAIN_EPOCHS, FINETUNE_EPOCH, UPDATE_EPOCH, LEARNING_RATE, LEARNING_RATE, seed=SEED, verbose=True)
kmeans.fit(X_train)

# Cluster assignments from both models; the second value returned by the
# Deep K-means call is not used here.
cls_dkm, _ = dkmeans(X_train)
cls_km = kmeans.predict(X_train)

# Report accuracy and NMI against y_train for each model.
print("K-means")
print(" ACC: ", cluster_acc(y_train, cls_km))
print(" NMI: ", normalized_mutual_info_score(y_train, cls_km))
print("Deep K-means")
print(" ACC: ", cluster_acc(y_train, cls_dkm.numpy()))
print(" NMI: ", normalized_mutual_info_score(y_train, cls_dkm.numpy()))
def _evaluate(pred_batches, labels):
    """Score batched cluster predictions against the leading labels.

    Args:
        pred_batches: sequence of per-batch prediction arrays; they are
            stacked horizontally into one flat prediction vector.
        labels: ground-truth labels; only the first ``preds.size`` entries
            are compared.

    Returns:
        ``(acc, ami)``: the first value returned by ``cluster_acc`` and the
        adjusted mutual information between labels and predictions.
    """
    predicted = np.hstack(pred_batches)
    # Predictions may cover fewer samples than ``labels``; align the lengths.
    reference = labels[:predicted.size]
    accuracy, _ = cluster_acc(predicted, reference)
    ami = adjusted_mutual_info_score(reference, labels_pred=predicted)
    return accuracy, ami
def fit(self, feat, labels=None):
    """Jointly train the network and the cluster centres.

    Clusters are initialised on the network's hidden features. Every epoch
    optimises ``recons_lam * reconstruction + cluster_lam * clustering`` loss
    per mini-batch, updating assignments and centres after each step through
    ``self.batch_km``; the data is then re-clustered from the current centres.
    Training stops when ``self.whether_convergence`` reports convergence
    between the previous and the new assignments. The final centres are stored
    in ``self.centenrs`` (attribute name kept as-is for existing callers).

    Args:
        feat: 2-D numpy feature array; it is cast to float32.
        labels: optional ground-truth labels used only for accuracy reports.
    """
    import copy

    feat = feat.astype(np.float32)
    batch_size = self.batch_size
    data_size = feat.shape[0]
    count = {i: 0 for i in range(self.n_clusters)}

    hidden_feat = self.get_hidden_features(feat,
                                           self.net,
                                           self.hidden_dim,
                                           batch_size=self.batch_size,
                                           use_cuda=self.use_cuda)
    idx, centers = self.init_cluster(hidden_feat, n_clusters=self.n_clusters)
    # Snapshot the assignments with a real copy: ``idx[:]`` is only a view
    # when ``idx`` is an ndarray, so the in-place batch updates below would
    # silently rewrite the convergence baseline.
    last_pred = copy.copy(idx)
    if labels is not None:
        acc = cluster_acc(labels, idx)
        print('KMeans pretraining acc is {}'.format(acc))

    optimizer = optim.SGD(self.net.parameters(), lr=self.lr, momentum=0.9)

    for epoch in range(self.max_epochs):
        for index in range(0, data_size, batch_size):
            feat_batch = Variable(
                torch.from_numpy(feat[index:index + batch_size]))
            idx_batch = idx[index:index + batch_size]
            # Each sample is pulled toward the centre it is assigned to.
            centers_batch = Variable(torch.from_numpy(centers[idx_batch]))
            if self.use_cuda:
                feat_batch = feat_batch.cuda()
                centers_batch = centers_batch.cuda()

            optimizer.zero_grad()
            hidden_batch, output_batch = self.net(feat_batch)
            recons_loss = F.mse_loss(output_batch, feat_batch)
            cluster_loss = F.mse_loss(hidden_batch, centers_batch)
            loss = self.recons_lam * recons_loss + self.cluster_lam * cluster_loss
            loss.backward()
            optimizer.step()

            # Re-embed the batch with the updated weights and refresh its
            # assignments and the centres.
            hidden_batch2, _ = self.net(feat_batch)
            hidden_batch2 = hidden_batch2.cpu().data.numpy()
            tmp_idx_batch, centers, count = self.batch_km(
                hidden_batch2, centers, count)
            idx[index:index + batch_size] = tmp_idx_batch

        # Re-cluster the whole data set starting from the current centres.
        hidden_feat = self.get_hidden_features(feat,
                                               self.net,
                                               self.hidden_dim,
                                               batch_size=self.batch_size,
                                               use_cuda=self.use_cuda)
        idx, centers = self.init_cluster(hidden_feat,
                                         n_clusters=self.n_clusters,
                                         init_centers=centers)

        acc = None
        if labels is not None:
            acc = cluster_acc(labels, idx)
        if self.verbose:
            print('Epoch {} end, current acc is {}'.format(epoch + 1, acc))

        if self.whether_convergence(last_pred, idx, self.tol):
            print('End Iter')
            break
        last_pred = copy.copy(idx)

    self.centenrs = centers
def fit(self, feat, seeds_dict, labels=None):
    """Seeded variant of the joint reconstruction + clustering training.

    Cluster centers are initialised from the seed samples, seeded samples
    are forced to their seed label, and each mini-batch step minimises
    ``recons_lam * mse(output, input) + cluster_lam * mse(hidden, center)``
    before ``self.batch_km_seed`` refreshes the batch assignments and the
    centers. After every epoch the data is re-clustered with
    ``self.init_cluster`` and training stops once
    ``self.whether_convergence`` reports convergence.

    Args:
        feat: Feature array; cast to ``float32`` and sliced along its first
            axis into mini-batches.
        seeds_dict: Seed description handed to ``self.get_mask``,
            ``self.get_seed_labels`` and ``self.get_seed_centers``; it must
            not have more entries than ``self.n_clusters``.
        labels: Optional ground-truth labels. When given, clustering
            accuracy is computed with ``cluster_acc`` and reported.

    Returns:
        None. The final centers are stored on ``self.centenrs``.
    """
    import copy

    assert len(seeds_dict) <= self.n_clusters
    feat = feat.astype(np.float32)
    batch_size = self.batch_size
    data_size = feat.shape[0]
    count = {i: 0 for i in range(self.n_clusters)}

    seed_masks = self.get_mask(seeds_dict, data_size)
    seed_labels = self.get_seed_labels(seeds_dict, data_size)

    hidden_feat = self.get_hidden_features(feat,
                                           self.net,
                                           self.hidden_dim,
                                           batch_size=self.batch_size,
                                           use_cuda=self.use_cuda)
    # The original passed a bare ``n_clusters`` here, which is not defined
    # in this scope; the configured cluster count lives on the instance.
    seed_centers = self.get_seed_centers(self.n_clusters, seeds_dict,
                                         hidden_feat)
    idx, centers = self.init_cluster(hidden_feat,
                                     n_clusters=self.n_clusters,
                                     init_centers=seed_centers)
    # A real copy is required: ``idx[:]`` is only a view when ``idx`` is a
    # NumPy array, so the in-place edits below would silently rewrite the
    # "previous" assignment used by the convergence check.
    last_pred = copy.copy(idx)
    if labels is not None:
        acc = cluster_acc(labels, idx)
        print('KMeans pretraining acc is {}'.format(acc))

    # Seeded samples always take their seed label.
    for i in range(data_size):
        if seed_masks[i] == 1:
            idx[i] = seed_labels[i]

    if labels is not None:
        idx = np.array(idx)
        acc = cluster_acc(labels, idx)
        print('KMeans pretraining acc is {}'.format(acc))

    optimizer = optim.SGD(self.net.parameters(), lr=self.lr, momentum=0.9)

    for epoch in range(self.max_epochs):
        for index in range(0, data_size, batch_size):
            batch = slice(index, index + batch_size)
            feat_batch = Variable(torch.from_numpy(feat[batch]))
            idx_batch = idx[batch]
            mask_batch = np.asarray(seed_masks[batch])
            seeds_labels_batch = seed_labels[batch]
            centers_batch = Variable(torch.from_numpy(centers[idx_batch]))
            if self.use_cuda:
                feat_batch = feat_batch.cuda()
                centers_batch = centers_batch.cuda()

            optimizer.zero_grad()
            hidden_batch, output_batch = self.net(feat_batch)
            recons_loss = F.mse_loss(output_batch, feat_batch)
            cluster_loss = F.mse_loss(hidden_batch, centers_batch)
            # NOTE: a seed-distance term used to be computed here but was
            # never added to the objective, so it has been dropped.
            loss = (self.recons_lam * recons_loss
                    + self.cluster_lam * cluster_loss)
            loss.backward()
            optimizer.step()

            # Re-embed the batch with the updated weights before the
            # seeded mini-batch k-means step.
            hidden_batch2, _ = self.net(feat_batch)
            hidden_batch2 = hidden_batch2.cpu().data.numpy()
            tmp_idx_batch, centers, count = self.batch_km_seed(
                hidden_batch2, centers, count, mask_batch,
                seeds_labels_batch)
            idx[batch] = tmp_idx_batch

        # End of epoch: re-cluster everything, warm-started from the
        # centers produced by the batch updates.
        hidden_feat = self.get_hidden_features(feat,
                                               self.net,
                                               self.hidden_dim,
                                               batch_size=self.batch_size,
                                               use_cuda=self.use_cuda)
        idx, centers = self.init_cluster(hidden_feat,
                                         n_clusters=self.n_clusters,
                                         init_centers=centers)

        acc = None
        if labels is not None:
            acc = cluster_acc(labels, idx)
        if self.verbose:
            print('Epoch {} end, current acc is {}'.format(epoch + 1, acc))

        if self.whether_convergence(last_pred, idx, self.tol):
            print('End Iter')
            break
        last_pred = copy.copy(idx)

    # NOTE(review): attribute name is misspelled ("centenrs"); kept as-is
    # because other code may read it under this name.
    self.centenrs = centers
def update_cluster_acc(self):
    """Refresh ``self.current_cluster_acc`` from the current predictions.

    Scores ``self.current_pred_labels`` against the training labels held by
    ``self.corpus_loader`` using ``cluster_acc`` and caches the result.
    """
    reference_labels = np.array(self.corpus_loader.train_labels)
    predicted_labels = self.current_pred_labels
    self.current_cluster_acc = cluster_acc(reference_labels,
                                           predicted_labels)
def cluster(self,
            args,
            x_data,
            y_data=None,
            test="train",
            tol=0.01,
            iter_max=1e6,
            **kwargs):
    """Train ``self.dec_model`` on ``x_data`` or evaluate it in test mode.

    Weights are first restored from
    ``<args.save_weight_path>/dec_weights_<args.dataset>.h5`` if that file
    exists. In training mode the soft assignments ``self.q`` and the target
    distribution ``self.p`` are recomputed once per pass over the data, and
    the model is trained batch by batch against ``self.p`` until the
    fraction of samples whose label changed drops below ``tol`` or
    ``iter_max`` iterations have run.

    Args:
        args: Object providing ``save_weight_path`` and ``dataset``.
        x_data: Input array; batches are sliced along its first axis.
        y_data: Optional ground-truth labels. When given, accuracy is
            computed with ``utils.cluster_acc`` and a checkpoint is saved
            whenever it improves.
        test: ``"test"`` evaluates only; any other value trains.
        tol: Stop once the changed-label fraction is below this value.
        iter_max: Upper bound on the number of training iterations.
        **kwargs: Accepted for call compatibility; unused.

    Returns:
        ``self.y_pred`` when the iteration limit is hit; ``None`` in test
        mode and when the tolerance threshold stops training.
    """
    save_path = os.path.join(args.save_weight_path,
                             "dec_weights_{}.h5".format(args.dataset))
    if os.path.isfile(save_path):
        self.dec_model.load_weights(save_path)
        print('Restored Model weight')

    if test == "test":
        y_pred = self.dec_model.predict(x_data, verbose=0).argmax(1)
        acc = utils.cluster_acc(y_data, y_pred)
        print('Accuracy ' + str(np.round(acc, 5)))
        return

    n_samples = x_data.shape[0]
    # Integer number of batches per pass. True division produced a float,
    # which made ``iteration % update_interval == 0`` almost never fire;
    # the clamp avoids a zero modulus when the data is smaller than a batch.
    update_interval = max(1, n_samples // self.batch_size)
    print('Update interval', update_interval)

    iteration, index = 0, 0
    current_acc = 0
    self.accuracy = 0

    while True:
        sys.stdout.write('\r')
        # cut off iteration
        if iter_max < iteration:
            print('Reached maximum iteration limit. Stopping training.')
            return self.y_pred

        # Refresh the probability distributions once per pass and, when
        # accuracy improved, propagate weights from the DEC model to the
        # encoder.
        if iteration % update_interval == 0:
            self.q = self.dec_model.predict(x_data, verbose=0)
            self.p = self.p_mat(self.q)

            y_pred = self.q.argmax(1)
            # Fraction of samples whose label CHANGED since the last
            # refresh (the original counted the unchanged ones, which
            # inverted the convergence test below).
            delta_label = (np.sum(y_pred != self.y_pred).astype(np.float32)
                           / y_pred.shape[0])
            if y_data is not None:
                current_acc = utils.cluster_acc(y_data, y_pred)
                print('Iteration ' + str(iteration) + ', Accuracy ' +
                      str(np.round(current_acc, 5)))
            else:
                print(
                    str(np.round(delta_label * 100, 5)) +
                    '% change in label assignment')

            # Never stop at iteration 0: no training has happened yet, so
            # a first prediction that equals the initial assignment does
            # not indicate convergence.
            if iteration > 0 and delta_label < tol:
                print('Reached tolerance threshold.')
                return
            self.y_pred = y_pred

            # Sync encoder weights and checkpoint only on improvement.
            if self.accuracy < current_acc:
                # NOTE(review): presumably layers[0] of the DEC model is
                # the nested encoder -- confirm against model construction.
                source_layers = self.dec_model.layers[0].layers
                for i, layer in enumerate(self.encoder.layers):
                    layer.set_weights(source_layers[i].get_weights())
                self.cluster_centroid = self.dec_model.layers[
                    -1].get_weights()[0]

                # save checkpoint
                self.dec_model.save(save_path)
                self.accuracy = current_acc
                print("update weight and save checkpoint")

        # train on batch
        sys.stdout.write('Iteration %d, ' % iteration)
        start = index * self.batch_size
        stop = start + self.batch_size
        if stop >= n_samples:
            # Last (possibly short) batch of the pass. ``>=`` rather than
            # ``>`` so an exact multiple of the batch size does not lead
            # to an extra step on an empty slice.
            loss = self.dec_model.train_on_batch(x_data[start:],
                                                 self.p[start:])
            index = 0
        else:
            loss = self.dec_model.train_on_batch(x_data[start:stop],
                                                 self.p[start:stop])
            index += 1
        sys.stdout.write('Loss %f' % loss)

        iteration += 1
        sys.stdout.flush()
text_idec_model = Text_IDEC(root_dir=root_dir + '/tfidf_i', update_interval=10, n_clusters=n_clusters, use_tensorboard=True, use_vat=False, id=4, semi_supervised=False, split_sents=True, use_ae=use_ae, fd_hidden_dim=cfg.HIDDEN_DIMS[-1]) text_idec_model.clustering() print('Total acc is {}'.format( text_idec_model.current_cluster_acc)) pred = np.array(text_idec_model.current_pred_labels) labels = np.array(text_idec_model.corpus_loader.train_labels) acc = cluster_acc(labels, pred) nmi = normalized_mutual_info_score(labels, pred) ari = adjusted_mutual_info_score(labels, pred) all_pred.append(pred.tolist()) all_acc.append(acc) all_nmi.append(nmi) all_ari.append(ari) if acc > best_acc: best_pred = pred best_acc = acc print('{} best acc is {}'.format(feat_name, best_acc)) pred_std = np.std(all_acc) pred_mean = np.mean(all_acc) dump_mongo(corpora=corpora_name, feat_name=feat_name, n_topics=n_clusters,