def compute_acc_and_nmi_conflicted_data(self, x, y, centers_emb, beta1, beta2): features = self.predict_encoder(x) unconf_indices, conf_indices = self.generate_unconflicted_data_index(x, centers_emb, beta1, beta2) if unconf_indices.size == 0: print(' '*8 + "Empty list of unconflicted data") acc_unconf = 0 nmi_unconf = 0 else: x_emb_unconf = self.predict_encoder(x[unconf_indices]) y_unconf = y[unconf_indices] y_pred_unconf = q_mat(x_emb_unconf, centers_emb, alpha=1.0).argmax(axis=1) acc_unconf = metrics.acc(y_unconf, y_pred_unconf) nmi_unconf = metrics.nmi(y_unconf, y_pred_unconf) print(' '*8 + '|==> acc unconflicted data: %.4f, nmi unconflicted data: %.4f <==|'% (acc_unconf, nmi_unconf)) if conf_indices.size == 0: print(' '*8 + "Empty list of conflicted data") acc_conf = 0 nmi_conf = 0 else: x_emb_conf = self.predict_encoder(x[conf_indices]) y_conf = y[conf_indices] y_pred_conf = q_mat(x_emb_conf, centers_emb, alpha=1.0).argmax(axis=1) acc_conf = metrics.acc(y_conf, y_pred_conf) nmi_conf = metrics.nmi(y_conf, y_pred_conf) print(' '*8 + '|==> acc conflicted data: %.4f, nmi conflicted data: %.4f <==|'% (acc_conf, nmi_conf)) return acc_unconf, nmi_unconf, acc_conf, nmi_conf
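# Note: q_mat is called above but not defined in these snippets. A minimal NumPy sketch of a
# soft-assignment matrix consistent with how it is used (Student's t-kernel as in DEC-style
# clustering); this is an assumption, the original implementation may differ.
import numpy as np

def q_mat(z, centers, alpha=1.0):
    """Return an (n_samples, n_clusters) soft-assignment matrix:
    q_ij proportional to (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha + 1) / 2)."""
    dist_sq = np.sum((z[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    q = (1.0 + dist_sq / alpha) ** (-(alpha + 1.0) / 2.0)
    return q / q.sum(axis=1, keepdims=True)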
def metric(self, y, y_pred): acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) print('acc:', acc) print('nmi:', nmi) print('ari:', ari)
def match(y,cl): cl=np.array(cl) y=np.array(y) acc = np.round(metrics.acc(y, cl), 5) nmi = np.round(metrics.nmi(y, cl), 5) ari = np.round(metrics.ari(y, cl), 5) return acc,nmi,ari
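# Note: the metrics module used throughout these snippets (metrics.acc / metrics.nmi /
# metrics.ari) is not shown. A minimal self-contained sketch that matches how it is called
# here, assuming scikit-learn and SciPy are available; the original module may differ.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_rand_score as ari

def acc(y_true, y_pred):
    """Unsupervised clustering accuracy: best one-to-one mapping between predicted
    clusters and true labels, found with the Hungarian algorithm."""
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() / y_pred.size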
def epochBegin(self, epoch): if epoch % self.decay_n == 0 and epoch != 0: self.lr_decay() gamma = self.gamma_output.predict(self.inputs, batch_size=batch_size) pred = np.argmax(gamma, axis=1) acc = self.cluster_acc(pred, self.Y) Y = np.reshape(self.Y, [self.Y.shape[0]]) nmi = metrics.nmi(Y, pred) ari = metrics.ari(Y, pred) purity = self.purity_score(Y, pred) global accuracy accuracy = [] accuracy += [acc[0]] if epoch > 0: print('ACC:%0.8f' % acc[0]) print('NMI:', nmi) print('ARI:', ari) print('Purity', purity) if epoch == 1 and dataset == 'har' and acc[0] < 0.77: print( '=========== HAR dataset:bad init!Please run again! ============' ) sys.exit(0)
def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, update_interval=140, save_dir='./results/temp', rand_seed=None): print('Update interval', update_interval) save_interval = int(x.shape[0] / batch_size) * 5 # 5 epochs print('Save interval', save_interval) # Step 1: initialize cluster centers using k-means print('Initializing cluster centers with k-means.') kmeans = KMeans(n_clusters=self.n_clusters, n_init=100) y_pred = kmeans.fit_predict(self.encoder.predict(x)) y_pred_last = np.copy(y_pred) self.model.get_layer(name='clustering').set_weights( [kmeans.cluster_centers_]) loss = 0 index = 0 index_array = np.arange(x.shape[0]) for ite in range(int(maxiter)): if ite % update_interval == 0: q = self.model.predict(x, verbose=0) p = self.target_distribution(q) y_pred = q.argmax(1) if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) loss = np.round(loss, 5) print('Iter %d: acc = %.5f, nmi = %.5f' % (ite, acc, nmi), ' ; loss=', loss) # check stop criterion delta_label = np.sum(y_pred != y_pred_last).astype( np.float32) / y_pred.shape[0] y_pred_last = np.copy(y_pred) if ite > 0 and delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') break idx = index_array[index * batch_size:min((index + 1) * batch_size, x.shape[0])] loss = self.model.train_on_batch(x=x[idx], y=p[idx]) index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0 ite += 1 # save the trained model print('saving model to:', save_dir + 'STC_model_final.h5') self.model.save_weights(save_dir + 'STC_model_final.h5') return y_pred
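# Note: several fit() routines above call self.target_distribution(q) without showing it.
# A minimal sketch of the usual DEC-style auxiliary target distribution (square q and
# renormalize by cluster frequency); assumed, not taken from the original code base.
def target_distribution(q):
    """Compute the auxiliary target distribution p from soft assignments q."""
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T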
def train(args): # get data and model (x, y), model = _get_data_and_model(args) # split train validation data if y is None: x_train, x_val = train_test_split(x, test_size=0.1) y_val = None y_train = None else: x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, test_size=0.1) model.model.summary() # pretraining t0 = time() if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) if args.pretrained_weights is not None and os.path.exists( args.pretrained_weights): # load pretrained weights model.autoencoder.load_weights(args.pretrained_weights) else: # train pretrain_optimizer = SGD(1.0, 0.9) if args.method in [ 'FcDEC', 'FcIDEC', 'FcDEC-DA', 'FcIDEC-DA' ] else 'adam' model.pretrain(x_train, y_train, x_val, y_val, optimizer=pretrain_optimizer, epochs=args.pretrain_epochs, batch_size=args.batch_size, save_dir=args.save_dir, verbose=args.verbose, aug_pretrain=args.aug_pretrain) t1 = time() print("Time for pretraining: %ds" % (t1 - t0)) # clustering y_pred = model.fit(x, y, maxiter=args.maxiter, batch_size=args.batch_size, update_interval=args.update_interval, save_dir=args.save_dir, aug_cluster=args.aug_cluster) if y is not None: print('Final: acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc( y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred))) t2 = time() print("Time for pretraining, clustering and total: (%ds, %ds, %ds)" % (t1 - t0, t2 - t1, t2 - t0)) print('=' * 60)
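# Illustrative only: train(args) reads the attributes listed below. The values here are
# hypothetical defaults; the real script presumably builds args with argparse, and
# _get_data_and_model(args) may read further attributes (e.g. a dataset name) not visible here.
from argparse import Namespace

example_args = Namespace(
    save_dir='./results/temp',      # where weights and logs are written
    pretrained_weights=None,        # path to an existing autoencoder .h5, or None to pretrain
    method='FcDEC',                 # selects the SGD pretraining optimizer branch above
    pretrain_epochs=300,
    batch_size=256,
    verbose=1,
    aug_pretrain=False,
    maxiter=20000,
    update_interval=140,
    aug_cluster=False,
)
# train(example_args)  # uncomment once the data loader and model classes are in scope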
def on_epoch_end(self, epoch, logs=None): if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0: return feature_model = tf.keras.models.Model(self.model.input, self.model.get_layer('encoder_3').output) features = feature_model.predict(self.x) km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20, n_jobs=4) y_pred = km.fit_predict(features) # print() print(' ' * 8 + '|==> acc: %.4f, nmi: %.4f <==|' % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
def test(args): assert args.weights is not None (x, y), model = _get_data_and_model(args) model.model.summary() print('Begin testing:', '-' * 60) model.load_weights(args.weights) y_pred = model.predict_labels(x) print('acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc( y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred))) print('End testing:', '-' * 60)
def kmeans_(): # use features for clustering from sklearn.cluster import KMeans km = KMeans(n_clusters=N, init='k-means++') #features = np.reshape(x_train, newshape=(features.shape[0], -1)) km_trans = km.fit_transform(x_train) pred = km.predict(x_train) print(pred.shape) print('acc=', met.acc(y_train, pred), 'nmi=', met.nmi(y_train, pred), 'ari=', met.ari(y_train, pred)) return km_trans, pred
def fit(self, x, y=None, save_dir='./results/temp'): # print('Begin training:', '-' * 60) t1 = time() print( '******************** Use Denpeak to Cluster ************************' ) features = self.encoder.predict(x) print("features shape:", features.shape) features = TSNE(n_components=2).fit_transform(features) # np.savetxt("features.txt", features) print("features shape:", features.shape) y_pred, y_border, center_num, dc_percent, dc = DenPeakCluster(features) print('saving picture to:', save_dir + '/2D.png') plt.cla() plt.scatter(features[:, 0], features[:, 1], c=y_pred, s=0.5, alpha=0.5) plt.savefig(save_dir + '/2D.png') np.savetxt(save_dir + '/dc_coeff.txt', [dc_percent, dc]) # logging file import csv, os if not os.path.exists(save_dir): os.makedirs(save_dir) logfile = open(save_dir + '/log.csv', 'w') logwriter = csv.DictWriter( logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss', 'center_num']) logwriter.writeheader() acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) # if acc>=0.95: np.savetxt(save_dir + '/features.txt', features) np.savetxt(save_dir + '/labels.txt', y_pred) np.savetxt(save_dir + '/border.txt', y_border) from Draw_border import draw draw(save_dir) logdict = dict(iter=0, acc=acc, nmi=nmi, ari=ari, center_num=center_num) logwriter.writerow(logdict) logfile.flush() print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; center_num=%d' % (0, acc, nmi, ari, center_num)) logfile.close() return y_pred
def on_epoch_end(self, epoch, logs=None): if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0: return feature_model = Model( self.model.input, self.model.get_layer( 'encoder_%d' % (int(len(self.model.layers) / 2) - 1)).output) features = feature_model.predict(self.x) km = KMeans(n_clusters=nclusters, n_init=20) y_pred = km.fit_predict(features) if self.y is not None: acc, nmi = metrics.acc(self.y, y_pred), metrics.nmi( self.y, y_pred) print(' ' * 8 + '|==> acc: %.4f, nmi: %.4f <==|' % (acc, nmi)) else: if self.lastpred is not None: nmi = metrics.nmi(self.lastpred, y_pred) print(' ' * 8 + '|==> nmi: %.4f <==|' % (nmi)) self.lastpred = y_pred
def train_feature(net1, train_data): map_dict = read_pkl() if torch.cuda.is_available(): net1 = torch.nn.DataParallel(net1, device_ids=[0]) net1 = net1.cuda() prev_time = datetime.now() for i_dir in range(classnum): if not os.path.isdir('./data/' + str(i_dir)): os.makedirs('./data/' + str(i_dir)) label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10) # label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 20).reshape(20, 20) label2 = [] idx2 = [] for im, label in tqdm(train_data, desc="Processing train data: "): im = im.cuda() feat = net1(im) for i in range(feat.size(0)): distance_list = list() for ui_50D_label in map_dict.values(): distance = sum(sum((ui_50D_label.float().cuda() - feat[i])**2)) distance_list.append(distance.item()) idx = distance_list.index(min(distance_list)) save_image( inver_transform2(im[i]), './data/' + str(idx) + '/' + str(random.randint(1, 10000000)) + '.png') label_np[idx][label[i].item()] += 1 label2.append(idx) label1 = label.numpy() # for _,i in enumerate(label): # idx2.append(i) for i in label1: idx2.append(i) t2 = np.array(idx2) t1 = np.array(label2) # print(t2.shape) # t2 = t2.reshape([t1.size,-1]).squeeze(0) print('acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2))) corr_num = 0 for item in label_np: corr_num += item.max() corr = corr_num / label_np.sum() print(corr) np.save('./model/MNIST/feature/' + str(feat.size(1)) + '_' + '.npy', label_np)
def gmm_kmeans_cluster(self, dataloader): use_cuda = torch.cuda.is_available() if use_cuda: self.cuda() self.eval() data = [] Y = [] for batch_idx, (inputs, y) in enumerate(dataloader): inputs = inputs.view(inputs.size(0), -1).float() if use_cuda: inputs = inputs.cuda() inputs = Variable(inputs) _, _, _, mu, _ = self.forward(inputs) data.append(mu.data.cpu().numpy()) Y.append(y.numpy()) data = np.concatenate(data) Y = np.concatenate(Y) gmm = GaussianMixture(n_components=self.n_centroids, covariance_type='full') gmm.fit(data) y_pred_gmm = gmm.predict(data) acc = np.round(metrics.acc(Y, y_pred_gmm), 5) nmi = np.round(metrics.nmi(Y, y_pred_gmm), 5) ari = np.round(metrics.ari(Y, y_pred_gmm), 5) print( 'GMM fit of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari)) km = KMeans(n_clusters=self.n_centroids, n_init=20) y_pred_kmeans = km.fit_predict(data) acc = np.round(metrics.acc(Y, y_pred_kmeans), 5) nmi = np.round(metrics.nmi(Y, y_pred_kmeans), 5) ari = np.round(metrics.ari(Y, y_pred_kmeans), 5) print( 'Kmeans clustering of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
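# Standalone illustration of the same evaluation pattern on synthetic data (no autoencoder
# involved): fit a GMM and k-means on 2-D blobs and score both against the true labels with
# scikit-learn's NMI/ARI. This only mirrors the evaluation above; it is not original code.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

X, Y = make_blobs(n_samples=500, centers=3, random_state=0)
gmm_pred = GaussianMixture(n_components=3, covariance_type='full', random_state=0).fit_predict(X)
km_pred = KMeans(n_clusters=3, n_init=20, random_state=0).fit_predict(X)
for name, pred in [('GMM', gmm_pred), ('KMeans', km_pred)]:
    print('%s: nmi = %.5f, ari = %.5f' % (name,
          normalized_mutual_info_score(Y, pred), adjusted_rand_score(Y, pred)))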
def on_epoch_end(self, epoch, logs=None): if epochs < 10 or epoch % int(epochs / 10) != 0: return feature_model = Model( self.model.input, self.model.get_layer( index=int(len(self.model.layers) / 2)).output) features = feature_model.predict(self.x) km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20, n_jobs=4) y_pred = km.fit_predict(features) print(' ' * 8 + '|==> acc: %.4f, nmi: %.4f <==|' % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
def main(): x = np.load('./data/chan/all/VGG/featuresx.npy') init = 'glorot_uniform' # prepare the DEC model silhouette_avgs = [] nmis = [] rel_loss = [] prev = None for n_clusters in [5, 10, 15, 20]: weights = './results/models/chan/all/VGG/DEC_model_final_%s.h5' % n_clusters dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10], n_clusters=n_clusters, init=init) dec.model.load_weights(weights) q = dec.model.predict(x, verbose=0) y_pred = q.argmax(1) if prev is not None: nmi_ = np.round(metrics.nmi(prev, y_pred), 5) nmis.append(nmi_) print( '\n |==> NMI against previous assignment: {0:.3f} <==|'.format( nmi_)) prev = y_pred silhouette_avg = silhouette_score(x, y_pred) silhouette_avgs.append(silhouette_avg) print("For n_clusters =", n_clusters, "the average silhouette_score is:", silhouette_avg) '''tr_loss = dec.model.evaluate(x) ts_loss = dec.model.evaluate(x_test) rel_loss.append(tr_loss/ts_loss) print('\n |==> relative loss: {0:.4f} <==|'.format(tr_loss/ts_loss))''' plt.plot(range(len(nmis)), nmis) plt.show() plt.plot(range(len(silhouette_avgs)), silhouette_avgs) plt.show()
def test(net1, test_data): # if torch.cuda.is_available(): net1 = torch.nn.DataParallel(net1, device_ids=[0]) net1 = net1.cuda() # label2 = [] idx2 = [] for im, label in tqdm(test_data, desc="Processing train data: "): im = im.cuda() _, feat = net1(im) for i in range(feat.size(0)): distance = feat[i].cpu().numpy().tolist() idx = distance.index(max(distance)) label2.append(idx) label1 = label.numpy() for i in label1: idx2.append(i) t2 = np.array(idx2) t1 = np.array(label2) return metrics.acc(t2, t1), metrics.nmi(t2, t1)
def train_feature(net1, train_data): # if torch.cuda.is_available(): net1 = torch.nn.DataParallel(net1, device_ids=[0]) net1 = net1.cuda() # for i_dir in range(classnum): if not os.path.isdir('./data/' + str(i_dir)): os.makedirs('./data/' + str(i_dir)) label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10) # label2 = [] idx2 = [] for im, label in tqdm(train_data, desc="Processing train data: "): # print(label) im = im.cuda() _, feat = net1(im) for i in range(feat.size(0)): distance = feat[i].cpu().numpy().tolist() idx = distance.index(max(distance)) save_image( inver_transform2(im[i]), './data/' + str(idx) + '/' + str(random.randint(1, 10000000)) + '.png') # MATRIX label_np[idx][label[i].item()] += 1 # label2.append(idx) label1 = label.numpy() for i in label1: idx2.append(i) t2 = np.array(idx2) t1 = np.array(label2) print('acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2))) ############################## np.save(File + str(feat.size(1)) + '_' + '.npy', label_np)
def on_epoch_end( self, epoch, logs=None): # called at the end of every epoch if int(epochs / 10) != 0 and epoch % int( epochs / 10) != 0: # only run the print code below once every 10% of the total epochs return feature_model = Model( self.model.input, self.model.get_layer( 'encoder_%d' % (int(len(self.model.layers) / 2) - 1)).output) features = feature_model.predict( self.x ) # pretraining trains the autoencoder; this is the encoder output, so running k-means in the embedded space gives a view of how well the encoding separates the data km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20, n_jobs=4) y_pred = km.fit_predict(features) # print() print(' ' * 8 + '|==> acc: %.4f, nmi: %.4f <==|' % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
def run_net(data, params): # # UNPACK DATA # x_train_unlabeled, y_train_unlabeled, x_val, y_val, x_test, y_test = data[ 'spectral']['train_and_test'] print(params['input_shape']) inputs_vae = Input(shape=params['input_shape'], name='inputs_vae') ConvAE = Conv.ConvAE(inputs_vae, params) try: ConvAE.vae.load_weights('vae_mnist.h5') except OSError: print('No pretrained weights available...') lh = LearningHandler(lr=params['spec_lr'], drop=params['spec_drop'], lr_tensor=ConvAE.learning_rate, patience=params['spec_patience']) lh.on_train_begin() n_epochs = 5000 losses_vae = np.empty((n_epochs, )) homo_plot = np.empty((n_epochs, )) nmi_plot = np.empty((n_epochs, )) ari_plot = np.empty((n_epochs, )) y_val = np.squeeze(np.asarray(y_val).ravel()) # squeeze into 1D array start_time = time.time() for i in range(n_epochs): # if i==0: x_recon, _, x_val_y = ConvAE.vae.predict(x_val) losses_vae[i] = ConvAE.train_vae(x_val, x_val_y, params['batch_size']) #x_val_y = ConvAE.vae.predict(x_val)[2] #y_sp = x_val_y.argmax(axis=1) #print_accuracy(y_sp, y_val, params['n_clusters']) print("Epoch: {}, loss={:2f}".format(i, losses_vae[i])) os.makedirs('vae', exist_ok=True) os.makedirs('vae_umap', exist_ok=True) fig, axs = plt.subplots(3, 4, figsize=(25, 18)) fig.subplots_adjust(wspace=0.25) embedding = ConvAE.encoder.predict(x_val) kmeans = KMeans(n_clusters=params['n_clusters'], n_init=30) predicted_labels = kmeans.fit_predict( embedding) # cluster on current embeddings for metric eval _, confusion_matrix = get_y_preds(predicted_labels, y_val, params['n_clusters']) homo_plot[i] = metrics.acc(y_val, predicted_labels) nmi_plot[i] = metrics.nmi(y_val, predicted_labels) ari_plot[i] = metrics.ari(y_val, predicted_labels) tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) Z_tsne = tsne.fit_transform(embedding) sc = axs[1][0].scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=y_train_unlabeled, cmap=plt.cm.get_cmap("jet", 14)) axs[1][0].set_title('t-SNE Embeddings') axs[1][0].set_xlabel('t-SNE 1') axs[1][0].set_ylabel('t-SNE 2') axs[1][0].set_xticks([]) axs[1][0].set_yticks([]) axs[1][0].spines['right'].set_visible(False) axs[1][0].spines['top'].set_visible(False) divider = make_axes_locatable(axs[1][0]) cax = divider.append_axes('right', size='15%', pad=0.05) cbar = fig.colorbar(sc, cax=cax, orientation='vertical', ticks=range(params['n_clusters'])) cbar.ax.set_yticklabels( params['cluster_names']) # vertically oriented colorbar # Create offset transform by 5 points in x direction dx = 0 / 72. dy = -5 / 72. offset = matplotlib.transforms.ScaledTranslation( dx, dy, fig.dpi_scale_trans) # apply offset transform to all cluster ticklabels. 
for label in cbar.ax.yaxis.get_majorticklabels(): label.set_transform(label.get_transform() + offset) reducer = umap.UMAP(transform_seed=36, random_state=36) matrix_reduce = reducer.fit_transform(embedding) sc = axs[1][1].scatter(matrix_reduce[:, 0], matrix_reduce[:, 1], s=2, c=y_train_unlabeled, cmap=plt.cm.get_cmap("jet", 14)) axs[1][1].set_title('UMAP Embeddings') axs[1][1].set_xlabel('UMAP 1') axs[1][1].set_ylabel('UMAP 2') axs[1][1].set_xticks([]) axs[1][1].set_yticks([]) # Hide the right and top spines axs[1][1].spines['right'].set_visible(False) axs[1][1].spines['top'].set_visible(False) im = axs[1][2].imshow(confusion_matrix, cmap='YlOrRd') axs[1][2].set_title('Confusion Matrix') axs[1][2].set_xticks(range(params['n_clusters'])) axs[1][2].set_yticks(range(params['n_clusters'])) axs[1][2].set_xticklabels(params['cluster_names'], fontsize=8) axs[1][2].set_yticklabels(params['cluster_names'], fontsize=8) divider = make_axes_locatable(axs[1][2]) cax = divider.append_axes('right', size='10%', pad=0.05) cbar = fig.colorbar(im, cax=cax, orientation='vertical', ticks=[]) axs[0][0].plot(losses_vae[:i + 1]) axs[0][0].set_title('VAE Loss') axs[0][0].set_xlabel('epochs') axs[0][1].plot(homo_plot[:i + 1]) axs[0][1].set_title('Homogeneity') axs[0][1].set_xlabel('epochs') axs[0][1].set_ylim(0, 1) axs[0][2].plot(ari_plot[:i + 1]) axs[0][2].set_title('ARI') axs[0][2].set_xlabel('epochs') axs[0][2].set_ylim(0, 1) axs[0][3].plot(nmi_plot[:i + 1]) axs[0][3].set_title('NMI') axs[0][3].set_xlabel('epochs') axs[0][3].set_ylim(0, 1) #reconstructed_cell = ConvAE.vae.predict(x_val[:1, ...])[0, ..., 0] cell_tile = x_val[0, ..., 0] cell_tile = cell_tile[:, :64] x_recon = x_recon[0, ..., 0] reconstructed_cell_tile = x_recon[:, :64] reconstructed_cell_tile = np.flipud(reconstructed_cell_tile) cell_heatmap = np.vstack((cell_tile, reconstructed_cell_tile)) axs[1][3].imshow(cell_heatmap, cmap='Reds') axs[1][3].set_xticks([]) axs[1][3].set_yticks([]) axs[1][3].spines['right'].set_visible(False) axs[1][3].spines['top'].set_visible(False) axs[1][3].spines['left'].set_visible(False) axs[1][3].spines['bottom'].set_visible(False) # get eigenvalues and eigenvectors scale = get_scale(embedding, params['batch_size'], params['scale_nbr']) values, vectors = spectral_clustering(embedding, scale, params['n_nbrs'], params['affinity']) # sort, then store the top n_clusters=2 values_idx = np.argsort(values) x_spectral_clustering = vectors[:, values_idx[:params['n_clusters']]] # do kmeans clustering in this subspace y_spectral_clustering = KMeans( n_clusters=params['n_clusters']).fit_predict( vectors[:, values_idx[:params['n_clusters']]]) tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) Z_tsne = tsne.fit_transform(x_spectral_clustering) sc = axs[2][0].scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=y_train_unlabeled, cmap=plt.cm.get_cmap("jet", 14)) axs[2][0].set_title('Spectral Clusters (t-SNE) True Labels') axs[2][0].set_xlabel('t-SNE 1') axs[2][0].set_ylabel('t-SNE 2') axs[2][0].set_xticks([]) axs[2][0].set_yticks([]) axs[2][0].spines['right'].set_visible(False) axs[2][0].spines['top'].set_visible(False) reducer = umap.UMAP(transform_seed=36, random_state=36) matrix_reduce = reducer.fit_transform(x_spectral_clustering) axs[2][1].scatter(matrix_reduce[:, 0], matrix_reduce[:, 1], s=2, c=y_spectral_clustering, cmap=plt.cm.get_cmap("jet", 14)) axs[2][1].set_title('Spectral Clusters (UMAP)') axs[2][1].set_xlabel('UMAP 1') axs[2][1].set_ylabel('UMAP 2') axs[2][1].set_xticks([]) axs[2][1].set_yticks([]) # Hide the right and 
top spines axs[2][1].spines['right'].set_visible(False) axs[2][1].spines['top'].set_visible(False) axs[2][2].scatter(matrix_reduce[:, 0], matrix_reduce[:, 1], s=2, c=y_train_unlabeled, cmap=plt.cm.get_cmap("jet", 14)) axs[2][2].set_title('True Labels (UMAP)') axs[2][2].set_xlabel('UMAP 1') axs[2][2].set_ylabel('UMAP 2') axs[2][2].set_xticks([]) axs[2][2].set_yticks([]) # Hide the right and top spines axs[2][2].spines['right'].set_visible(False) axs[2][2].spines['top'].set_visible(False) axs[2][3].hist(x_spectral_clustering) axs[2][3].set_title("histogram of true eigenvectors") train_time = str( datetime.timedelta(seconds=(int(time.time() - start_time)))) n_matrices = (i + 1) * params['batch_size'] * 100 fig.suptitle('Trained on ' + '{:,}'.format(n_matrices) + ' cells\n' + train_time) plt.savefig('vae/%d.png' % i) plt.close() plt.close() if i > 1: if np.abs(losses_vae[i] - losses_vae[i - 1]) < 0.0001: print('STOPPING EARLY') break print("finished training") plt.plot(losses_vae) plt.title('VAE Loss') plt.show() x_val_y = ConvAE.vae.predict(x_val)[2] # x_val_y = ConvAE.classfier.predict(x_val_lp) y_sp = x_val_y.argmax(axis=1) print_accuracy(y_sp, y_val, params['n_clusters']) from sklearn.metrics import normalized_mutual_info_score as nmi y_val = np.squeeze(np.asarray(y_val).ravel()) # squeeze into 1D array print(y_sp.shape, y_val.shape) nmi_score1 = nmi(y_sp, y_val) print('NMI: ' + str(np.round(nmi_score1, 4))) embedding = ConvAE.encoder.predict(x_val) tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) Z_tsne = tsne.fit_transform(embedding) fig = plt.figure() plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=y_train_unlabeled, cmap=plt.cm.get_cmap("jet", 14)) plt.colorbar(ticks=range(params['n_clusters'])) plt.show()
def fit(self, x, y=None, batch_size=256, maxiter=2e4, tol=1e-3, update_interval=140, cae_weights=None, save_dir='./results/temp'): print('Update interval', update_interval) save_interval = x.shape[0] / batch_size * 5 print('Save interval', save_interval) # Step 1: pretrain if necessary t0 = time() if not self.pretrained and cae_weights is None: print('...pretraining CAE using default hyper-parameters:') print(' optimizer=\'adam\'; epochs=200') self.pretrain(x, batch_size, save_dir=save_dir) self.pretrained = True elif cae_weights is not None: self.cae.load_weights(cae_weights) print('cae_weights is loaded successfully.') # Step 2: initialize cluster centers using k-means t1 = time() print('Initializing cluster centers with k-means.') kmeans = KMeans(n_clusters=self.n_clusters, n_init=20) self.y_pred = kmeans.fit_predict(self.encoder.predict(x)) y_pred_last = np.copy(self.y_pred) self.model.get_layer(name='clustering').set_weights( [kmeans.cluster_centers_]) # Step 3: deep clustering # logging file import csv, os if not os.path.exists(save_dir): os.makedirs(save_dir) logfile = open(save_dir + '/dcec_log.csv', 'w') logwriter = csv.DictWriter( logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr']) logwriter.writeheader() t2 = time() loss = [0, 0, 0] index = 0 for ite in range(int(maxiter)): if ite % update_interval == 0: q, _ = self.model.predict(x, verbose=0) p = self.target_distribution( q) # update the auxiliary target distribution p # evaluate the clustering performance self.y_pred = q.argmax(1) if y is not None: acc = np.round(metrics.acc(y, self.y_pred), 5) nmi = np.round(metrics.nmi(y, self.y_pred), 5) ari = np.round(metrics.ari(y, self.y_pred), 5) loss = np.round(loss, 5) logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, L=loss[0], Lc=loss[1], Lr=loss[2]) logwriter.writerow(logdict) print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari', ari, '; loss=', loss) # check stop criterion delta_label = np.sum(self.y_pred != y_pred_last).astype( np.float32) / self.y_pred.shape[0] y_pred_last = np.copy(self.y_pred) if ite > 0 and delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') logfile.close() break # train on batch if (index + 1) * batch_size > x.shape[0]: loss = self.model.train_on_batch( x=x[index * batch_size::], y=[p[index * batch_size::], x[index * batch_size::]]) index = 0 else: loss = self.model.train_on_batch( x=x[index * batch_size:(index + 1) * batch_size], y=[ p[index * batch_size:(index + 1) * batch_size], x[index * batch_size:(index + 1) * batch_size] ]) index += 1 # save intermediate model if ite % save_interval == 0: # save DCEC model checkpoints print('saving model to:', save_dir + '/dcec_model_' + str(ite) + '.h5') self.model.save_weights(save_dir + '/dcec_model_' + str(ite) + '.h5') ite += 1 # save the trained model logfile.close() print('saving model to:', save_dir + '/dcec_model_final.h5') self.model.save_weights(save_dir + '/dcec_model_final.h5') t3 = time() print('Pretrain time: ', t1 - t0) print('Clustering time:', t3 - t1) print('Total time: ', t3 - t0)
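# The fit() above initializes a layer named 'clustering' whose single weight matrix holds the
# cluster centres. A minimal sketch of such a layer in the standard DEC formulation (soft
# assignment with a Student's t-kernel); the layer in the original repository may differ.
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer

class ClusteringLayer(Layer):
    def __init__(self, n_clusters, alpha=1.0, **kwargs):
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha

    def build(self, input_shape):
        # one trainable weight: the (n_clusters, embedding_dim) matrix of centres
        self.clusters = self.add_weight(name='clusters',
                                        shape=(self.n_clusters, int(input_shape[1])),
                                        initializer='glorot_uniform')
        super(ClusteringLayer, self).build(input_shape)

    def call(self, inputs):
        # q_ij = (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha+1)/2), normalized over clusters j
        q = 1.0 / (1.0 + K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters),
                               axis=2) / self.alpha)
        q = q ** ((self.alpha + 1.0) / 2.0)
        return q / K.sum(q, axis=1, keepdims=True)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.n_clusters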
elif args.dataset == 'usps': x, y = load_usps('data/usps') elif args.dataset == 'mnist-test': x, y = load_mnist() x, y = x[60000:], y[60000:] # prepare the DCEC model dcec = DCEC(input_shape=x.shape[1:], filters=[32, 64, 128, 10], n_clusters=args.n_clusters) plot_model(dcec.model, to_file=args.save_dir + '/dcec_model.png', show_shapes=True) dcec.model.summary() # begin clustering. optimizer = 'adam' dcec.compile(loss=['kld', 'mse'], loss_weights=[args.gamma, 1], optimizer=optimizer) dcec.fit(x, y=y, tol=args.tol, maxiter=args.maxiter, update_interval=args.update_interval, save_dir=args.save_dir, cae_weights=args.cae_weights) y_pred = dcec.y_pred print('acc = %.4f, nmi = %.4f, ari = %.4f' % (metrics.acc( y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
"p123", "p125", "p128", "p129", "p133", "p135" ] all_validation_image_names = ["p137", "p141", "p143", "p144", "p147"] param_file_names = ["translation", "affine", "parameters_test"] param_array = get_param_array(param_file_names) nr_atlas_images = len(all_training_image_names) for valid_img_name in all_validation_image_names: begin_time = time.time() print(f"{valid_img_name} |", end="\t", flush=True) valid_img_path = f"{VALIDATION_DATA_PATH}/{valid_img_name}/mr_bffe.mhd" valid_img = GetArrayFromImage(ReadImage(valid_img_path)) weights = np.zeros(nr_atlas_images) predictions = np.zeros((nr_atlas_images, 86, 333, 271)) for i, atlas_img_name in enumerate(all_training_image_names): print(f"{atlas_img_name}", end="\t", flush=True) atlas_mr_img_path = f"{TRAINING_DATA_PATH}/{atlas_img_name}/mr_bffe.mhd" atlas_pros_img_path = f"{TRAINING_DATA_PATH}/{atlas_img_name}/prostaat.mhd" transform = get_transform(valid_img_path, atlas_mr_img_path) transformed_atlas_mr_img = get_transformed_image( atlas_mr_img_path, transform) predictions[i] = get_transformed_image(atlas_pros_img_path, transform) weights[i] = metrics.nmi(valid_img, transformed_atlas_mr_img) weights = (weights - np.min(weights))**2 prediction = np.zeros((86, 333, 271)) for i in range(nr_atlas_images): prediction += predictions[i] * weights[i] prediction = (prediction > 0.45 * np.sum(weights)).astype(np.uint8) write_mhd(valid_img_name, prediction) print(time.time() - begin_time)
dec = STC(dims=[x.shape[-1], 500, 500, 2000, 20], n_clusters=n_clusters) # pretrain model #################################################################################### #if not os.path.exists(args.ae_weights): dec.pretrain(x=x, y=None, optimizer='adam', epochs=args.pretrain_epochs, batch_size=args.batch_size, save_dir=args.save_dir) #else: # dec.autoencoder.load_weights(args.ae_weights) dec.model.summary() t0 = time() dec.compile(SGD(0.1, 0.9), loss='kld') # clustering #################################################################################### y_pred = dec.fit(x, y=y, tol=args.tol, maxiter=args.maxiter, batch_size=args.batch_size, update_interval=args.update_interval, save_dir=args.save_dir, rand_seed=0) print('acc:', metrics.acc(y, y_pred)) print('nmi', metrics.nmi(y, y_pred))
def fit(self, x, y=None, batch_size=256, epochs=100, ae_weights=None, save_dir='result/temp', tol=0.001, use_sp=True, da_s2=False): # prepare folder for saving results import csv, os if not os.path.exists(save_dir): os.makedirs(save_dir) # pretraining t0 = time() if ae_weights is None and not self.pretrained: print('Pretraining AE...') self.pretrain(x, save_dir=save_dir) print('Pretraining time: %.1fs' % (time() - t0)) elif ae_weights is not None: self.autoencoder.load_weights(ae_weights) print('Pretrained AE weights are loaded successfully!') # initialization t1 = time() self.y_pred, self.centers = self.basic_clustering(self.predict(x)) t2 = time() print('Time for initialization: %.1fs' % (t2 - t1)) # logging file logfile = open(save_dir + '/log.csv', 'w') logwriter = csv.DictWriter( logfile, fieldnames=['epoch', 'acc', 'nmi', 'Ln', 'Lc']) logwriter.writeheader() best_ACC = 0 ## track the best ACC seen so far so later epochs can be compared against it net_loss = 0 clustering_loss = 0 time_train = 0 sample_weight = np.ones(shape=x.shape[0]) sample_weight[self.y_pred == -1] = 0 # do not use the noisy examples y_pred_last = np.copy(self.y_pred) result = None for epoch in range(epochs + 1): """ Log and check stopping criterion """ if y is not None: acc = np.round(metrics.acc(y, self.y_pred), 5) nmi = np.round(metrics.nmi(y, self.y_pred), 5) print( 'Epoch-%d: ACC=%.4f, NMI=%.4f, Ln=%.4f, Lc=%.4f; time=%.1f' % (epoch, acc, nmi, net_loss, clustering_loss, time_train)) logwriter.writerow( dict(epoch=epoch, acc=acc, nmi=nmi, Ln=net_loss, Lc=clustering_loss)) logfile.flush() # record the initial result if epoch == 0: print('ASPC model saved to \'%s/model_init.h5\'' % save_dir) self.model.save_weights(save_dir + '/model_init.h5') ## save a checkpoint whenever the best ACC so far improves if acc > best_ACC: self.model.save_weights(save_dir + '/model_best.h5') best_ACC = acc # check stop criterion delta_y = np.sum(self.y_pred != y_pred_last).astype( np.float32) / self.y_pred.shape[0] y_pred_last = np.copy(self.y_pred) if (epoch > 0 and delta_y < tol) or epoch >= epochs: result = np.asarray([acc, nmi]) print( 'Training stopped: epoch=%d, delta_label=%.4f, tol=%.4f' % (epoch, delta_y, tol)) print('ASPC model saved to \'%s/model_final.h5\'' % save_dir) print('-' * 30 + ' END: time=%.1fs ' % (time() - t0) + '-' * 30) self.model.save_weights(save_dir + '/model_final.h5') logfile.close() break """ Step 1: train the network """ t0_epoch = time() if da_s2: # use data augmentation history = self.model.fit_generator( generator(self.datagen, x, self.centers[self.y_pred], sample_weight, batch_size), steps_per_epoch=math.ceil(x.shape[0] / batch_size), epochs=5 if np.any(self.y_pred == -1) and epoch == 0 else 1, workers=4, verbose=0) else: history = self.model.fit(x, y=self.centers[self.y_pred], batch_size=batch_size, epochs=1, sample_weight=sample_weight, verbose=0) net_loss = history.history['loss'][0] """ Step 2: update labels """ self.y_pred, losses = self.update_labels(self.predict(x), self.centers) clustering_loss = np.mean(losses) """ Step 3: Compute sample weights """ sample_weight = self.compute_sample_weight( losses, epoch, epochs) if use_sp else None time_train = time() - t0_epoch return result
def get_normalized_nmi_weight(input_fixed_img, input_moving_img): nmi = metrics.nmi(input_fixed_img, input_moving_img) return nmi - 1.00591886842159
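# In the registration snippets above, metrics.nmi is applied to whole intensity images rather
# than label vectors. A plausible histogram-based sketch of such an image NMI (Studholme's
# normalized mutual information, which is >= 1, consistent with the constant subtracted above);
# this is an assumption, the original metrics module may compute it differently.
import numpy as np

def image_nmi(fixed, moving, bins=64):
    """Normalized mutual information of two equally shaped intensity images."""
    hist_2d, _, _ = np.histogram2d(fixed.ravel(), moving.ravel(), bins=bins)
    pxy = hist_2d / hist_2d.sum()
    px = pxy.sum(axis=1)
    py = pxy.sum(axis=0)
    nz = pxy > 0
    hxy = -np.sum(pxy[nz] * np.log(pxy[nz]))     # joint entropy
    hx = -np.sum(px[px > 0] * np.log(px[px > 0]))  # marginal entropy of the fixed image
    hy = -np.sum(py[py > 0] * np.log(py[py > 0]))  # marginal entropy of the moving image
    return (hx + hy) / hxy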
def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, update_interval=140, save_dir='./results/temp'): print('Update interval', update_interval) save_interval = int(x.shape[0] / batch_size) * 5 # 5 epochs print('Save interval', save_interval) # Step 1: initialize cluster centers using k-means t1 = time() print('Initializing cluster centers with k-means.') kmeans = KMeans(n_clusters=self.n_clusters, n_init=20) y_pred = kmeans.fit_predict(self.encoder.predict(x)) y_pred_last = np.copy(y_pred) self.model.get_layer(name='clustering').set_weights( [kmeans.cluster_centers_]) # Step 2: deep clustering # logging file import csv logfile = open(save_dir + '/dec_log.csv', 'w') logwriter = csv.DictWriter( logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss']) logwriter.writeheader() loss = 0 index = 0 index_array = np.arange(x.shape[0]) for ite in range(int(maxiter)): if ite % update_interval == 0: q = self.model.predict(x, verbose=0) p = self.target_distribution( q) # update the auxiliary target distribution p # evaluate the clustering performance y_pred = q.argmax(1) if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) loss = np.round(loss, 5) logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, loss=loss) logwriter.writerow(logdict) print( 'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari), ' ; loss=', loss) # check stop criterion delta_label = np.sum(y_pred != y_pred_last).astype( np.float32) / y_pred.shape[0] y_pred_last = np.copy(y_pred) if ite > 0 and delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') logfile.close() break # train on batch # if index == 0: # np.random.shuffle(index_array) idx = index_array[index * batch_size:min((index + 1) * batch_size, x.shape[0])] loss = self.model.train_on_batch(x=x[idx], y=p[idx]) index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0 # save intermediate model if ite % save_interval == 0: print('saving model to:', save_dir + '/DEC_model_' + str(ite) + '.h5') self.model.save_weights(save_dir + '/DEC_model_' + str(ite) + '.h5') ite += 1 # save the trained model logfile.close() print('saving model to:', save_dir + '/DEC_model_final.h5') self.model.save_weights(save_dir + '/DEC_model_final.h5') return y_pred
def fit(self, x_train, x_val, x_test, model_name, outdir, df_columns, y=None, epoch=500, batch_size=256, update_interval=5, early_stopping=20, tol=0.01): print('Update interval', update_interval) # Step 1: initialize cluster centers using k-means t1 = time() print('Initializing cluster centers with k-means.') kmeans = KMeans(n_clusters=self.n_clusters, n_init=20) encoder_out = self.encoder.predict(x_train) y_pred = kmeans.fit_predict(encoder_out) # y_pred = kmeans.fit_predict(x_train) if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) print('kmans: acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari)) X_embedded = TSNE(n_components=2).fit_transform(encoder_out) plt.figure(figsize=(12, 10)) plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) plt.colorbar() plt.show() print(np.bincount(y_pred)) y_pred_last = np.copy(y_pred) self.model.get_layer(name='clustering').set_weights( [kmeans.cluster_centers_]) # for ite in range(int(epoch)): # if ite % update_interval == 0: # q,_,_ = self.model.predict(x_train, verbose=0) # p = self.target_distribution(q) # update the auxiliary target distribution p # y0 = np.zeros_like(x_train) # self.model.fit(x=x_train, y=[p, y0, x_train], batch_size=batch_size) # Step 2: deep clustering index = 0 index_array_train = np.arange(x_train.shape[0]) index_array_val = np.arange(x_val.shape[0]) cost_val = [] cost_train = [] for ite in range(int(epoch)): if ite % update_interval == 0: q, _, _ = self.model.predict(x_train, verbose=0) p = self.target_distribution( q) # update the auxiliary target distribution p y_pred = q.argmax(1) delta_label = np.sum(y_pred != y_pred_last).astype( np.float32) / y_pred.shape[0] print("delta label:{}".format(delta_label)) y_pred_last = np.copy(y_pred) if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari)) print(np.bincount(y_pred)) if ite > update_interval and delta_label < tol: # and np.mean(cost_val[-(early_stopping + 1):-1]) > \ # np.mean(cost_val[-(early_stopping*2 + 1):-(early_stopping + 1)])\ # and np.mean(cost_train[-(early_stopping + 1):-1]) < \ # np.mean(cost_train[-(early_stopping*2 + 1):-(early_stopping + 1)]): print("Early stopping...") break # train on batch tot_train_loss = 0. tot_sparse_loss = 0. tot_mse_loss = 0. tot_cluster_loss = 0. 
while True: if index == 0: np.random.shuffle(index_array_train) idx = index_array_train[index * batch_size:min( (index + 1) * batch_size, x_train.shape[0])] y0 = np.zeros_like(x_train[idx]) # cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch(x=x_train[idx], y=[p[idx], y0, x_train[idx]]) loss, cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch( x=x_train[idx], y=[p[idx], y0, x_train[idx]]) index = index + 1 if ( index + 2) * batch_size <= x_train.shape[0] else 0 tot_train_loss += loss * len(idx) tot_cluster_loss += cluster_loss * len(idx) tot_mse_loss += mse_loss * len(idx) tot_sparse_loss += sparse_loss * len(idx) if index == 0: break avg_train_loss = tot_train_loss / x_train.shape[0] avg_cluster_loss = tot_cluster_loss / x_train.shape[0] avg_mse_loss = tot_mse_loss / x_train.shape[0] avg_sparse_loss = tot_sparse_loss / x_train.shape[0] print( "epoch {}th train, train_loss :{:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n" .format(ite + 1, avg_train_loss, avg_cluster_loss, avg_mse_loss, avg_sparse_loss)) cost_train.append(avg_train_loss) # # tot_val_loss = 0. # tot_sparse_loss = 0. # tot_mse_loss = 0. # tot_cluster_loss = 0. # while True: # if index == 0: # np.random.shuffle(index_array_val) # idx = index_array_val[index * batch_size: min((index+1) * batch_size, x_val.shape[0])] # y0 = np.zeros_like(x_val[idx]) # loss, cluster_loss, sparse_loss, mse_loss = self.model.test_on_batch(x=x_val[idx], y=[p[idx], y0, x_val[idx]]) # index = index + 1 if (index + 2) * batch_size <= x_val.shape[0] else 0 # tot_cluster_loss += cluster_loss *len(idx) # tot_mse_loss += mse_loss *len(idx) # tot_sparse_loss += sparse_loss *len(idx) # tot_val_loss += loss * len(idx) # if index==0: # break # avg_val_loss = tot_val_loss / x_val.shape[0] # avg_cluster_loss = tot_cluster_loss / x_val.shape[0] # avg_mse_loss = tot_mse_loss / x_val.shape[0] # avg_sparse_loss = tot_sparse_loss / x_val.shape[0] # print("epoch {}th validate, loss: {:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n".format(ite + 1, # avg_val_loss, avg_cluster_loss, # avg_mse_loss, # avg_sparse_loss)) # cost_val.append(avg_val_loss) print('training time: ', time() - t1) # save the trained model print("saving predict data...") encoder_out = self.encoder.predict(x_test) q, decoder_out, _ = self.model.predict(x_test) y_pred = q.argmax(1) if y is not None: print("orginal cluster proportion: {}".format(np.bincount(y))) acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari)) X_embedded = TSNE(n_components=2).fit_transform(encoder_out) plt.figure(figsize=(12, 10)) plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) plt.colorbar() plt.show() print(np.bincount(y_pred)) print(np.bincount(y_pred)) y_pred = kmeans.fit_predict(encoder_out) if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) print('kmeans : acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari)) print(np.bincount(y_pred)) decoder_sub = decoder_out + x_test df = pd.DataFrame(decoder_out, columns=df_columns) df_replace = pd.DataFrame(decoder_sub, columns=df_columns) outDir = os.path.join(outdir, model_name) if os.path.exists(outDir) == False: os.makedirs(outDir) outPath = os.path.join(outDir, "{}.{}.complete".format(model_name, ite)) df.to_csv(outPath, index=None, float_format='%.4f') 
df_replace.to_csv(outPath.replace(".complete", ".complete.sub"), index=None, float_format='%.4f') pd.DataFrame(encoder_out).to_csv(outPath.replace( ".complete", ".encoder.out"), float_format='%.4f') print("saving done!")
def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, update_interval=140, save_dir='./results/temp', aug_cluster=False): print('Begin clustering:', '-' * 60) print('Update interval', update_interval) save_interval = int(maxiter) # only save the initial and final model print('Save interval', save_interval) # Step 1: initialize cluster centers using k-means t1 = time() print('Initializing cluster centers with k-means.') kmeans = KMeans(n_clusters=self.n_clusters, n_init=20) features = self.encoder.predict(x) y_pred = kmeans.fit_predict(features) y_pred_last = np.copy(y_pred) self.model.get_layer(name='clustering').set_weights( [kmeans.cluster_centers_]) # Step 2: deep clustering # logging file import csv, os if not os.path.exists(save_dir): os.makedirs(save_dir) logfile = open(save_dir + '/log.csv', 'w') logwriter = csv.DictWriter( logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss']) logwriter.writeheader() loss = 0 index = 0 index_array = np.arange(x.shape[0]) for ite in range(int(maxiter)): if ite % update_interval == 0: q = self.predict(x) p = self.target_distribution( q) # update the auxiliary target distribution p # evaluate the clustering performance y_pred = q.argmax(1) avg_loss = loss / update_interval loss = 0. if y is not None: acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, loss=avg_loss) logwriter.writerow(logdict) logfile.flush() print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; loss=%.5f' % (ite, acc, nmi, ari, avg_loss)) # check stop criterion delta_label = np.sum(y_pred != y_pred_last).astype( np.float32) / y_pred.shape[0] y_pred_last = np.copy(y_pred) if ite > 0 and delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') logfile.close() break # save intermediate model if ite % save_interval == 0: print('saving model to:', save_dir + '/model_' + str(ite) + '.h5') self.model.save_weights(save_dir + '/model_' + str(ite) + '.h5') # train on batch idx = index_array[index * batch_size:min((index + 1) * batch_size, x.shape[0])] x_batch = self.random_transform(x[idx]) if aug_cluster else x[idx] loss += self.train_on_batch(x=x_batch, y=p[idx]) index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0 ite += 1 # save the trained model logfile.close() print('saving model to:', save_dir + '/model_final.h5') self.model.save_weights(save_dir + '/model_final.h5') print('Clustering time: %ds' % (time() - t1)) print('End clustering:', '-' * 60) return y_pred
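# The aug_cluster branch above calls self.random_transform, which is not shown. A minimal
# sketch of a per-batch augmentation helper built on Keras' ImageDataGenerator, assuming the
# inputs are image tensors with a channel axis; the transformations used by the original
# model may differ.
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

_datagen = ImageDataGenerator(rotation_range=10,
                              width_shift_range=0.1,
                              height_shift_range=0.1)

def random_transform(x_batch):
    """Apply an independent random transform to every image in the batch."""
    return np.asarray([_datagen.random_transform(img) for img in x_batch])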
from multiEmbedding import metaembedding_load_search_snippet2,metaembedding_load_stackoverflow,load_tweet89,load_20ngnews # from Tfidf import tf,tfidf filename = 'data/20ngnews/20ngnews.txt' x,y = load_20ngnews() # x_tf = tf(filename) # x_tfidf = tfidf(filename) # print("x.shape: ",x.shape) # print("x_tf.shape: ",x_tf.shape) # print("x_tfidf.shape: ",x_tfidf.shape) clusternum = len(set(y)) print("clusternum:", clusternum) kmeans = KMeans(n_clusters=clusternum, n_init=100) y_pred = kmeans.fit_predict(x) acc = np.round(metrics.acc(y, y_pred), 5) # NMI value nmi = np.round(metrics.nmi(y, y_pred), 5) print('acc = %.5f, nmi = %.5f' % (acc, nmi))
def run_clustering(doc_embeddings, dims, batch_size=16, n_epochs=1, update_interval=80, tol=0.001, y_real=None, device="cpu"): inputs = torch.from_numpy(doc_embeddings).to(device) dataset = TensorDataset(inputs) dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False) model = HybridModel(dims) enc_dec_model = {k[2:]: v for k, v in torch.load("enc_dec_model").items()} model.encoder.load_state_dict(enc_dec_model, strict=False) model.decoder.load_state_dict(enc_dec_model, strict=False) model = model.to(device) if os.path.exists("clustering_model"): model.load_state_dict(torch.load("clustering_model")) print("clustering model load from ckpt") model.train() optimizer = Adam(model.parameters(), lr=1e-3) criterion1 = nn.KLDivLoss(reduction="batchmean") criterion2 = nn.SmoothL1Loss() y_pred_last = np.zeros([doc_embeddings.shape[0]]) is_end = False bst_model_acc = 0.0 for epoch in range(n_epochs): if is_end: break batch_num = 1 train_loss = 0.0 for data in dataloader: if (batch_num - 1) % update_interval == 0: model.eval() with torch.no_grad(): _, q = model(inputs) p = torch.Tensor(target_distribution( q.cpu().numpy())).to(device) y_pred = q.cpu().numpy().argmax(1) if y_real is not None: acc = np.round(metrics.acc(y_real, y_pred), 5) nmi = np.round(metrics.nmi(y_real, y_pred), 5) ari = np.round(metrics.ari(y_real, y_pred), 5) print( 'Epoch %d, Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % ((epoch + 1), batch_num, acc, nmi, ari)) if acc > bst_model_acc: torch.save(model.state_dict(), "clustering_model") bst_model_acc = acc # check stop criterion - model convergence delta_label = np.sum(y_pred != y_pred_last).astype( np.float32) / y_pred.shape[0] # print("delta_label: {}".format(delta_label)) y_pred_last = np.copy(y_pred) model.train() if delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') is_end = True break x_batch = data[0] # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize y_hat_dec_batch, y_hat_clu_batch = model(x_batch) y_batch = p[((batch_num - 1) * batch_size):(batch_num * batch_size), :] loss1 = 1e-1 * criterion1(torch.log(y_hat_clu_batch), y_batch) # torch.from_numpy(y_batch)) loss2 = criterion2(y_hat_dec_batch, x_batch) loss = loss1 + loss2 loss.backward() train_loss += loss.item() optimizer.step() if batch_num - 1 % update_interval == 0: print("kld loss: {}, mse loss: {}".format(loss1, loss2)) print("step loss: {}".format(train_loss / update_interval)) train_loss = 0.0 batch_num += 1 torch.save(model.state_dict(), "clustering_model") model.eval() with torch.no_grad(): _, q = model(inputs) q = q.cpu().numpy() return q.argmax(1)