Example #1
    def epochBegin(self, epoch):
        if epoch % self.decay_n == 0 and epoch != 0:
            self.lr_decay()

        gamma = self.gamma_output.predict(self.inputs, batch_size=batch_size)
        pred = np.argmax(gamma, axis=1)
        acc = self.cluster_acc(pred, self.Y)

        Y = np.reshape(self.Y, [self.Y.shape[0]])
        nmi = metrics.nmi(Y, pred)
        ari = metrics.ari(Y, pred)
        purity = self.purity_score(Y, pred)
        global accuracy
        accuracy = []
        accuracy += [acc[0]]
        if epoch > 0:
            print('ACC:%0.8f' % acc[0])
            print('NMI:', nmi)
            print('ARI:', ari)
            print('Purity', purity)
        if epoch == 1 and dataset == 'har' and acc[0] < 0.77:
            print(
                '=========== HAR dataset: bad init! Please run again! ============'
            )
            sys.exit(0)
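The purity printed above comes from a project-specific purity_score helper that is not shown. A minimal sketch of the standard definition (assuming scikit-learn is available; the helper name mirrors the one used above):

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_true, y_pred):
    # Assign every cluster to its majority true class, then report the
    # fraction of samples that land in that class.
    cm = contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(cm, axis=0)) / np.sum(cm)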
Example #2
def match(y, cl):
    cl = np.array(cl)
    y = np.array(y)
    acc = np.round(metrics.acc(y, cl), 5)
    nmi = np.round(metrics.nmi(y, cl), 5)
    ari = np.round(metrics.ari(y, cl), 5)
    return acc, nmi, ari
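These examples all call into a project-specific metrics module that is not shown. A minimal sketch of the usual DEC-style definitions, assuming acc is clustering accuracy under the best one-to-one label mapping (Hungarian algorithm) and that nmi/ari simply wrap the scikit-learn scores:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_rand_score as ari

def acc(y_true, y_pred):
    # Count label co-occurrences, find the best one-to-one mapping between
    # cluster ids and class ids, then report plain accuracy under that mapping.
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() / y_pred.size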
Example #3
    def metric(self, y, y_pred):
        acc = np.round(metrics.acc(y, y_pred), 5)
        nmi = np.round(metrics.nmi(y, y_pred), 5)
        ari = np.round(metrics.ari(y, y_pred), 5)
        print('acc:', acc)
        print('nmi:', nmi)
        print('ari:', ari)
Example #4
def train(args):
    # get data and model
    (x, y), model = _get_data_and_model(args)

    # split train validation data
    if y is None:
        x_train, x_val = train_test_split(x, test_size=0.1)
        y_val = None
        y_train = None
    else:
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                          y,
                                                          stratify=y,
                                                          test_size=0.1)

    model.model.summary()

    # pretraining
    t0 = time()
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.pretrained_weights is not None and os.path.exists(
            args.pretrained_weights):  # load pretrained weights
        model.autoencoder.load_weights(args.pretrained_weights)
    else:  # train
        pretrain_optimizer = SGD(1.0, 0.9) if args.method in [
            'FcDEC', 'FcIDEC', 'FcDEC-DA', 'FcIDEC-DA'
        ] else 'adam'
        model.pretrain(x_train,
                       y_train,
                       x_val,
                       y_val,
                       optimizer=pretrain_optimizer,
                       epochs=args.pretrain_epochs,
                       batch_size=args.batch_size,
                       save_dir=args.save_dir,
                       verbose=args.verbose,
                       aug_pretrain=args.aug_pretrain)
    t1 = time()
    print("Time for pretraining: %ds" % (t1 - t0))

    # clustering
    y_pred = model.fit(x,
                       y,
                       maxiter=args.maxiter,
                       batch_size=args.batch_size,
                       update_interval=args.update_interval,
                       save_dir=args.save_dir,
                       aug_cluster=args.aug_cluster)
    if y is not None:
        print('Final: acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(
            y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
    t2 = time()
    print("Time for pretaining, clustering and total: (%ds, %ds, %ds)" %
          (t1 - t0, t2 - t1, t2 - t0))
    print('=' * 60)
Example #5
def test(args):
    assert args.weights is not None
    (x, y), model = _get_data_and_model(args)
    model.model.summary()

    print('Begin testing:', '-' * 60)
    model.load_weights(args.weights)
    y_pred = model.predict_labels(x)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(
        y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
    print('End testing:', '-' * 60)
Example #6
def kmeans_():

    # use features for clustering
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=N, init='k-means++')
    #features = np.reshape(x_train, newshape=(features.shape[0], -1))
    km_trans = km.fit_transform(x_train)
    pred = km.predict(x_train)
    print(pred.shape)
    print('acc=', met.acc(y_train, pred), 'nmi=', met.nmi(y_train,
                                                          pred), 'ari=',
          met.ari(y_train, pred))
    return km_trans, pred
Example #7
    def fit(self, x, y=None, save_dir='./results/temp'):
        # print('Begin training:', '-' * 60)

        t1 = time()
        print(
            '******************** Use DenPeak to Cluster ************************'
        )

        features = self.encoder.predict(x)
        print("features shape:", features.shape)
        features = TSNE(n_components=2).fit_transform(features)
        # np.savetxt("features.txt", features)
        print("features shape:", features.shape)
        y_pred, y_border, center_num, dc_percent, dc = DenPeakCluster(features)

        # make sure the output directory exists before saving any figures or logs
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        print('saving picture to:', save_dir + '/2D.png')
        plt.cla()
        plt.scatter(features[:, 0], features[:, 1], c=y_pred, s=0.5, alpha=0.5)
        plt.savefig(save_dir + '/2D.png')
        np.savetxt(save_dir + '/dc_coeff.txt', [dc_percent, dc])

        # logging file
        logfile = open(save_dir + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile,
            fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss', 'center_num'])
        logwriter.writeheader()

        acc = np.round(metrics.acc(y, y_pred), 5)
        nmi = np.round(metrics.nmi(y, y_pred), 5)
        ari = np.round(metrics.ari(y, y_pred), 5)
        # if acc>=0.95:
        np.savetxt(save_dir + '/features.txt', features)
        np.savetxt(save_dir + '/labels.txt', y_pred)
        np.savetxt(save_dir + '/border.txt', y_border)
        from Draw_border import draw
        draw(save_dir)
        logdict = dict(iter=0,
                       acc=acc,
                       nmi=nmi,
                       ari=ari,
                       center_num=center_num)
        logwriter.writerow(logdict)
        logfile.flush()
        print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; center_num=%d' %
              (0, acc, nmi, ari, center_num))
        logfile.close()

        return y_pred
Example #8
def train_feature(net1, train_data):
    map_dict = read_pkl()
    if torch.cuda.is_available():
        net1 = torch.nn.DataParallel(net1, device_ids=[0])
        net1 = net1.cuda()
    prev_time = datetime.now()

    for i_dir in range(classnum):
        if not os.path.isdir('./data/' + str(i_dir)):
            os.makedirs('./data/' + str(i_dir))
    label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10)
    # label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    #                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 20).reshape(20, 20)

    label2 = []
    idx2 = []
    for im, label in tqdm(train_data, desc="Processing train data: "):
        im = im.cuda()
        feat = net1(im)
        for i in range(feat.size(0)):
            distance_list = list()
            for ui_50D_label in map_dict.values():
                distance = sum(sum((ui_50D_label.float().cuda() - feat[i])**2))
                distance_list.append(distance.item())
            idx = distance_list.index(min(distance_list))
            save_image(
                inver_transform2(im[i]), './data/' + str(idx) + '/' +
                str(random.randint(1, 10000000)) + '.png')
            label_np[idx][label[i].item()] += 1
            label2.append(idx)
        label1 = label.numpy()
        # for _,i in enumerate(label):
        #     idx2.append(i)
        for i in label1:
            idx2.append(i)

    t2 = np.array(idx2)
    t1 = np.array(label2)
    # print(t2.shape)
    # t2 = t2.reshape([t1.size,-1]).squeeze(0)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' %
          (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2)))

    corr_num = 0
    for item in label_np:
        corr_num += item.max()
    corr = corr_num / label_np.sum()
    print(corr)
    np.save('./model/MNIST/feature/' + str(feat.size(1)) + '_' + '.npy',
            label_np)
Example #9
    def gmm_kmeans_cluster(self, dataloader):
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        self.eval()
        data = []
        Y = []
        for batch_idx, (inputs, y) in enumerate(dataloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            _, _, _, mu, _ = self.forward(inputs)
            data.append(mu.data.cpu().numpy())
            Y.append(y.numpy())
        data = np.concatenate(data)
        Y = np.concatenate(Y)
        gmm = GaussianMixture(n_components=self.n_centroids,
                              covariance_type='full')
        gmm.fit(data)
        y_pred_gmm = gmm.predict(data)
        acc = np.round(metrics.acc(Y, y_pred_gmm), 5)
        nmi = np.round(metrics.nmi(Y, y_pred_gmm), 5)
        ari = np.round(metrics.ari(Y, y_pred_gmm), 5)
        print(
            'GMM fit of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f'
            % (acc, nmi, ari))

        km = KMeans(n_clusters=self.n_centroids, n_init=20)
        y_pred_kmeans = km.fit_predict(data)
        acc = np.round(metrics.acc(Y, y_pred_kmeans), 5)
        nmi = np.round(metrics.nmi(Y, y_pred_kmeans), 5)
        ari = np.round(metrics.ari(Y, y_pred_kmeans), 5)
        print(
            'Kmeans clustering of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f'
            % (acc, nmi, ari))
Example #10
def train_feature(net1, train_data):
    #
    if torch.cuda.is_available():
        net1 = torch.nn.DataParallel(net1, device_ids=[0])
        net1 = net1.cuda()
    #
    for i_dir in range(classnum):
        if not os.path.isdir('./data/' + str(i_dir)):
            os.makedirs('./data/' + str(i_dir))
    label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10)

    #
    label2 = []
    idx2 = []
    for im, label in tqdm(train_data, desc="Processing train data: "):
        # print(label)
        im = im.cuda()
        _, feat = net1(im)
        for i in range(feat.size(0)):
            distance = feat[i].cpu().numpy().tolist()
            idx = distance.index(max(distance))
            save_image(
                inver_transform2(im[i]), './data/' + str(idx) + '/' +
                str(random.randint(1, 10000000)) + '.png')
            # MATRIX
            label_np[idx][label[i].item()] += 1
            #
            label2.append(idx)
        label1 = label.numpy()
        for i in label1:
            idx2.append(i)

    t2 = np.array(idx2)
    t1 = np.array(label2)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' %
          (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2)))
    ##############################
    np.save(File + str(feat.size(1)) + '_' + '.npy', label_np)
Example #11
def run_net(data, params):
    #
    # UNPACK DATA
    #

    x_train_unlabeled, y_train_unlabeled, x_val, y_val, x_test, y_test = data[
        'spectral']['train_and_test']

    print(params['input_shape'])
    inputs_vae = Input(shape=params['input_shape'], name='inputs_vae')
    ConvAE = Conv.ConvAE(inputs_vae, params)
    try:
        ConvAE.vae.load_weights('vae_mnist.h5')
    except OSError:
        print('No pretrained weights available...')

    lh = LearningHandler(lr=params['spec_lr'],
                         drop=params['spec_drop'],
                         lr_tensor=ConvAE.learning_rate,
                         patience=params['spec_patience'])

    lh.on_train_begin()

    n_epochs = 5000
    losses_vae = np.empty((n_epochs, ))
    homo_plot = np.empty((n_epochs, ))
    nmi_plot = np.empty((n_epochs, ))
    ari_plot = np.empty((n_epochs, ))

    y_val = np.squeeze(np.asarray(y_val).ravel())  # squeeze into 1D array

    start_time = time.time()
    for i in range(n_epochs):
        # if i==0:
        x_recon, _, x_val_y = ConvAE.vae.predict(x_val)
        losses_vae[i] = ConvAE.train_vae(x_val, x_val_y, params['batch_size'])
        #x_val_y = ConvAE.vae.predict(x_val)[2]
        #y_sp = x_val_y.argmax(axis=1)
        #print_accuracy(y_sp, y_val, params['n_clusters'])
        print("Epoch: {}, loss={:2f}".format(i, losses_vae[i]))

        os.makedirs('vae', exist_ok=True)
        os.makedirs('vae_umap', exist_ok=True)

        fig, axs = plt.subplots(3, 4, figsize=(25, 18))
        fig.subplots_adjust(wspace=0.25)

        embedding = ConvAE.encoder.predict(x_val)
        kmeans = KMeans(n_clusters=params['n_clusters'], n_init=30)
        predicted_labels = kmeans.fit_predict(
            embedding)  # cluster on current embeddings for metric eval
        _, confusion_matrix = get_y_preds(predicted_labels, y_val,
                                          params['n_clusters'])

        homo_plot[i] = metrics.acc(y_val, predicted_labels)
        nmi_plot[i] = metrics.nmi(y_val, predicted_labels)
        ari_plot[i] = metrics.ari(y_val, predicted_labels)

        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        Z_tsne = tsne.fit_transform(embedding)
        sc = axs[1][0].scatter(Z_tsne[:, 0],
                               Z_tsne[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[1][0].set_title('t-SNE Embeddings')
        axs[1][0].set_xlabel('t-SNE 1')
        axs[1][0].set_ylabel('t-SNE 2')
        axs[1][0].set_xticks([])
        axs[1][0].set_yticks([])
        axs[1][0].spines['right'].set_visible(False)
        axs[1][0].spines['top'].set_visible(False)
        divider = make_axes_locatable(axs[1][0])
        cax = divider.append_axes('right', size='15%', pad=0.05)
        cbar = fig.colorbar(sc,
                            cax=cax,
                            orientation='vertical',
                            ticks=range(params['n_clusters']))
        cbar.ax.set_yticklabels(
            params['cluster_names'])  # vertically oriented colorbar
        # Create offset transform by 5 points in x direction
        dx = 0 / 72.
        dy = -5 / 72.
        offset = matplotlib.transforms.ScaledTranslation(
            dx, dy, fig.dpi_scale_trans)

        # apply offset transform to all cluster ticklabels.
        for label in cbar.ax.yaxis.get_majorticklabels():
            label.set_transform(label.get_transform() + offset)

        reducer = umap.UMAP(transform_seed=36, random_state=36)
        matrix_reduce = reducer.fit_transform(embedding)
        sc = axs[1][1].scatter(matrix_reduce[:, 0],
                               matrix_reduce[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[1][1].set_title('UMAP Embeddings')
        axs[1][1].set_xlabel('UMAP 1')
        axs[1][1].set_ylabel('UMAP 2')
        axs[1][1].set_xticks([])
        axs[1][1].set_yticks([])
        # Hide the right and top spines
        axs[1][1].spines['right'].set_visible(False)
        axs[1][1].spines['top'].set_visible(False)

        im = axs[1][2].imshow(confusion_matrix, cmap='YlOrRd')
        axs[1][2].set_title('Confusion Matrix')
        axs[1][2].set_xticks(range(params['n_clusters']))
        axs[1][2].set_yticks(range(params['n_clusters']))
        axs[1][2].set_xticklabels(params['cluster_names'], fontsize=8)
        axs[1][2].set_yticklabels(params['cluster_names'], fontsize=8)
        divider = make_axes_locatable(axs[1][2])
        cax = divider.append_axes('right', size='10%', pad=0.05)
        cbar = fig.colorbar(im, cax=cax, orientation='vertical', ticks=[])

        axs[0][0].plot(losses_vae[:i + 1])
        axs[0][0].set_title('VAE Loss')
        axs[0][0].set_xlabel('epochs')

        axs[0][1].plot(homo_plot[:i + 1])
        axs[0][1].set_title('Homogeneity')
        axs[0][1].set_xlabel('epochs')
        axs[0][1].set_ylim(0, 1)

        axs[0][2].plot(ari_plot[:i + 1])
        axs[0][2].set_title('ARI')
        axs[0][2].set_xlabel('epochs')
        axs[0][2].set_ylim(0, 1)

        axs[0][3].plot(nmi_plot[:i + 1])
        axs[0][3].set_title('NMI')
        axs[0][3].set_xlabel('epochs')
        axs[0][3].set_ylim(0, 1)

        #reconstructed_cell = ConvAE.vae.predict(x_val[:1, ...])[0, ..., 0]
        cell_tile = x_val[0, ..., 0]
        cell_tile = cell_tile[:, :64]
        x_recon = x_recon[0, ..., 0]
        reconstructed_cell_tile = x_recon[:, :64]
        reconstructed_cell_tile = np.flipud(reconstructed_cell_tile)
        cell_heatmap = np.vstack((cell_tile, reconstructed_cell_tile))
        axs[1][3].imshow(cell_heatmap, cmap='Reds')
        axs[1][3].set_xticks([])
        axs[1][3].set_yticks([])
        axs[1][3].spines['right'].set_visible(False)
        axs[1][3].spines['top'].set_visible(False)
        axs[1][3].spines['left'].set_visible(False)
        axs[1][3].spines['bottom'].set_visible(False)

        # get eigenvalues and eigenvectors
        scale = get_scale(embedding, params['batch_size'], params['scale_nbr'])
        values, vectors = spectral_clustering(embedding, scale,
                                              params['n_nbrs'],
                                              params['affinity'])

        # sort, then store the top n_clusters=2
        values_idx = np.argsort(values)
        x_spectral_clustering = vectors[:, values_idx[:params['n_clusters']]]

        # do kmeans clustering in this subspace
        y_spectral_clustering = KMeans(
            n_clusters=params['n_clusters']).fit_predict(
                vectors[:, values_idx[:params['n_clusters']]])

        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        Z_tsne = tsne.fit_transform(x_spectral_clustering)
        sc = axs[2][0].scatter(Z_tsne[:, 0],
                               Z_tsne[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[2][0].set_title('Spectral Clusters (t-SNE) True Labels')
        axs[2][0].set_xlabel('t-SNE 1')
        axs[2][0].set_ylabel('t-SNE 2')
        axs[2][0].set_xticks([])
        axs[2][0].set_yticks([])
        axs[2][0].spines['right'].set_visible(False)
        axs[2][0].spines['top'].set_visible(False)

        reducer = umap.UMAP(transform_seed=36, random_state=36)
        matrix_reduce = reducer.fit_transform(x_spectral_clustering)
        axs[2][1].scatter(matrix_reduce[:, 0],
                          matrix_reduce[:, 1],
                          s=2,
                          c=y_spectral_clustering,
                          cmap=plt.cm.get_cmap("jet", 14))
        axs[2][1].set_title('Spectral Clusters (UMAP)')
        axs[2][1].set_xlabel('UMAP 1')
        axs[2][1].set_ylabel('UMAP 2')
        axs[2][1].set_xticks([])
        axs[2][1].set_yticks([])
        # Hide the right and top spines
        axs[2][1].spines['right'].set_visible(False)
        axs[2][1].spines['top'].set_visible(False)

        axs[2][2].scatter(matrix_reduce[:, 0],
                          matrix_reduce[:, 1],
                          s=2,
                          c=y_train_unlabeled,
                          cmap=plt.cm.get_cmap("jet", 14))
        axs[2][2].set_title('True Labels (UMAP)')
        axs[2][2].set_xlabel('UMAP 1')
        axs[2][2].set_ylabel('UMAP 2')
        axs[2][2].set_xticks([])
        axs[2][2].set_yticks([])
        # Hide the right and top spines
        axs[2][2].spines['right'].set_visible(False)
        axs[2][2].spines['top'].set_visible(False)

        axs[2][3].hist(x_spectral_clustering)
        axs[2][3].set_title("histogram of true eigenvectors")

        train_time = str(
            datetime.timedelta(seconds=(int(time.time() - start_time))))
        n_matrices = (i + 1) * params['batch_size'] * 100
        fig.suptitle('Trained on ' + '{:,}'.format(n_matrices) + ' cells\n' +
                     train_time)

        plt.savefig('vae/%d.png' % i)
        plt.close()

        plt.close()

        if i > 1:
            if np.abs(losses_vae[i] - losses_vae[i - 1]) < 0.0001:
                print('STOPPING EARLY')
                break

    print("finished training")

    plt.plot(losses_vae)
    plt.title('VAE Loss')
    plt.show()

    x_val_y = ConvAE.vae.predict(x_val)[2]
    # x_val_y = ConvAE.classfier.predict(x_val_lp)
    y_sp = x_val_y.argmax(axis=1)
    print_accuracy(y_sp, y_val, params['n_clusters'])
    from sklearn.metrics import normalized_mutual_info_score as nmi
    y_val = np.squeeze(np.asarray(y_val).ravel())  # squeeze into 1D array
    print(y_sp.shape, y_val.shape)
    nmi_score1 = nmi(y_sp, y_val)
    print('NMI: ' + str(np.round(nmi_score1, 4)))

    embedding = ConvAE.encoder.predict(x_val)
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    Z_tsne = tsne.fit_transform(embedding)
    fig = plt.figure()
    plt.scatter(Z_tsne[:, 0],
                Z_tsne[:, 1],
                s=2,
                c=y_train_unlabeled,
                cmap=plt.cm.get_cmap("jet", 14))
    plt.colorbar(ticks=range(params['n_clusters']))
    plt.show()
Example #12
def run_clustering(doc_embeddings,
                   dims,
                   batch_size=16,
                   n_epochs=1,
                   update_interval=80,
                   tol=0.001,
                   y_real=None,
                   device="cpu"):

    inputs = torch.from_numpy(doc_embeddings).to(device)
    dataset = TensorDataset(inputs)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=False)

    model = HybridModel(dims)
    enc_dec_model = {k[2:]: v for k, v in torch.load("enc_dec_model").items()}
    model.encoder.load_state_dict(enc_dec_model, strict=False)
    model.decoder.load_state_dict(enc_dec_model, strict=False)
    model = model.to(device)

    if os.path.exists("clustering_model"):
        model.load_state_dict(torch.load("clustering_model"))
        print("clustering model load from ckpt")

    model.train()

    optimizer = Adam(model.parameters(), lr=1e-3)

    criterion1 = nn.KLDivLoss(reduction="batchmean")
    criterion2 = nn.SmoothL1Loss()

    y_pred_last = np.zeros([doc_embeddings.shape[0]])

    is_end = False
    bst_model_acc = 0.0
    for epoch in range(n_epochs):
        if is_end:
            break
        batch_num = 1
        train_loss = 0.0
        for data in dataloader:

            if (batch_num - 1) % update_interval == 0:
                model.eval()
                with torch.no_grad():
                    _, q = model(inputs)
                    p = torch.Tensor(target_distribution(
                        q.cpu().numpy())).to(device)
                y_pred = q.cpu().numpy().argmax(1)

                if y_real is not None:
                    acc = np.round(metrics.acc(y_real, y_pred), 5)
                    nmi = np.round(metrics.nmi(y_real, y_pred), 5)
                    ari = np.round(metrics.ari(y_real, y_pred), 5)
                    print(
                        'Epoch %d, Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f'
                        % ((epoch + 1), batch_num, acc, nmi, ari))
                    if acc > bst_model_acc:
                        torch.save(model.state_dict(), "clustering_model")
                        bst_model_acc = acc

                # check stop criterion - model convergence
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                # print("delta_label: {}".format(delta_label))
                y_pred_last = np.copy(y_pred)
                model.train()

                if delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    is_end = True
                    break

            x_batch = data[0]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            y_hat_dec_batch, y_hat_clu_batch = model(x_batch)
            y_batch = p[((batch_num - 1) * batch_size):(batch_num *
                                                        batch_size), :]
            loss1 = 1e-1 * criterion1(torch.log(y_hat_clu_batch),
                                      y_batch)  # torch.from_numpy(y_batch))
            loss2 = criterion2(y_hat_dec_batch, x_batch)
            loss = loss1 + loss2
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            if (batch_num - 1) % update_interval == 0:
                print("kld loss: {}, mse loss: {}".format(loss1, loss2))
                print("step loss: {}".format(train_loss / update_interval))
                train_loss = 0.0
            batch_num += 1

    torch.save(model.state_dict(), "clustering_model")

    model.eval()
    with torch.no_grad():
        _, q = model(inputs)
        q = q.cpu().numpy()
    return q.argmax(1)
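A note on the loss pairing in run_clustering above: nn.KLDivLoss expects log-probabilities as its input and plain probabilities as its target, which is why torch.log is applied to the clustering head's output before computing loss1. A minimal, self-contained sketch of the same pairing (the tensors here are purely illustrative):

import torch
import torch.nn as nn

kld = nn.KLDivLoss(reduction="batchmean")
q = torch.softmax(torch.randn(4, 3), dim=1)  # soft cluster assignments (probabilities)
p = torch.softmax(torch.randn(4, 3), dim=1)  # auxiliary target distribution
loss = kld(torch.log(q), p)                  # input must be log-probabilities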
Example #13
    def fit(self,
            x,
            y=None,
            maxiter=2e4,
            batch_size=256,
            tol=1e-3,
            update_interval=140,
            save_dir='./results/temp',
            aug_cluster=False):
        print('Begin clustering:', '-' * 60)
        print('Update interval', update_interval)
        save_interval = int(maxiter)  # only save the initial and final model
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        features = self.encoder.predict(x)
        y_pred = kmeans.fit_predict(features)
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 2: deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.predict(x)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                avg_loss = loss / update_interval
                loss = 0.
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   loss=avg_loss)
                    logwriter.writerow(logdict)
                    logfile.flush()
                    print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; loss=%.5f' %
                          (ite, acc, nmi, ari, avg_loss))

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # save intermediate model
            if ite % save_interval == 0:
                print('saving model to:',
                      save_dir + '/model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/model_' + str(ite) +
                                        '.h5')

            # train on batch
            idx = index_array[index * batch_size:min((index + 1) *
                                                     batch_size, x.shape[0])]
            x_batch = self.random_transform(x[idx]) if aug_cluster else x[idx]
            loss += self.train_on_batch(x=x_batch, y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

            ite += 1

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/model_final.h5')
        self.model.save_weights(save_dir + '/model_final.h5')
        print('Clustering time: %ds' % (time() - t1))
        print('End clustering:', '-' * 60)

        return y_pred
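The target_distribution helper called in several of these examples is not shown. A minimal sketch, assuming it follows the standard DEC auxiliary distribution (square the soft assignments, normalize by cluster frequency, then renormalize each row):

import numpy as np

def target_distribution(q):
    # p_ij is proportional to q_ij^2 / f_j with f_j = sum_i q_ij;
    # each row is then renormalized to sum to one.
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T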
Example #14
    def fit(self, x_train, optimizer='adam', beta=1, y=None, epoch=500,
            batch_size=256, update_interval=5, early_stopping=20, tol=0.01):

        double_x = np.append(x_train, x_train, axis=1)
        print('Update interval', update_interval)
        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        _, encoder_out = self.autoencoder.predict(x_train)
        y_pred = kmeans.fit_predict(encoder_out)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans : acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))

        # y_pred = kmeans.fit_predict(x_train)
        y_pred_last = np.copy(y_pred)
        self.cluster_model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])

        self.cluster_model.compile(optimizer=optimizer, loss=[weighted_mse_x, weighted_mse_x, weighted_mse_y, 'kld', 'kld'],
                     loss_weights=[1, 1, args.alpha, args.gamma, args.gamma])
        # for ite in range(int(epoch)):
        #     if ite % update_interval == 0:
        #         q,_,_ = self.model.predict(x_train, verbose=0)
        #         p = self.target_distribution(q)  # update the auxiliary target distribution p
        #     y0 = np.zeros_like(x_train)
        #     self.model.fit(x=x_train, y=[p, y0, x_train], batch_size=batch_size)

        # Step 2: deep clustering
        for ite in range(int(epoch)):
            # train on batch
            if ite % update_interval == 0:
                _, q = self.predict_model.predict(double_x, verbose=0)
                p = self.target_distribution(q)  # update the auxiliary target distribution p
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                print("delta label:{}".format(delta_label))
                y_pred_last = np.copy(y_pred)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
                print(np.bincount(y_pred))
                if ite > update_interval and delta_label < tol:
                    print("Early stopping...")
                    break

            self.cluster_model.fit_generator(
                generator=batch_generator_sdne(x_train, batch_size=batch_size, shuffle=True, beta=beta),
                shuffle=False
            )

        print('training time: ', time() - t1)
        # save the trained model

        print("saving predict data...")
        _, encoder_out = self.autoencoder.predict(x_train)
        _, _, _, q, _ = self.cluster_model.predict(double_x, verbose=0)
        #k-means
        y_pred = kmeans.fit_predict(encoder_out)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans : acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
        print(np.bincount(y_pred))
        #this method
        y_pred = q.argmax(1)
        if y is not None:
            print("orginal cluster proportion: {}".format(np.bincount(y)))
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))
Example #15
    def fit(
        self,
        x,
        y=None,
        batch_size=512,  # This was 256, Castellano used 128
        maxiter=2e4,
        tol=1e-3,
        update_interval=140,  # Was 140
        cae_weights=None,
        save_dir="./results/temp",
    ):

        logger.info("Update interval {}".format(update_interval))
        save_interval = int(x.shape[0] / batch_size * 5)
        logger.info("Save interval {}".format(save_interval))

        # Step 1: pretrain if necessary
        t0 = time()
        if not self.pretrained and cae_weights is None:
            logger.info("...pretraining CAE using default hyper-parameters:")
            logger.info("   optimizer='adam';   epochs=200")
            self.pretrain(x, batch_size, save_dir=save_dir)
            self.pretrained = True
        elif cae_weights is not None:
            self.cae.load_weights(cae_weights)
            logger.info("cae_weights is loaded successfully.")

        # Step 2: initialize cluster centers using k-means
        t1 = time()
        logger.info("Initializing cluster centers with k-means.")
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name="clustering").set_weights(
            [kmeans.cluster_centers_])

        # Step 3: deep clustering
        # logging file
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile_path = save_dir + "/dcec_log.csv"
        logfile = open(logfile_path, "w")
        logwriter = csv.DictWriter(
            logfile, fieldnames=["iter", "acc", "nmi", "ari", "L", "Lc", "Lr"])
        logwriter.writeheader()

        overall_log_loss = save_dir + "/dcec_log_all.csv"
        l2 = open(overall_log_loss, "w")
        lw2 = csv.DictWriter(l2, fieldnames=["iter", "L", "Lc", "Lr"])
        lw2.writeheader()

        loss = [0, 0, 0]
        index = 0
        previous_losses = []
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                logger.info("Updating. Iter {}".format(ite))
                q, _ = self.model.predict(x, verbose=0)
                # model.predict() causes a memory leak in tf2. So, use model(). See notes above
                # q, _ = self.model(x_tf, training=False)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    logger.info("{} calculating acc".format(ite))
                    acc = np.round(metrics.acc(y, self.y_pred), 5)
                    nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                    ari = np.round(metrics.ari(y, self.y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   L=loss[0],
                                   Lc=loss[1],
                                   Lr=loss[2])
                    logwriter.writerow(logdict)
                    logger.info(
                        "Iter {}: Acc {}, nmi {}, ari {}; loss={}".format(
                            ite, acc, nmi, ari, loss))

                loss_dict = {
                    "iter": ite,
                    "L": loss[0],
                    "Lc": loss[1],
                    "Lr": loss[2]
                }
                logwriter.writerow(loss_dict)
                logger.info("iter {i}; L {L}; Lc {Lc}; Lr {Lr}".format(
                    i=ite, **loss_dict))

                logger.info("Evaluating full loss")
                loss_all = self.model.evaluate(x,
                                               y=[p, x],
                                               batch_size=batch_size,
                                               verbose=0)
                previous_losses.append(loss_all[0])
                ld = {
                    "iter": ite,
                    "L": loss_all[0],
                    "Lc": loss_all[1],
                    "Lr": loss_all[2]
                }
                logger.info(
                    "Overall loss. iter {iter}; L {L}; Lc {Lc}; Lr {Lr}".
                    format(**ld))
                lw2.writerow(ld)

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                logger.info("delta_label={}".format(delta_label))
                y_pred_last = np.copy(self.y_pred)
                if self.n_clusters > 1 and ite > 0 and delta_label < tol:
                    logger.info("delta_label {} < tol {}".format(
                        delta_label, tol))
                    logger.info(
                        "Reached tolerance threshold. Stopping training.")
                    logfile.close()
                    break
                elif self.n_clusters == 1 and len(
                        previous_losses) >= 3 and self.should_stop(
                            previous_losses):
                    logger.info(
                        "Stopping criteria reached: Last 3 losses {}".format(
                            previous_losses[-3:]))
                    break

            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size::],
                    y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size:(index + 1) * batch_size],
                    y=[
                        p[index * batch_size:(index + 1) * batch_size],
                        x[index * batch_size:(index + 1) * batch_size],
                    ],
                )
                index += 1

            loss_dict = {
                "iter": ite,
                "L": loss[0],
                "Lc": loss[1],
                "Lr": loss[2]
            }
            logwriter.writerow(loss_dict)

            if ite % 10 == 0:
                logger.info("iter={};L={};L_c={};L_r={}".format(ite, *loss))

            # save intermediate model
            if ite % save_interval == 0:
                # save DCEC model checkpoints
                logger.info(
                    "saving model to: {}".format(save_dir + "/dcec_model_" +
                                                 str(ite) + ".h5"))
                path = save_dir + "/dcec_model_" + str(ite) + ".h5"
                self.model.save_weights(path)
                gcs_copy(path)
                gcs_copy(logfile_path)
                gcs_copy(overall_log_loss)

            ite += 1

        # save the trained model
        logfile.close()
        l2.close()
        logger.info("saving model to: {}".format(save_dir +
                                                 "/dcec_model_final.h5"))
        self.model.save_weights(save_dir + "/dcec_model_final.h5")
        t3 = time()
        logger.info("Pretrain time:   {}".format(t1 - t0))
        logger.info("Clustering time: {}".format(t3 - t1))
        logger.info("Total time:      {}".format(t3 - t0))

        save_results_to_gcs(save_dir)
Example #16
    def fit(self,
            x_train,
            x_val,
            x_test,
            model_name,
            outdir,
            df_columns,
            y=None,
            epoch=500,
            batch_size=256,
            update_interval=5,
            early_stopping=20,
            tol=0.01):

        print('Update interval', update_interval)
        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        encoder_out = self.encoder.predict(x_train)
        y_pred = kmeans.fit_predict(encoder_out)
        # y_pred = kmeans.fit_predict(x_train)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans: acc = %.5f, nmi = %.5f, ari = %.5f' %
                  (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # for ite in range(int(epoch)):
        #     if ite % update_interval == 0:
        #         q,_,_ = self.model.predict(x_train, verbose=0)
        #         p = self.target_distribution(q)  # update the auxiliary target distribution p
        #     y0 = np.zeros_like(x_train)
        #     self.model.fit(x=x_train, y=[p, y0, x_train], batch_size=batch_size)

        # Step 2: deep clustering
        index = 0
        index_array_train = np.arange(x_train.shape[0])
        index_array_val = np.arange(x_val.shape[0])
        cost_val = []
        cost_train = []
        for ite in range(int(epoch)):
            if ite % update_interval == 0:
                q, _, _ = self.model.predict(x_train, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                print("delta label:{}".format(delta_label))
                y_pred_last = np.copy(y_pred)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    print('acc = %.5f, nmi = %.5f, ari = %.5f' %
                          (acc, nmi, ari))
                print(np.bincount(y_pred))
                if ite > update_interval and delta_label < tol:
                    # and np.mean(cost_val[-(early_stopping + 1):-1]) > \
                    # np.mean(cost_val[-(early_stopping*2 + 1):-(early_stopping + 1)])\
                    # and np.mean(cost_train[-(early_stopping + 1):-1]) < \
                    # np.mean(cost_train[-(early_stopping*2 + 1):-(early_stopping + 1)]):
                    print("Early stopping...")
                    break

            # train on batch
            tot_train_loss = 0.
            tot_sparse_loss = 0.
            tot_mse_loss = 0.
            tot_cluster_loss = 0.
            while True:
                if index == 0:
                    np.random.shuffle(index_array_train)
                idx = index_array_train[index * batch_size:min(
                    (index + 1) * batch_size, x_train.shape[0])]
                y0 = np.zeros_like(x_train[idx])
                # cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch(x=x_train[idx], y=[p[idx], y0, x_train[idx]])
                loss, cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch(
                    x=x_train[idx], y=[p[idx], y0, x_train[idx]])
                index = index + 1 if (
                    index + 2) * batch_size <= x_train.shape[0] else 0
                tot_train_loss += loss * len(idx)
                tot_cluster_loss += cluster_loss * len(idx)
                tot_mse_loss += mse_loss * len(idx)
                tot_sparse_loss += sparse_loss * len(idx)
                if index == 0:
                    break
            avg_train_loss = tot_train_loss / x_train.shape[0]
            avg_cluster_loss = tot_cluster_loss / x_train.shape[0]
            avg_mse_loss = tot_mse_loss / x_train.shape[0]
            avg_sparse_loss = tot_sparse_loss / x_train.shape[0]
            print(
                "epoch {}th train, train_loss :{:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n"
                .format(ite + 1, avg_train_loss, avg_cluster_loss,
                        avg_mse_loss, avg_sparse_loss))
            cost_train.append(avg_train_loss)
            #
            # tot_val_loss = 0.
            # tot_sparse_loss = 0.
            # tot_mse_loss = 0.
            # tot_cluster_loss = 0.
            # while True:
            #     if index == 0:
            #         np.random.shuffle(index_array_val)
            #     idx = index_array_val[index * batch_size: min((index+1) * batch_size, x_val.shape[0])]
            #     y0 = np.zeros_like(x_val[idx])
            #     loss, cluster_loss, sparse_loss, mse_loss = self.model.test_on_batch(x=x_val[idx], y=[p[idx], y0, x_val[idx]])
            #     index = index + 1 if (index + 2) * batch_size <= x_val.shape[0] else 0
            #     tot_cluster_loss += cluster_loss *len(idx)
            #     tot_mse_loss += mse_loss *len(idx)
            #     tot_sparse_loss += sparse_loss *len(idx)
            #     tot_val_loss += loss * len(idx)
            #     if index==0:
            #         break
            # avg_val_loss = tot_val_loss / x_val.shape[0]
            # avg_cluster_loss = tot_cluster_loss / x_val.shape[0]
            # avg_mse_loss = tot_mse_loss / x_val.shape[0]
            # avg_sparse_loss = tot_sparse_loss / x_val.shape[0]
            # print("epoch {}th validate, loss: {:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n".format(ite + 1,
            #                                                                                      avg_val_loss, avg_cluster_loss,
            #                                                                                      avg_mse_loss,
            #                                                                                      avg_sparse_loss))
            # cost_val.append(avg_val_loss)

        print('training time: ', time() - t1)
        # save the trained model

        print("saving predict data...")
        encoder_out = self.encoder.predict(x_test)
        q, decoder_out, _ = self.model.predict(x_test)
        y_pred = q.argmax(1)
        if y is not None:
            print("orginal cluster proportion: {}".format(np.bincount(y)))
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))

        y_pred = kmeans.fit_predict(encoder_out)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans : acc = %.5f, nmi = %.5f, ari = %.5f' %
                  (acc, nmi, ari))
        print(np.bincount(y_pred))

        decoder_sub = decoder_out + x_test
        df = pd.DataFrame(decoder_out, columns=df_columns)
        df_replace = pd.DataFrame(decoder_sub, columns=df_columns)

        outDir = os.path.join(outdir, model_name)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        outPath = os.path.join(outDir,
                               "{}.{}.complete".format(model_name, ite))

        df.to_csv(outPath, index=None, float_format='%.4f')
        df_replace.to_csv(outPath.replace(".complete", ".complete.sub"),
                          index=None,
                          float_format='%.4f')
        pd.DataFrame(encoder_out).to_csv(outPath.replace(
            ".complete", ".encoder.out"),
                                         float_format='%.4f')
        print("saving done!")
Example #17
File: DCEC.py Project: ryansar/DCEC
    def fit(self,
            x,
            y=None,
            batch_size=256,
            maxiter=2e4,
            tol=1e-3,
            update_interval=140,
            cae_weights=None,
            save_dir='./results/temp'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size * 5)
        print('Save interval', save_interval)

        # Step 1: pretrain if necessary
        t0 = time()
        if not self.pretrained and cae_weights is None:
            print('...pretraining CAE using default hyper-parameters:')
            print('   optimizer=\'adam\';   epochs=200')
            self.pretrain(x, batch_size, save_dir=save_dir)
            self.pretrained = True
        elif cae_weights is not None:
            self.cae.load_weights(cae_weights)
            print('cae_weights is loaded successfully.')

        # Step 2: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 3: deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/dcec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        t2 = time()
        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, self.y_pred), 5)
                    nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                    ari = np.round(metrics.ari(y, self.y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   L=loss[0],
                                   Lc=loss[1],
                                   Lr=loss[2])
                    logwriter.writerow(logdict)
                    print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari',
                          ari, '; loss=', loss)

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size::],
                    y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size:(index + 1) * batch_size],
                    y=[
                        p[index * batch_size:(index + 1) * batch_size],
                        x[index * batch_size:(index + 1) * batch_size]
                    ])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                # save DCEC model checkpoints
                print('saving model to:',
                      save_dir + '/dcec_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/dcec_model_' + str(ite) +
                                        '.h5')

            ite += 1

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/dcec_model_final.h5')
        self.model.save_weights(save_dir + '/dcec_model_final.h5')
        t3 = time()
        print('Pretrain time:  ', t1 - t0)
        print('Clustering time:', t3 - t1)
        print('Total time:     ', t3 - t0)
Example #18
    def fit(self,
            trainloader,
            model_name,
            save_inter=200,
            lr=0.001,
            batch_size=128,
            num_epochs=10,
            visualize=False,
            anneal=False):
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      self.parameters()),
                               lr=lr)

        # # validate
        # self.eval()
        # valid_loss = 0.0
        # for batch_idx, (inputs, _) in enumerate(validloader):
        #     inputs = inputs.view(inputs.size(0), -1).float()
        #     if use_cuda:
        #         inputs = inputs.cuda()
        #     inputs = Variable(inputs)
        #     z, outputs, mu, logvar = self.forward(inputs)
        #
        #     loss = self.loss_function(outputs, inputs, z, mu, logvar)
        #     valid_loss += loss.data[0]*len(inputs)
        #     # total_loss += valid_recon_loss.data[0] * inputs.size()[0]
        #     # total_num += inputs.size()[0]
        #
        # # valid_loss = total_loss / total_num
        # print("#Epoch -1: Valid Loss: %.5f" % (valid_loss / len(validloader.dataset)))
        import csv
        logfile = open('logs/' + model_name + 'cluster_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['epoch', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        for epoch in range(num_epochs):
            # train 1 epoch
            self.train()
            if anneal:
                epoch_lr = adjust_learning_rate(lr, optimizer, epoch)
            train_loss = 0.0
            for batch_idx, (inputs, _) in enumerate(trainloader):
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                optimizer.zero_grad()
                inputs = Variable(inputs)

                z, outputs, out_logvar, mu, logvar = self.forward(inputs)
                _, loss = self.loss_function(outputs, out_logvar, inputs, z,
                                             mu, logvar)
                train_loss += loss.data[0] * len(inputs)
                loss.backward()
                optimizer.step()

            # validate
            if epoch % save_inter == 0:
                self.eval()
                valid_loss = 0.0
                total_num = 0
                Y = []
                Y_pred = []
                for batch_idx, (inputs, labels) in enumerate(trainloader):
                    inputs = inputs.view(inputs.size(0), -1).float()
                    if use_cuda:
                        inputs = inputs.cuda()
                    inputs = Variable(inputs)
                    z, outputs, out_logvar, mu, logvar = self.forward(inputs)

                    # loss = self.loss_function(outputs, inputs, z, mu, logvar)
                    # valid_loss += loss.data[0]*len(inputs)
                    # total_loss += valid_recon_loss.data[0] * inputs.size()[0]
                    # total_num += inputs.size()[0]
                    # gamma = self.get_gamma(z, mu, logvar).data.cpu().numpy()
                    gamma, loss = self.loss_function(outputs, out_logvar,
                                                     inputs, z, mu, logvar)
                    valid_loss += loss.data[0] * len(inputs)
                    total_num += len(inputs)
                    Y.append(labels.numpy())
                    Y_pred.append(np.argmax(gamma.data.cpu().numpy(), axis=1))

                valid_loss = valid_loss / total_num
                Y = np.concatenate(Y)
                Y_pred = np.concatenate(Y_pred)
                # valid_loss = total_loss / total_num

                acc = np.round(metrics.acc(Y, Y_pred), 5)
                nmi = np.round(metrics.nmi(Y, Y_pred), 5)
                ari = np.round(metrics.ari(Y, Y_pred), 5)
                loss = np.round(valid_loss, 5)
                logdict = dict(epoch=epoch,
                               acc=acc,
                               nmi=nmi,
                               ari=ari,
                               loss=loss)
                logwriter.writerow(logdict)
                print(
                    'Epoch %d: acc = %.5f, nmi = %.5f, ari = %.5f' %
                    (epoch, acc, nmi, ari), ' ; loss=', loss)

        logfile.close()
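
The `anneal` branch above calls an adjust_learning_rate helper that is not shown in this snippet. A minimal sketch of a step-decay version is below; the decay factor, schedule, and exact signature are assumptions for illustration, not the project's actual helper.

def adjust_learning_rate(init_lr, optimizer, epoch, decay=0.9, every=10):
    # Assumed step decay: multiply the base rate by `decay` every `every`
    # epochs and push the new value into every parameter group.
    lr = init_lr * (decay ** (epoch // every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr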
Example #19
0
    def cluster(self, maxiter=2e4, update_interval=8192, tol=1e-3):
        """
    Retrieve embeddings from the pretrained RNN model. Then, run clustering over
    the embedding representation
    """
        x = self.dataset.data
        y = self.dataset.labels
        x_aug = self.dataset.aumgented_data
        adam = tf.keras.optimizers.Adam(lr=self.LEARNING_RATE, amsgrad=True)
        self.model.compile(optimizer=adam,
                           loss=['kld', 'mse'],
                           loss_weights=[self.lambda_loss, 1.0])

        save_interval = int(maxiter)
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        features = self.extract_features(x, x_aug)
        y_pred = kmeans.fit_predict(features)
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        logfile = open(self.MODEL_DIR + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        for ite in range(int(maxiter)):
            q = self.predict(x, x_aug)
            # recompute the auxiliary target distribution roughly once per epoch
            iters_per_epoch = x.shape[0] // self.BATCH_SIZE
            if ite % iters_per_epoch == 0:
                p = self.target_distribution(q)

            y_pred = q.argmax(1)
            avg_loss = loss / update_interval
            loss = 0.
            if y is not None:
                acc = np.round(metrics.acc(y, y_pred), 5)
                nmi = np.round(metrics.nmi(y, y_pred), 5)
                ari = np.round(metrics.ari(y, y_pred), 5)
                logdict = dict(iter=ite,
                               acc=acc,
                               nmi=nmi,
                               ari=ari,
                               loss=avg_loss)
                logwriter.writerow(logdict)
                logfile.flush()
                print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; loss=%.5f' %
                      (ite, acc, nmi, ari, avg_loss))

            delta_label = np.sum(y_pred != y_pred_last).astype(
                np.float32) / y_pred.shape[0]
            y_pred_last = np.copy(y_pred)
            # check stop criterion using the tol argument, as in the other examples
            if ite > 0 and delta_label < tol:
                print('delta_label ', delta_label, '< tol ', tol)
                print('Reached tolerance threshold. Stopping training.')
                break

            idx = np.random.randint(0, x.shape[0], self.BATCH_SIZE)
            ones = np.ones(idx.shape[0], dtype=np.float32)
            loss += self.train_on_batch(x=x[idx],
                                        y=p[idx],
                                        a=x_aug[idx],
                                        r=ones)

        logfile.close()
        print('saving model to:', self.MODEL_DIR + '/model_final.h5')
        self.model.save_weights(self.MODEL_DIR + '/model_final.h5')

        return y_pred
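
The cluster loop above (and several later examples) compiles the model with a 'kld' loss against p = self.target_distribution(q). For reference, a minimal NumPy sketch of the DEC-style auxiliary target distribution, assuming q holds soft assignments of shape (n_samples, n_clusters); the project's own method may differ slightly.

import numpy as np

def target_distribution(q):
    # Square the soft assignments and normalize by cluster frequency,
    # then renormalize each row; this sharpens confident assignments
    # and discourages large clusters from dominating.
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T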
Example #20
0
    # define the model
    model = CAE(input_shape=x.shape[1:], filters=[32, 64, 128, 10])
    plot_model(model, to_file=args.save_dir + '/%s-pretrain-model.png' % args.dataset, show_shapes=True)
    model.summary()

    # compile the model and callbacks
    optimizer = 'adam'
    model.compile(optimizer=optimizer, loss='mse')
    from keras.callbacks import CSVLogger
    csv_logger = CSVLogger(args.save_dir + '/%s-pretrain-log.csv' % args.dataset)

    # begin training
    t0 = time()
    model.fit(x, x, batch_size=args.batch_size, epochs=args.epochs, callbacks=[csv_logger])
    print('Training time: ', time() - t0)
    model.save(args.save_dir + '/%s-pretrain-model-%d.h5' % (args.dataset, args.epochs))

    # extract features
    feature_model = Model(inputs=model.input, outputs=model.get_layer(name='embedding').output)
    features = feature_model.predict(x)
    print('feature shape=', features.shape)

    # use features for clustering
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=args.n_clusters)

    features = np.reshape(features, newshape=(features.shape[0], -1))
    pred = km.fit_predict(features)
    import metrics
    print('acc=', metrics.acc(y, pred), 'nmi=', metrics.nmi(y, pred), 'ari=', metrics.ari(y, pred))
Example #21
0
    def train_aci_ae(self, x, y=None, maxiter=120e3, batch_size=256, validate_interval=2800, save_interval=2800, save_dir=PATH_RESULT, verbose=1, aug_train=True):
        print('Begin aci_ae training: ', '-' * 60)
        
        #Prepare log file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/' + self.dataset + '/pretrain/train_aci_ae_log.csv', 'w')
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'loss_aci_ae', 'loss_disc'])
        logwriter.writeheader()
       
        #Initialization
        t0 = time()
        loss_aci_ae = 0
        loss_disc = 0
        index = 0
        index_array = np.arange(x.shape[0])
        
        #Training loop
        for ite in range(int(maxiter)):
            #Validation interval
            if ite % validate_interval == 0: 
                if y is not None and verbose > 0: 
                    avg_loss_aci_ae = loss_aci_ae / validate_interval 
                    avg_loss_disc = loss_disc / validate_interval   
                    loss_aci_ae = 0. 
                    loss_disc = 0.            
                    features = self.predict_encoder(x)
                    km = KMeans(n_clusters=self.n_clusters, n_init=20)
                    y_pred = km.fit_predict(features)
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    print('Iter %d: acc=%.5f, nmi=%.5f, loss_aci_ae=%.5f, loss_disc=%.5f' % (ite, acc, nmi, avg_loss_aci_ae, avg_loss_disc))
                    logdict = dict(iter=ite, acc=acc, nmi=nmi, loss_aci_ae=avg_loss_aci_ae, loss_disc=avg_loss_disc)
                    logwriter.writerow(logdict)
                    logfile.flush()
            
            #Save interval
            if ite % save_interval == 0:
                self.ae.save_weights(save_dir + '/' + self.dataset + '/pretrain/ae_weights.h5')
                self.critic.save_weights(save_dir + '/' + self.dataset + '/pretrain/critic_weights.h5')

            #Train on batch
            idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]
            x_batch = x[idx]
            np.random.shuffle(x_batch)
            x_batch = self.random_transform(x_batch, ws=self.ws, hs=self.hs, rot=self.rot, scale=self.scale) if aug_train else x_batch
            alpha_interp = np.random.uniform(low=0.0, high=1.0, size=[x_batch.shape[0]])
            beta_interp = np.random.uniform(low=0.0, high=1.0, size=None)
            beta_interp = 0.5 - np.abs(beta_interp - 0.5)
            
            x_batch_recons = self.predict_ae(x_batch)
            x_batch_recons_interp = self.predict_i_ae(x_batch, alpha_interp)
            x_batch_mix = np.multiply(beta_interp, x_batch) + np.multiply(1 - beta_interp, x_batch_recons)

            loss1 = self.train_on_batch_aci_ae(x_batch, alpha_interp)
            loss2 = self.train_on_batch_disc(x_batch_recons_interp, x_batch_mix, alpha_interp, np.zeros((x_batch.shape[0],)))

            loss_aci_ae = loss_aci_ae + loss1
            loss_disc = loss_disc + loss2
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        logfile.close()
        print('training time: ', time() - t0)
        self.ae.save_weights(save_dir + '/' + self.dataset + '/pretrain/ae_weights.h5')
        self.critic.save_weights(save_dir + '/' + self.dataset + '/pretrain/critic_weights.h5')
        print('trained weights are saved to %s/%s/pretrain/ae_weights.h5' % (save_dir, self.dataset))
        print('trained weights are saved to %s/%s/pretrain/critic_weights.h5' % (save_dir, self.dataset))
        print('training: ', '-' * 60)
Example #22
0
    def train_dynAE(self, x, y=None, kappa=3, n_clusters=10, maxiter=1e5, batch_size=256, tol=1e-2, validate_interval=140, show_interval=None, save_interval=2800, save_dir=PATH_RESULT, aug_train=True):
        #init
        number_of_samples = x.shape[0]
        img_h = int(math.sqrt(x.shape[1]))
        img_w = int(math.sqrt(x.shape[1]))

        #logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/' + self.dataset + '/cluster/train_dynAE_gamma=' + str(self.gamma) + '_log.csv', 'w')
        #logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'acc_unconf', 'nmi_unconf', 'acc_conf', 'nmi_conf', 'nb_unconf', 'nb_conf', 'fr', 'fd', 'loss'])
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'acc_unconf', 'nmi_unconf', 'acc_conf', 'nmi_conf', 'nb_unconf', 'nb_conf', 'loss'])
        logwriter.writeheader()

        #intervals config
        print('Begin clustering:', '-' * 60)
        if save_interval is None: 
            save_interval = int(maxiter)  # only save the initial and final model
        print('Save interval ', save_interval)
        if show_interval is None:
            show_interval = int(np.ceil(number_of_samples/batch_size))*20
        print('show interval ', show_interval)  

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        centers_emb, centers_img, y_pred, _ = self.generate_centers(x, n_clusters)

        # Step 2: beta1 and beta2
        beta1, beta2 = self.generate_beta(kappa, n_clusters)

        # Step 3: deep clustering
        loss = 0
        index = 0
        nb_conf_prev = x.shape[0]
        index_array = np.arange(x.shape[0])
        delta_kappa = 0.3 * kappa
        sess = tf.keras.backend.get_session()
        for ite in range(int(maxiter)):
            if ite % validate_interval == 0:
                x_emb = self.encoder.predict(x)
                q = q_mat(x_emb, centers_emb)
                y_pred = q.argmax(1) 
                avg_loss = loss / validate_interval
                loss = 0.
                if ite > 0:
                    nb_conf_prev = nb_conf 
                nb_unconf, nb_conf = self.compute_nb_conflicted_data(x, centers_emb, beta1, beta2)
                #update centers
                if nb_conf >= nb_conf_prev:
                    centers_emb, centers_img, _, _ = self.generate_centers(x, n_clusters)
                    print("update centers")
                    beta1 = beta1 - (delta_kappa / n_clusters)
                    beta2 = beta2 - (delta_kappa / n_clusters)
                    delta_kappa = 0.3 * kappa
                    kappa = delta_kappa
                    print("update confidences")

                if y is not None:
                    y_mapped = map_vector_to_clusters(y, y_pred)
                    x_emb = self.predict_encoder(x)
                    y_encoder, y_autoencoder = generate_supervisory_signals(x_emb, x, centers_emb, centers_img, beta1, beta2)
                    y_encoder_true = centers_emb[y_mapped]
                    #grad_loss_dynAE = sess.run(self.grad_loss_dynAE, feed_dict={'input_dynAE:0': x, 'target1_dynAE:0': y_encoder, 'target2_dynAE:0': y_autoencoder})
                    #grad_loss_pseudo_supervised = sess.run(self.grad_loss_pseudo_supervised, feed_dict={'input_dynAE:0': x, 'target1_dynAE:0': y_encoder})
                    #grad_loss_self_supervised = sess.run(self.grad_loss_self_supervised, feed_dict={'input_dynAE:0': x, 'target2_dynAE:0': y_autoencoder})
                    #grad_loss_supervised = sess.run(self.grad_loss_supervised, feed_dict={'input_dynAE:0': x, 'target3_dynAE:0': y_encoder_true})
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5) 
                    #fr = np.round(metrics.cos_grad(grad_loss_supervised, grad_loss_dynAE), 5)
                    #fd = np.round(metrics.cos_grad(grad_loss_self_supervised, grad_loss_pseudo_supervised), 5)
                    acc_unconf, nmi_unconf, acc_conf, nmi_conf = self.compute_acc_and_nmi_conflicted_data(x, y, centers_emb, beta1, beta2)
                    #logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, acc_unconf=acc_unconf, nmi_unconf=nmi_unconf, acc_conf=acc_conf, nmi_conf=nmi_conf, nb_unconf=nb_unconf, nb_conf=nb_conf, fr=fr, fd=fd, loss=avg_loss)
                    logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, acc_unconf=acc_unconf, nmi_unconf=nmi_unconf, acc_conf=acc_conf, nmi_conf=nmi_conf, nb_unconf=nb_unconf, nb_conf=nb_conf, loss=avg_loss)
                    logwriter.writerow(logdict)
                    logfile.flush()
                    #print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f, acc_unconf=%.5f, nmi_unconf=%.5f, acc_conf=%.5f, nmi_conf=%.5f, nb_unconf=%d, nb_conf=%d, fr=%.5f, fd=%.5f, loss=%.5f' % (ite, acc, nmi, ari, acc_unconf, nmi_unconf, acc_conf, nmi_conf, nb_unconf, nb_conf, fr, fd, avg_loss))
                    print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f, acc_unconf=%.5f, nmi_unconf=%.5f, acc_conf=%.5f, nmi_conf=%.5f, nb_unconf=%d, nb_conf=%d, loss=%.5f' % (ite, acc, nmi, ari, acc_unconf, nmi_unconf, acc_conf, nmi_conf, nb_unconf, nb_conf, avg_loss))
                    print("The number of unconflicted data points is : " + str(nb_unconf))
                    print("The number of conflicted data points is : " + str(nb_conf))
                if (nb_conf / x.shape[0]) < tol:
                    logfile.close()
                    break

            if ite % show_interval == 0 and ite!=0:
                print("")
                print("----------------------------------------------------------------------------------")
                print("Centroids : ")
                print("----------------------------------------------------------------------------------")
                draw_centers(n_clusters, centers_img, img_h=img_h, img_w=img_w)
                
            # save intermediate model
            if ite % save_interval == 0:
                print("")
                print("----------------------------------------------------------------------------------")
                print("Save embeddings for visualization : ")
                print("----------------------------------------------------------------------------------")
                z = self.predict_encoder(x)
                q1 = q_mat(z, centers_emb)
                y1_pred = q1.argmax(1)

                pca = PCA(n_components=2).fit(z)
                z_2d = pca.transform(z)
                centers_2d = pca.transform(centers_emb)

                # save states for visualization
                np.save(self.visualisation_dir + '/embeddings/' + self.dataset + '/vis_' + str(ite) + '.npy', {'z_2d': z_2d, 'centers_2d': centers_2d, 'y_pred': y1_pred})

                print('saving model to: ', save_dir + '/' + self.dataset + '/cluster/ae_' + str(ite) + '_weights.h5')
                self.ae.save_weights(save_dir + '/' + self.dataset + '/cluster/ae_' + str(ite) + '_weights.h5')

            # train on batch
            idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]

            X_img = x[idx]
            X_emb = self.predict_encoder(X_img)

            Y_encoder, Y_autoencoder = generate_supervisory_signals(X_emb, X_img, centers_emb, centers_img, beta1, beta2)
            X_transformed = self.random_transform(X_img, ws=self.ws, hs=self.hs, rot=self.rot, scale=self.scale) if aug_train else X_img

            losses = self.train_on_batch_dynAE(X_transformed, Y_encoder, Y_autoencoder)
            loss = loss + losses
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/' + self.dataset + '/cluster/ae_weights.h5')
        self.ae.save_weights(save_dir + '/' + self.dataset + '/cluster/ae_weights.h5')
        print('Clustering time: %ds' % (time() - t1))
        print('End clustering:', '-' * 60)

        return y_pred
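
The q_mat helper used above to turn embeddings and centers into soft assignments is not defined in this snippet. A plausible standalone sketch, assuming the usual Student's t kernel from DEC-style clustering layers; the actual DynAE implementation may differ.

import numpy as np

def q_mat(z, centers, alpha=1.0):
    # Squared Euclidean distance between every embedding z_i and center mu_j.
    dist = np.sum((z[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    # Student's t kernel followed by row normalization, so each sample's
    # assignments over the clusters sum to one.
    q = (1.0 + dist / alpha) ** (-(alpha + 1.0) / 2.0)
    return q / q.sum(axis=1, keepdims=True)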
Example #23
0
 def test_ari(self):
     ari = metrics.ari(self.true, self.pred)
     self.assertAlmostEqual(ari, 0.137880987)
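
Every example in this collection scores clusterings with metrics.acc, metrics.nmi and metrics.ari. A minimal sketch of such a metrics module is below, assuming nmi and ari wrap scikit-learn and acc is the usual Hungarian-matched clustering accuracy; the project's actual module may differ in details.

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def nmi(y_true, y_pred):
    return normalized_mutual_info_score(y_true, y_pred)

def ari(y_true, y_pred):
    return adjusted_rand_score(y_true, y_pred)

def acc(y_true, y_pred):
    # Unsupervised clustering accuracy: build the confusion matrix between
    # predicted cluster ids and true labels, find the best one-to-one mapping
    # with the Hungarian algorithm, then score the matched assignments.
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() / y_pred.size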
Example #24
0
index_array = np.arange(x.shape[0])

tol = 0.001  # tolerance threshold to stop training

for ite in range(int(maxiter)):
    if ite % update_interval == 0:
        q = model.predict(x, verbose=0)
        p = target_distribution(
            q)  # update the auxiliary target distribution p

        # evaluate the clustering performance
        y_pred = q.argmax(1)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            loss = np.round(loss, 5)
            print(
                'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' %
                (ite, acc, nmi, ari), ' ; loss=', loss)

        # check stop criterion - model convergence
        delta_label = np.sum(y_pred != y_pred_last).astype(
            np.float32) / y_pred.shape[0]
        y_pred_last = np.copy(y_pred)
        if ite > 0 and delta_label < tol:
            print('delta_label ', delta_label, '< tol ', tol)
            print('Reached tolerance threshold. Stopping training.')
            break
    idx = index_array[index * batch_size:min((index + 1) *
                                             batch_size, x.shape[0])]
Example #25
0
File: DCEC.py Project: ryansar/DCEC
    elif args.dataset == 'usps':
        x, y = load_usps('data/usps')
    elif args.dataset == 'mnist-test':
        x, y = load_mnist()
        x, y = x[60000:], y[60000:]

    # prepare the DCEC model
    dcec = DCEC(input_shape=x.shape[1:],
                filters=[32, 64, 128, 10],
                n_clusters=args.n_clusters)
    plot_model(dcec.model,
               to_file=args.save_dir + '/dcec_model.png',
               show_shapes=True)
    dcec.model.summary()

    # begin clustering.
    optimizer = 'adam'
    dcec.compile(loss=['kld', 'mse'],
                 loss_weights=[args.gamma, 1],
                 optimizer=optimizer)
    dcec.fit(x,
             y=y,
             tol=args.tol,
             maxiter=args.maxiter,
             update_interval=args.update_interval,
             save_dir=args.save_dir,
             cae_weights=args.cae_weights)
    y_pred = dcec.y_pred
    print('acc = %.4f, nmi = %.4f, ari = %.4f' % (metrics.acc(
        y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
Example #26
0
    if not isinstance(label, (int, float)):
        label = LabelEncoder().fit_transform(label)
    n_clusters = len(np.unique(label))
    for out in out_list:
        df = pd.read_csv(
            "F:/project/autoencoder-rmd/{}/h_{}/h_{}/h_{}.{}".format(
                run_type, name, name, name, out))
        X = df.values
        for method in cluster_method:
            if method == 'tsne':
                X = PCA(n_components=50).fit_transform(X)
                X_embedded = TSNE(n_components=2).fit_transform(X)
                y_pred = KMeans(n_clusters=n_clusters,
                                n_init=40).fit_predict(X_embedded)
            else:
                X_embedded = PCA(n_components=n_clusters).fit_transform(X)
                y_pred = KMeans(n_clusters=n_clusters,
                                n_init=40).fit_predict(X_embedded)
            acc = np.round(metrics.acc(label, y_pred), 5)
            nmi = np.round(metrics.nmi(label, y_pred), 5)
            ari = np.round(metrics.ari(label, y_pred), 5)
            logdict = dict(name=name,
                           out=out,
                           method=method,
                           acc=acc,
                           nmi=nmi,
                           ari=ari)
            logwriter.writerow(logdict)

logfile.close()
Example #27
0
    tol = 0.001  # tolerance threshold to stop training

    for ite in range(int(maxiter)):
        print(".", end="")  # print . without newline
        if ite % update_interval == 0:
            q = new_model.predict(x_train, verbose=1)
            p = target_distribution(
                q)  # update the auxiliary target distribution p

            # evaluate the clustering performance
            y_pred = q.argmax(1)

            if y_train is not None:
                acc = np.round(metrics.acc(y_train, y_pred), 5)
                nmi = np.round(metrics.nmi(y_train, y_pred), 5)
                ari = np.round(metrics.ari(y_train, y_pred), 5)
                loss = np.round(loss, 5)
                print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' \
                    % (ite, acc, nmi, ari), ' ; loss=', loss)

            # check stop criterion - model convergence
            delta_label = np.sum(y_pred != y_pred_last).astype(
                np.float32) / y_pred.shape[0]
            y_pred_last = np.copy(y_pred)
            if ite > 0:
                print('delta_label=', delta_label, ' and tol=', tol)

            if ite > 0 and delta_label < tol:
                print('delta_label ', delta_label, '< tol ', tol)
                print('Reached tolerance threshold. Stopping training.')
                break
Example #28
0
    def fit(self,
            x,
            y=None,
            batch_size=32,
            maxiter=2e4,
            tol=1e-3,
            update_interval=140,
            cae_weights=None):

        save_interval = x.shape[0] / batch_size * 5

        # Step 1: pretrain if necessary
        if not self.pretrained and cae_weights is None:
            self.pretrain(x, batch_size)
            self.pretrained = True

        # Step 2: initialize cluster centers using k-means
        print('Initializing cluster centers with k-means or gmm.')

        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # You may also want to initialize the clustering layer weights with a Gaussian Mixture Model

        #gmm = mixture.GaussianMixture(n_components=self.n_clusters, covariance_type="full")
        #gmm = mixture.GaussianMixture(n_components=self.n_clusters, n_init=20)
        #self.y_pred = gmm.fit_predict(self.encoder.predict(x))
        #y_pred_last = np.copy(self.y_pred)
        #self.model.get_layer(name='clustering').set_weights([gmm.means_])

        # Step 3: deep clustering
        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, self.y_pred), 5)
                    nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                    ari = np.round(metrics.ari(y, self.y_pred), 5)
                    loss = np.round(loss, 5)

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    break

            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size::],
                    y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size:(index + 1) * batch_size],
                    y=[
                        p[index * batch_size:(index + 1) * batch_size],
                        x[index * batch_size:(index + 1) * batch_size]
                    ])
                index += 1
            ite += 1
Example #29
0
    def fit(self,
            x,
            y=None,
            maxiter=2e4,
            batch_size=256,
            tol=1e-3,
            update_interval=140,
            save_dir='./results/temp'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 2: deep clustering
        # logging file
        import csv
        logfile = open(save_dir + '/dec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   loss=loss)
                    logwriter.writerow(logdict)
                    print(
                        'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' %
                        (ite, acc, nmi, ari), ' ; loss=', loss)

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            # if index == 0:
            #     np.random.shuffle(index_array)
            idx = index_array[index * batch_size:min((index + 1) *
                                                     batch_size, x.shape[0])]
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

            # save intermediate model
            if ite % save_interval == 0:
                print('saving model to:',
                      save_dir + '/DEC_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/DEC_model_' + str(ite) +
                                        '.h5')

            ite += 1

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/DEC_model_final.h5')
        self.model.save_weights(save_dir + '/DEC_model_final.h5')

        return y_pred
Example #30
0
    print(args)

    import os
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # load dataset
    from datasets import load_mnist, load_usps
    if args.dataset == 'mnist':
        x, y = load_mnist()
    elif args.dataset == 'usps':
        x, y = load_usps('data/usps')
    elif args.dataset == 'mnist-test':
        x, y = load_mnist()
        x, y = x[60000:], y[60000:]

    # prepare the DCEC model
    dcec = DCEC(input_shape=x.shape[1:], filters=[32, 64, 128, 10], n_clusters=args.n_clusters)
    plot_model(dcec.model, to_file=args.save_dir + '/dcec_model.png', show_shapes=True)
    dcec.model.summary()

    # begin clustering.
    optimizer = 'adam'
    dcec.compile(loss=['kld', 'mse'], loss_weights=[args.gamma, 1], optimizer=optimizer)
    dcec.fit(x, y=y, tol=args.tol, maxiter=args.maxiter,
             update_interval=args.update_interval,
             save_dir=args.save_dir,
             cae_weights=args.cae_weights)
    y_pred = dcec.y_pred
    print('acc = %.4f, nmi = %.4f, ari = %.4f' % (metrics.acc(y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))