Example #1
    def compute_acc_and_nmi_conflicted_data(self, x, y, centers_emb, beta1, beta2):
        features = self.predict_encoder(x)
        unconf_indices, conf_indices = self.generate_unconflicted_data_index(x, centers_emb, beta1, beta2)
        
        if unconf_indices.size == 0:
            print(' '*8 + "Empty list of unconflicted data")
            acc_unconf = 0
            nmi_unconf = 0
        else:
            x_emb_unconf = self.predict_encoder(x[unconf_indices])
            y_unconf = y[unconf_indices]
            y_pred_unconf = q_mat(x_emb_unconf, centers_emb, alpha=1.0).argmax(axis=1)
            acc_unconf = metrics.acc(y_unconf, y_pred_unconf)
            nmi_unconf = metrics.nmi(y_unconf, y_pred_unconf)
            print(' '*8 + '|==>  acc unconflicted data: %.4f,  nmi unconflicted data: %.4f  <==|'% (acc_unconf, nmi_unconf))

        if conf_indices.size == 0:
            print(' '*8 + "Empty list of conflicted data")
            acc_conf = 0
            nmi_conf = 0
        else:
            x_emb_conf = self.predict_encoder(x[conf_indices])
            y_conf = y[conf_indices]
            y_pred_conf = q_mat(x_emb_conf, centers_emb, alpha=1.0).argmax(axis=1)
            acc_conf = metrics.acc(y_conf, y_pred_conf)
            nmi_conf = metrics.nmi(y_conf, y_pred_conf)
            print(' '*8 + '|==>  acc conflicted data: %.4f,  nmi conflicted data: %.4f  <==|' % (acc_conf, nmi_conf))
        return acc_unconf, nmi_unconf, acc_conf, nmi_conf
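Note: q_mat above returns the soft-assignment matrix between embedded points and cluster centers. A minimal NumPy sketch, assuming the DEC-style Student's t kernel (the repo's exact implementation may differ):

import numpy as np

def q_mat(X, centers, alpha=1.0):
    # squared Euclidean distance from each sample to each center
    dist = np.sum((X[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    q = 1.0 / (1.0 + dist / alpha)
    q = q ** ((alpha + 1.0) / 2.0)
    return q / q.sum(axis=1, keepdims=True)  # rows sum to 1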
Example #2
 def metric(self, y, y_pred):
     acc = np.round(metrics.acc(y, y_pred), 5)
     nmi = np.round(metrics.nmi(y, y_pred), 5)
     ari = np.round(metrics.ari(y, y_pred), 5)
     print('acc:', acc)
     print('nmi:', nmi)
     print('ari:', ari)
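Note: most of these examples import a small shared metrics module. A sketch assuming the usual definitions in DEC-style repos (sklearn scores plus Hungarian-matched clustering accuracy); treat it as an illustration, not this project's exact code:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_rand_score as ari

def acc(y_true, y_pred):
    # clustering accuracy under the best one-to-one cluster-to-label mapping
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    row, col = linear_sum_assignment(w.max() - w)
    return w[row, col].sum() / y_pred.size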
Example #3
def match(y, cl):
    cl = np.array(cl)
    y = np.array(y)
    acc = np.round(metrics.acc(y, cl), 5)
    nmi = np.round(metrics.nmi(y, cl), 5)
    ari = np.round(metrics.ari(y, cl), 5)
    return acc, nmi, ari
Example #4
    def epochBegin(self, epoch):
        if epoch % self.decay_n == 0 and epoch != 0:
            self.lr_decay()

        gamma = self.gamma_output.predict(self.inputs, batch_size=batch_size)
        pred = np.argmax(gamma, axis=1)
        acc = self.cluster_acc(pred, self.Y)

        Y = np.reshape(self.Y, [self.Y.shape[0]])
        nmi = metrics.nmi(Y, pred)
        ari = metrics.ari(Y, pred)
        purity = self.purity_score(Y, pred)
        global accuracy
        accuracy = [acc[0]]  # reset each epoch; only the latest accuracy is kept
        if epoch > 0:
            print('ACC:%0.8f' % acc[0])
            print('NMI:', nmi)
            print('ARI:', ari)
            print('Purity', purity)
        if epoch == 1 and dataset == 'har' and acc[0] < 0.77:
            print(
                '=========== HAR dataset: bad init! Please run again! ============'
            )
            sys.exit(0)
Example #5
    def fit(self,
            x,
            y=None,
            maxiter=2e4,
            batch_size=256,
            tol=1e-3,
            update_interval=140,
            save_dir='./results/temp',
            rand_seed=None):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=100)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                p = self.target_distribution(q)

                y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    print('Iter %d: acc = %.5f, nmi = %.5f' % (ite, acc, nmi),
                          ' ; loss=', loss)

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    break

            idx = index_array[index * batch_size:min((index + 1) *
                                                     batch_size, x.shape[0])]
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        # save the trained model
        print('saving model to:', save_dir + '/STC_model_final.h5')
        self.model.save_weights(save_dir + '/STC_model_final.h5')

        return y_pred
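Note: self.target_distribution(q) computes the DEC auxiliary target. In the original DEC formulation it sharpens q and normalizes by cluster frequency; a sketch under that assumption:

import numpy as np

def target_distribution(q):
    # p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'), with f_j = sum_i q_ij
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T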
Example #6
def train(args):
    # get data and model
    (x, y), model = _get_data_and_model(args)

    # split train validation data
    if y is None:
        x_train, x_val = train_test_split(x, test_size=0.1)
        y_val = None
        y_train = None
    else:
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                          y,
                                                          stratify=y,
                                                          test_size=0.1)

    model.model.summary()

    # pretraining
    t0 = time()
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.pretrained_weights is not None and os.path.exists(
            args.pretrained_weights):  # load pretrained weights
        model.autoencoder.load_weights(args.pretrained_weights)
    else:  # train
        pretrain_optimizer = SGD(1.0, 0.9) if args.method in [
            'FcDEC', 'FcIDEC', 'FcDEC-DA', 'FcIDEC-DA'
        ] else 'adam'
        model.pretrain(x_train,
                       y_train,
                       x_val,
                       y_val,
                       optimizer=pretrain_optimizer,
                       epochs=args.pretrain_epochs,
                       batch_size=args.batch_size,
                       save_dir=args.save_dir,
                       verbose=args.verbose,
                       aug_pretrain=args.aug_pretrain)
    t1 = time()
    print("Time for pretraining: %ds" % (t1 - t0))

    # clustering
    y_pred = model.fit(x,
                       y,
                       maxiter=args.maxiter,
                       batch_size=args.batch_size,
                       update_interval=args.update_interval,
                       save_dir=args.save_dir,
                       aug_cluster=args.aug_cluster)
    if y is not None:
        print('Final: acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(
            y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
    t2 = time()
    print("Time for pretaining, clustering and total: (%ds, %ds, %ds)" %
          (t1 - t0, t2 - t1, t2 - t0))
    print('=' * 60)
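Note: train(args) expects an argparse-style namespace. A hypothetical invocation covering only the fields the snippet actually reads (the method/dataset values are assumptions):

import argparse

args = argparse.Namespace(
    method='FcDEC', dataset='mnist', save_dir='results/temp',
    pretrained_weights=None, pretrain_epochs=500, batch_size=256,
    verbose=1, aug_pretrain=False, maxiter=20000,
    update_interval=140, aug_cluster=False)
train(args)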
Example #7
 def on_epoch_end(self, epoch, logs=None):
     if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0:
         return
     feature_model = tf.keras.models.Model(self.model.input,
                                           self.model.get_layer('encoder_3').output)
     features = feature_model.predict(self.x)
     km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20, n_jobs=4)
     y_pred = km.fit_predict(features)
     # print()
     print(' ' * 8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
           % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
Example #8
def test(args):
    assert args.weights is not None
    (x, y), model = _get_data_and_model(args)
    model.model.summary()

    print('Begin testing:', '-' * 60)
    model.load_weights(args.weights)
    y_pred = model.predict_labels(x)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' % (metrics.acc(
        y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
    print('End testing:', '-' * 60)
Example #9
def kmeans_():

    # use features for clustering
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=N, init='k-means++')
    #features = np.reshape(x_train, newshape=(features.shape[0], -1))
    km_trans = km.fit_transform(x_train)
    pred = km.predict(x_train)
    print(pred.shape)
    print('acc=', met.acc(y_train, pred), 'nmi=', met.nmi(y_train,
                                                          pred), 'ari=',
          met.ari(y_train, pred))
    return km_trans, pred
Example #10
    def fit(self, x, y=None, save_dir='./results/temp'):
        # print('Begin training:', '-' * 60)

        t1 = time()
        print(
            '******************** Use DenPeak to Cluster ************************'
        )

        features = self.encoder.predict(x)
        print("features shape:", features.shape)
        features = TSNE(n_components=2).fit_transform(features)
        # np.savetxt("features.txt", features)
        print("features shape:", features.shape)
        y_pred, y_border, center_num, dc_percent, dc = DenPeakCluster(features)
        print('saving picture to:', save_dir + '/2D.png')
        plt.cla()
        plt.scatter(features[:, 0], features[:, 1], c=y_pred, s=0.5, alpha=0.5)
        plt.savefig(save_dir + '/2D.png')
        np.savetxt(save_dir + '/dc_coeff.txt', [dc_percent, dc])

        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile,
            fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss', 'center_num'])
        logwriter.writeheader()

        acc = np.round(metrics.acc(y, y_pred), 5)
        nmi = np.round(metrics.nmi(y, y_pred), 5)
        ari = np.round(metrics.ari(y, y_pred), 5)
        # if acc>=0.95:
        np.savetxt(save_dir + '/features.txt', features)
        np.savetxt(save_dir + '/labels.txt', y_pred)
        np.savetxt(save_dir + '/border.txt', y_border)
        from Draw_border import draw
        draw(save_dir)
        logdict = dict(iter=0,
                       acc=acc,
                       nmi=nmi,
                       ari=ari,
                       center_num=center_num)
        logwriter.writerow(logdict)
        logfile.flush()
        print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; center_num=%d' %
              (0, acc, nmi, ari, center_num))
        logfile.close()

        return y_pred
Example #11
            def on_epoch_end(self, epoch, logs=None):
                if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0:
                    return
                feature_model = Model(
                    self.model.input,
                    self.model.get_layer(
                        'encoder_%d' %
                        (int(len(self.model.layers) / 2) - 1)).output)
                features = feature_model.predict(self.x)
                km = KMeans(n_clusters=nclusters, n_init=20)
                y_pred = km.fit_predict(features)

                if self.y is not None:
                    acc, nmi = metrics.acc(self.y, y_pred), metrics.nmi(
                        self.y, y_pred)
                    print(' ' * 8 + '|==>  acc: %.4f,  nmi: %.4f  <==|' %
                          (acc, nmi))
                else:
                    if self.lastpred is not None:
                        nmi = metrics.nmi(self.lastpred, y_pred)
                        print(' ' * 8 + '|==> nmi: %.4f  <==|' % (nmi))

                self.lastpred = y_pred
Example #12
def train_feature(net1, train_data):
    map_dict = read_pkl()
    if torch.cuda.is_available():
        net1 = torch.nn.DataParallel(net1, device_ids=[0])
        net1 = net1.cuda()
    prev_time = datetime.now()

    for i_dir in range(classnum):
        if not os.path.isdir('./data/' + str(i_dir)):
            os.makedirs('./data/' + str(i_dir))
    label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10)
    # label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    #                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 20).reshape(20, 20)

    label2 = []
    idx2 = []
    for im, label in tqdm(train_data, desc="Processing train data: "):
        im = im.cuda()
        feat = net1(im)
        for i in range(feat.size(0)):
            distance_list = list()
            for ui_50D_label in map_dict.values():
                distance = sum(sum((ui_50D_label.float().cuda() - feat[i])**2))
                distance_list.append(distance.item())
            idx = distance_list.index(min(distance_list))
            save_image(
                inver_transform2(im[i]), './data/' + str(idx) + '/' +
                str(random.randint(1, 10000000)) + '.png')
            label_np[idx][label[i].item()] += 1
            label2.append(idx)
        label1 = label.numpy()
        # for _,i in enumerate(label):
        #     idx2.append(i)
        for i in label1:
            idx2.append(i)

    t2 = np.array(idx2)
    t1 = np.array(label2)
    # print(t2.shape)
    # t2 = t2.reshape([t1.size,-1]).squeeze(0)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' %
          (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2)))

    corr_num = 0
    for item in label_np:
        corr_num += item.max()
    corr = corr_num / label_np.sum()
    print(corr)
    np.save('./model/MNIST/feature/' + str(feat.size(1)) + '_' + '.npy',
            label_np)
Example #13
    def gmm_kmeans_cluster(self, dataloader):
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        self.eval()
        data = []
        Y = []
        for batch_idx, (inputs, y) in enumerate(dataloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            _, _, _, mu, _ = self.forward(inputs)
            data.append(mu.data.cpu().numpy())
            Y.append(y.numpy())
        data = np.concatenate(data)
        Y = np.concatenate(Y)
        gmm = GaussianMixture(n_components=self.n_centroids,
                              covariance_type='full')
        gmm.fit(data)
        y_pred_gmm = gmm.predict(data)
        acc = np.round(metrics.acc(Y, y_pred_gmm), 5)
        nmi = np.round(metrics.nmi(Y, y_pred_gmm), 5)
        ari = np.round(metrics.ari(Y, y_pred_gmm), 5)
        print(
            'GMM fit of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f'
            % (acc, nmi, ari))

        km = KMeans(n_clusters=self.n_centroids, n_init=20)
        y_pred_kmeans = km.fit_predict(data)
        acc = np.round(metrics.acc(Y, y_pred_kmeans), 5)
        nmi = np.round(metrics.nmi(Y, y_pred_kmeans), 5)
        ari = np.round(metrics.ari(Y, y_pred_kmeans), 5)
        print(
            'Kmeans clustering of AutoEncoder embedding: acc = %.5f, nmi = %.5f, ari = %.5f'
            % (acc, nmi, ari))
Example #14
 def on_epoch_end(self, epoch, logs=None):
     if epochs < 10 or epoch % int(epochs / 10) != 0:
         return
     feature_model = Model(
         self.model.input,
         self.model.get_layer(
             index=int(len(self.model.layers) / 2)).output)
     features = feature_model.predict(self.x)
     km = KMeans(n_clusters=len(np.unique(self.y)),
                 n_init=20,
                 n_jobs=4)
     y_pred = km.fit_predict(features)
     print(' ' * 8 + '|==>  acc: %.4f,  nmi: %.4f  <==|' %
           (metrics.acc(self.y, y_pred),
            metrics.nmi(self.y, y_pred)))
Example #15
def main():
    x = np.load('./data/chan/all/VGG/featuresx.npy')

    init = 'glorot_uniform'

    # prepare the DEC model
    silhouette_avgs = []
    nmis = []
    rel_loss = []
    prev = None
    for n_clusters in [5, 10, 15, 20]:
        weights = './results/models/chan/all/VGG/DEC_model_final_%s.h5' % n_clusters
        dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10],
                  n_clusters=n_clusters,
                  init=init)
        dec.model.load_weights(weights)

        q = dec.model.predict(x, verbose=0)
        y_pred = q.argmax(1)
        if prev is not None:
            nmi_ = np.round(metrics.nmi(prev, y_pred), 5)
            nmis.append(nmi_)
            print(
                '\n |==> NMI against previous assignment: {0:.3f} <==|'.format(
                    nmi_))
        prev = y_pred

        silhouette_avg = silhouette_score(x, y_pred)
        silhouette_avgs.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        '''tr_loss = dec.model.evaluate(x)
        ts_loss = dec.model.evaluate(x_test)
        rel_loss.append(tr_loss/ts_loss)
        print('\n |==> relative loss: {0:.4f} <==|'.format(tr_loss/ts_loss))'''

    plt.plot(range(len(nmis)), nmis)
    plt.show()

    plt.plot(range(len(silhouette_avgs)), silhouette_avgs)
    plt.show()
Example #16
def test(net1, test_data):
    #
    if torch.cuda.is_available():
        net1 = torch.nn.DataParallel(net1, device_ids=[0])
        net1 = net1.cuda()
    #
    label2 = []
    idx2 = []
    for im, label in tqdm(test_data, desc="Processing test data: "):
        im = im.cuda()
        _, feat = net1(im)
        for i in range(feat.size(0)):
            distance = feat[i].cpu().numpy().tolist()
            idx = distance.index(max(distance))
            label2.append(idx)
        label1 = label.numpy()
        for i in label1:
            idx2.append(i)
    t2 = np.array(idx2)
    t1 = np.array(label2)

    return metrics.acc(t2, t1), metrics.nmi(t2, t1)
Example #17
def train_feature(net1, train_data):
    #
    if torch.cuda.is_available():
        net1 = torch.nn.DataParallel(net1, device_ids=[0])
        net1 = net1.cuda()
    #
    for i_dir in range(classnum):
        if not os.path.isdir('./data/' + str(i_dir)):
            os.makedirs('./data/' + str(i_dir))
    label_np = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10).reshape(10, 10)

    #
    label2 = []
    idx2 = []
    for im, label in tqdm(train_data, desc="Processing train data: "):
        # print(label)
        im = im.cuda()
        _, feat = net1(im)
        for i in range(feat.size(0)):
            distance = feat[i].cpu().numpy().tolist()
            idx = distance.index(max(distance))
            save_image(
                inver_transform2(im[i]), './data/' + str(idx) + '/' +
                str(random.randint(1, 10000000)) + '.png')
            # MATRIX
            label_np[idx][label[i].item()] += 1
            #
            label2.append(idx)
        label1 = label.numpy()
        for i in label1:
            idx2.append(i)

    t2 = np.array(idx2)
    t1 = np.array(label2)
    print('acc=%.4f, nmi=%.4f, ari=%.4f' %
          (metrics.acc(t1, t2), metrics.nmi(t1, t2), metrics.ari(t1, t2)))
    ##############################
    np.save(File + str(feat.size(1)) + '_' + '.npy', label_np)
Example #18
 def on_epoch_end(self, epoch, logs=None):  # called at the end of every epoch
     # only log at every 10% of the total epochs
     if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0:
         return
     feature_model = Model(
         self.model.input,
         self.model.get_layer(
             'encoder_%d' %
             (int(len(self.model.layers) / 2) - 1)).output)
     # pretraining trains the autoencoder; running k-means on the encoder
     # output lets us inspect how well the embedded space separates classes
     features = feature_model.predict(self.x)
     km = KMeans(n_clusters=len(np.unique(self.y)),
                 n_init=20,
                 n_jobs=4)
     y_pred = km.fit_predict(features)
     # print()
     print(' ' * 8 + '|==>  acc: %.4f,  nmi: %.4f  <==|' %
           (metrics.acc(self.y, y_pred),
            metrics.nmi(self.y, y_pred)))
Example #19
def run_net(data, params):
    #
    # UNPACK DATA
    #

    x_train_unlabeled, y_train_unlabeled, x_val, y_val, x_test, y_test = data[
        'spectral']['train_and_test']

    print(params['input_shape'])
    inputs_vae = Input(shape=params['input_shape'], name='inputs_vae')
    ConvAE = Conv.ConvAE(inputs_vae, params)
    try:
        ConvAE.vae.load_weights('vae_mnist.h5')
    except OSError:
        print('No pretrained weights available...')

    lh = LearningHandler(lr=params['spec_lr'],
                         drop=params['spec_drop'],
                         lr_tensor=ConvAE.learning_rate,
                         patience=params['spec_patience'])

    lh.on_train_begin()

    n_epochs = 5000
    losses_vae = np.empty((n_epochs, ))
    homo_plot = np.empty((n_epochs, ))
    nmi_plot = np.empty((n_epochs, ))
    ari_plot = np.empty((n_epochs, ))

    y_val = np.squeeze(np.asarray(y_val).ravel())  # squeeze into 1D array

    start_time = time.time()
    for i in range(n_epochs):
        # if i==0:
        x_recon, _, x_val_y = ConvAE.vae.predict(x_val)
        losses_vae[i] = ConvAE.train_vae(x_val, x_val_y, params['batch_size'])
        #x_val_y = ConvAE.vae.predict(x_val)[2]
        #y_sp = x_val_y.argmax(axis=1)
        #print_accuracy(y_sp, y_val, params['n_clusters'])
        print("Epoch: {}, loss={:2f}".format(i, losses_vae[i]))

        os.makedirs('vae', exist_ok=True)
        os.makedirs('vae_umap', exist_ok=True)

        fig, axs = plt.subplots(3, 4, figsize=(25, 18))
        fig.subplots_adjust(wspace=0.25)

        embedding = ConvAE.encoder.predict(x_val)
        kmeans = KMeans(n_clusters=params['n_clusters'], n_init=30)
        predicted_labels = kmeans.fit_predict(
            embedding)  # cluster on current embeddings for metric eval
        _, confusion_matrix = get_y_preds(predicted_labels, y_val,
                                          params['n_clusters'])

        homo_plot[i] = metrics.acc(y_val, predicted_labels)  # note: stores clustering accuracy, plotted under the 'Homogeneity' title below
        nmi_plot[i] = metrics.nmi(y_val, predicted_labels)
        ari_plot[i] = metrics.ari(y_val, predicted_labels)

        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        Z_tsne = tsne.fit_transform(embedding)
        sc = axs[1][0].scatter(Z_tsne[:, 0],
                               Z_tsne[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[1][0].set_title('t-SNE Embeddings')
        axs[1][0].set_xlabel('t-SNE 1')
        axs[1][0].set_ylabel('t-SNE 2')
        axs[1][0].set_xticks([])
        axs[1][0].set_yticks([])
        axs[1][0].spines['right'].set_visible(False)
        axs[1][0].spines['top'].set_visible(False)
        divider = make_axes_locatable(axs[1][0])
        cax = divider.append_axes('right', size='15%', pad=0.05)
        cbar = fig.colorbar(sc,
                            cax=cax,
                            orientation='vertical',
                            ticks=range(params['n_clusters']))
        cbar.ax.set_yticklabels(
            params['cluster_names'])  # vertically oriented colorbar
        # Create offset transform by 5 points in x direction
        dx = 0 / 72.
        dy = -5 / 72.
        offset = matplotlib.transforms.ScaledTranslation(
            dx, dy, fig.dpi_scale_trans)

        # apply offset transform to all cluster ticklabels.
        for label in cbar.ax.yaxis.get_majorticklabels():
            label.set_transform(label.get_transform() + offset)

        reducer = umap.UMAP(transform_seed=36, random_state=36)
        matrix_reduce = reducer.fit_transform(embedding)
        sc = axs[1][1].scatter(matrix_reduce[:, 0],
                               matrix_reduce[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[1][1].set_title('UMAP Embeddings')
        axs[1][1].set_xlabel('UMAP 1')
        axs[1][1].set_ylabel('UMAP 2')
        axs[1][1].set_xticks([])
        axs[1][1].set_yticks([])
        # Hide the right and top spines
        axs[1][1].spines['right'].set_visible(False)
        axs[1][1].spines['top'].set_visible(False)

        im = axs[1][2].imshow(confusion_matrix, cmap='YlOrRd')
        axs[1][2].set_title('Confusion Matrix')
        axs[1][2].set_xticks(range(params['n_clusters']))
        axs[1][2].set_yticks(range(params['n_clusters']))
        axs[1][2].set_xticklabels(params['cluster_names'], fontsize=8)
        axs[1][2].set_yticklabels(params['cluster_names'], fontsize=8)
        divider = make_axes_locatable(axs[1][2])
        cax = divider.append_axes('right', size='10%', pad=0.05)
        cbar = fig.colorbar(im, cax=cax, orientation='vertical', ticks=[])

        axs[0][0].plot(losses_vae[:i + 1])
        axs[0][0].set_title('VAE Loss')
        axs[0][0].set_xlabel('epochs')

        axs[0][1].plot(homo_plot[:i + 1])
        axs[0][1].set_title('Homogeneity')
        axs[0][1].set_xlabel('epochs')
        axs[0][1].set_ylim(0, 1)

        axs[0][2].plot(ari_plot[:i + 1])
        axs[0][2].set_title('ARI')
        axs[0][2].set_xlabel('epochs')
        axs[0][2].set_ylim(0, 1)

        axs[0][3].plot(nmi_plot[:i + 1])
        axs[0][3].set_title('NMI')
        axs[0][3].set_xlabel('epochs')
        axs[0][3].set_ylim(0, 1)

        #reconstructed_cell = ConvAE.vae.predict(x_val[:1, ...])[0, ..., 0]
        cell_tile = x_val[0, ..., 0]
        cell_tile = cell_tile[:, :64]
        x_recon = x_recon[0, ..., 0]
        reconstructed_cell_tile = x_recon[:, :64]
        reconstructed_cell_tile = np.flipud(reconstructed_cell_tile)
        cell_heatmap = np.vstack((cell_tile, reconstructed_cell_tile))
        axs[1][3].imshow(cell_heatmap, cmap='Reds')
        axs[1][3].set_xticks([])
        axs[1][3].set_yticks([])
        axs[1][3].spines['right'].set_visible(False)
        axs[1][3].spines['top'].set_visible(False)
        axs[1][3].spines['left'].set_visible(False)
        axs[1][3].spines['bottom'].set_visible(False)

        # get eigenvalues and eigenvectors
        scale = get_scale(embedding, params['batch_size'], params['scale_nbr'])
        values, vectors = spectral_clustering(embedding, scale,
                                              params['n_nbrs'],
                                              params['affinity'])

        # sort, then store the top n_clusters=2
        values_idx = np.argsort(values)
        x_spectral_clustering = vectors[:, values_idx[:params['n_clusters']]]

        # do kmeans clustering in this subspace
        y_spectral_clustering = KMeans(
            n_clusters=params['n_clusters']).fit_predict(
                vectors[:, values_idx[:params['n_clusters']]])

        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        Z_tsne = tsne.fit_transform(x_spectral_clustering)
        sc = axs[2][0].scatter(Z_tsne[:, 0],
                               Z_tsne[:, 1],
                               s=2,
                               c=y_train_unlabeled,
                               cmap=plt.cm.get_cmap("jet", 14))
        axs[2][0].set_title('Spectral Clusters (t-SNE) True Labels')
        axs[2][0].set_xlabel('t-SNE 1')
        axs[2][0].set_ylabel('t-SNE 2')
        axs[2][0].set_xticks([])
        axs[2][0].set_yticks([])
        axs[2][0].spines['right'].set_visible(False)
        axs[2][0].spines['top'].set_visible(False)

        reducer = umap.UMAP(transform_seed=36, random_state=36)
        matrix_reduce = reducer.fit_transform(x_spectral_clustering)
        axs[2][1].scatter(matrix_reduce[:, 0],
                          matrix_reduce[:, 1],
                          s=2,
                          c=y_spectral_clustering,
                          cmap=plt.cm.get_cmap("jet", 14))
        axs[2][1].set_title('Spectral Clusters (UMAP)')
        axs[2][1].set_xlabel('UMAP 1')
        axs[2][1].set_ylabel('UMAP 2')
        axs[2][1].set_xticks([])
        axs[2][1].set_yticks([])
        # Hide the right and top spines
        axs[2][1].spines['right'].set_visible(False)
        axs[2][1].spines['top'].set_visible(False)

        axs[2][2].scatter(matrix_reduce[:, 0],
                          matrix_reduce[:, 1],
                          s=2,
                          c=y_train_unlabeled,
                          cmap=plt.cm.get_cmap("jet", 14))
        axs[2][2].set_title('True Labels (UMAP)')
        axs[2][2].set_xlabel('UMAP 1')
        axs[2][2].set_ylabel('UMAP 2')
        axs[2][2].set_xticks([])
        axs[2][2].set_yticks([])
        # Hide the right and top spines
        axs[2][2].spines['right'].set_visible(False)
        axs[2][2].spines['top'].set_visible(False)

        axs[2][3].hist(x_spectral_clustering)
        axs[2][3].set_title("histogram of true eigenvectors")

        train_time = str(
            datetime.timedelta(seconds=(int(time.time() - start_time))))
        n_matrices = (i + 1) * params['batch_size'] * 100
        fig.suptitle('Trained on ' + '{:,}'.format(n_matrices) + ' cells\n' +
                     train_time)

        plt.savefig('vae/%d.png' % i)
        plt.close()

        if i > 1:
            if np.abs(losses_vae[i] - losses_vae[i - 1]) < 0.0001:
                print('STOPPING EARLY')
                break

    print("finished training")

    plt.plot(losses_vae)
    plt.title('VAE Loss')
    plt.show()

    x_val_y = ConvAE.vae.predict(x_val)[2]
    # x_val_y = ConvAE.classfier.predict(x_val_lp)
    y_sp = x_val_y.argmax(axis=1)
    print_accuracy(y_sp, y_val, params['n_clusters'])
    from sklearn.metrics import normalized_mutual_info_score as nmi
    y_val = np.squeeze(np.asarray(y_val).ravel())  # squeeze into 1D array
    print(y_sp.shape, y_val.shape)
    nmi_score1 = nmi(y_sp, y_val)
    print('NMI: ' + str(np.round(nmi_score1, 4)))

    embedding = ConvAE.encoder.predict(x_val)
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    Z_tsne = tsne.fit_transform(embedding)
    fig = plt.figure()
    plt.scatter(Z_tsne[:, 0],
                Z_tsne[:, 1],
                s=2,
                c=y_train_unlabeled,
                cmap=plt.cm.get_cmap("jet", 14))
    plt.colorbar(ticks=range(params['n_clusters']))
    plt.show()
Example #20
File: DCEC.py (project: ryansar/DCEC)
    def fit(self,
            x,
            y=None,
            batch_size=256,
            maxiter=2e4,
            tol=1e-3,
            update_interval=140,
            cae_weights=None,
            save_dir='./results/temp'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # Step 1: pretrain if necessary
        t0 = time()
        if not self.pretrained and cae_weights is None:
            print('...pretraining CAE using default hyper-parameters:')
            print('   optimizer=\'adam\';   epochs=200')
            self.pretrain(x, batch_size, save_dir=save_dir)
            self.pretrained = True
        elif cae_weights is not None:
            self.cae.load_weights(cae_weights)
            print('cae_weights is loaded successfully.')

        # Step 2: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 3: deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/dcec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        t2 = time()
        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, self.y_pred), 5)
                    nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                    ari = np.round(metrics.ari(y, self.y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   L=loss[0],
                                   Lc=loss[1],
                                   Lr=loss[2])
                    logwriter.writerow(logdict)
                    print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari',
                          ari, '; loss=', loss)

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size::],
                    y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size:(index + 1) * batch_size],
                    y=[
                        p[index * batch_size:(index + 1) * batch_size],
                        x[index * batch_size:(index + 1) * batch_size]
                    ])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                # save DCEC model checkpoints
                print('saving model to:',
                      save_dir + '/dcec_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/dcec_model_' + str(ite) +
                                        '.h5')

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/dcec_model_final.h5')
        self.model.save_weights(save_dir + '/dcec_model_final.h5')
        t3 = time()
        print('Pretrain time:  ', t1 - t0)
        print('Clustering time:', t3 - t1)
        print('Total time:     ', t3 - t0)
Example #21
File: DCEC.py (project: ryansar/DCEC)
    elif args.dataset == 'usps':
        x, y = load_usps('data/usps')
    elif args.dataset == 'mnist-test':
        x, y = load_mnist()
        x, y = x[60000:], y[60000:]

    # prepare the DCEC model
    dcec = DCEC(input_shape=x.shape[1:],
                filters=[32, 64, 128, 10],
                n_clusters=args.n_clusters)
    plot_model(dcec.model,
               to_file=args.save_dir + '/dcec_model.png',
               show_shapes=True)
    dcec.model.summary()

    # begin clustering.
    optimizer = 'adam'
    dcec.compile(loss=['kld', 'mse'],
                 loss_weights=[args.gamma, 1],
                 optimizer=optimizer)
    dcec.fit(x,
             y=y,
             tol=args.tol,
             maxiter=args.maxiter,
             update_interval=args.update_interval,
             save_dir=args.save_dir,
             cae_weights=args.cae_weights)
    y_pred = dcec.y_pred
    print('acc = %.4f, nmi = %.4f, ari = %.4f' % (metrics.acc(
        y, y_pred), metrics.nmi(y, y_pred), metrics.ari(y, y_pred)))
Example #22
    "p123", "p125", "p128", "p129", "p133", "p135"
]
all_validation_image_names = ["p137", "p141", "p143", "p144", "p147"]
param_file_names = ["translation", "affine", "parameters_test"]
param_array = get_param_array(param_file_names)

nr_atlas_images = len(all_training_image_names)
for valid_img_name in all_validation_image_names:
    begin_time = time.time()
    print(f"{valid_img_name} |", end="\t", flush=True)
    valid_img_path = f"{VALIDATION_DATA_PATH}/{valid_img_name}/mr_bffe.mhd"
    valid_img = GetArrayFromImage(ReadImage(valid_img_path))
    weights = np.zeros(nr_atlas_images)
    predictions = np.zeros((nr_atlas_images, 86, 333, 271))
    for i, atlas_img_name in enumerate(all_training_image_names):
        print(f"{atlas_img_name}", end="\t", flush=True)
        atlas_mr_img_path = f"{TRAINING_DATA_PATH}/{atlas_img_name}/mr_bffe.mhd"
        atlas_pros_img_path = f"{TRAINING_DATA_PATH}/{atlas_img_name}/prostaat.mhd"
        transform = get_transform(valid_img_path, atlas_mr_img_path)
        transformed_atlas_mr_img = get_transformed_image(
            atlas_mr_img_path, transform)
        predictions[i] = get_transformed_image(atlas_pros_img_path, transform)
        weights[i] = metrics.nmi(valid_img, transformed_atlas_mr_img)
    weights = (weights - np.min(weights))**2
    prediction = np.zeros((86, 333, 271))
    for i in range(nr_atlas_images):
        prediction += predictions[i] * weights[i]
    prediction = (prediction > 0.45 * np.sum(weights)).astype(np.uint8)
    write_mhd(valid_img_name, prediction)
    print(time.time() - begin_time)
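Note: here metrics.nmi scores similarity between two intensity images rather than two label vectors. A sketch of a wrapper that would make that work with sklearn, assuming intensities are discretized first (the bin count is an arbitrary choice):

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def nmi(img_a, img_b, bins=64):
    # bin the voxel intensities so they can be treated as discrete labels
    a = np.digitize(img_a.ravel(), np.histogram_bin_edges(img_a, bins))
    b = np.digitize(img_b.ravel(), np.histogram_bin_edges(img_b, bins))
    return normalized_mutual_info_score(a, b)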
Example #23
    dec = STC(dims=[x.shape[-1], 500, 500, 2000, 20], n_clusters=n_clusters)

    # pretrain model
    ####################################################################################
    #if not os.path.exists(args.ae_weights):
    dec.pretrain(x=x,
                 y=None,
                 optimizer='adam',
                 epochs=args.pretrain_epochs,
                 batch_size=args.batch_size,
                 save_dir=args.save_dir)
    #else:
    #    dec.autoencoder.load_weights(args.ae_weights)

    dec.model.summary()
    t0 = time()
    dec.compile(SGD(0.1, 0.9), loss='kld')

    # clustering
    ####################################################################################
    y_pred = dec.fit(x,
                     y=y,
                     tol=args.tol,
                     maxiter=args.maxiter,
                     batch_size=args.batch_size,
                     update_interval=args.update_interval,
                     save_dir=args.save_dir,
                     rand_seed=0)
    print('acc:', metrics.acc(y, y_pred))
    print('nmi:', metrics.nmi(y, y_pred))
Example #24
    def fit(self,
            x,
            y=None,
            batch_size=256,
            epochs=100,
            ae_weights=None,
            save_dir='result/temp',
            tol=0.001,
            use_sp=True,
            da_s2=False):

        # prepare folder for saving results
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # pretraining
        t0 = time()
        if ae_weights is None and not self.pretrained:
            print('Pretraining AE...')
            self.pretrain(x, save_dir=save_dir)
            print('Pretraining time: %.1fs' % (time() - t0))
        elif ae_weights is not None:
            self.autoencoder.load_weights(ae_weights)
            print('Pretrained AE weights are loaded successfully!')

        # initialization
        t1 = time()
        self.y_pred, self.centers = self.basic_clustering(self.predict(x))
        t2 = time()
        print('Time for initialization: %.1fs' % (t2 - t1))

        # logging file
        logfile = open(save_dir + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['epoch', 'acc', 'nmi', 'Ln', 'Lc'])
        logwriter.writeheader()

        best_ACC = 0  # track the best accuracy so far so we can compare across epochs
        net_loss = 0
        clustering_loss = 0
        time_train = 0
        sample_weight = np.ones(shape=x.shape[0])
        sample_weight[self.y_pred == -1] = 0  # do not use the noisy examples
        y_pred_last = np.copy(self.y_pred)
        result = None
        for epoch in range(epochs + 1):
            """ Log and check stopping criterion """
            if y is not None:
                acc = np.round(metrics.acc(y, self.y_pred), 5)
                nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                print(
                    'Epoch-%d: ACC=%.4f, NMI=%.4f, Ln=%.4f, Lc=%.4f; time=%.1f'
                    % (epoch, acc, nmi, net_loss, clustering_loss, time_train))
                logwriter.writerow(
                    dict(epoch=epoch,
                         acc=acc,
                         nmi=nmi,
                         Ln=net_loss,
                         Lc=clustering_loss))
                logfile.flush()

                # record the initial result
                if epoch == 0:
                    print('ASPC model saved to \'%s/model_init.h5\'' %
                          save_dir)
                    self.model.save_weights(save_dir + '/model_init.h5')
                # compare against the best accuracy so far
                if acc > best_ACC:
                    self.model.save_weights(save_dir + '/model_best.h5')
                    best_ACC = acc
                # check stop criterion
                delta_y = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if (epoch > 0 and delta_y < tol) or epoch >= epochs:
                    result = np.asarray([acc, nmi])
                    print(
                        'Training stopped: epoch=%d, delta_label=%.4f, tol=%.4f'
                        % (epoch, delta_y, tol))
                    print('ASPC model saved to \'%s/model_final.h5\'' %
                          save_dir)
                    print('-' * 30 + ' END: time=%.1fs ' % (time() - t0) +
                          '-' * 30)
                    self.model.save_weights(save_dir + '/model_final.h5')
                    logfile.close()
                    break
            """ Step 1: train the network """
            t0_epoch = time()
            if da_s2:  # use data augmentation
                history = self.model.fit_generator(
                    generator(self.datagen, x, self.centers[self.y_pred],
                              sample_weight, batch_size),
                    steps_per_epoch=math.ceil(x.shape[0] / batch_size),
                    epochs=5
                    if np.any(self.y_pred == -1) and epoch == 0 else 1,
                    workers=4,
                    verbose=0)
            else:
                history = self.model.fit(x,
                                         y=self.centers[self.y_pred],
                                         batch_size=batch_size,
                                         epochs=1,
                                         sample_weight=sample_weight,
                                         verbose=0)
            net_loss = history.history['loss'][0]
            """ Step 2: update labels """
            self.y_pred, losses = self.update_labels(self.predict(x),
                                                     self.centers)
            clustering_loss = np.mean(losses)
            """ Step 3: Compute sample weights """
            sample_weight = self.compute_sample_weight(
                losses, epoch, epochs) if use_sp else None

            time_train = time() - t0_epoch

        return result
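Note: compute_sample_weight implements the self-paced step, but its exact rule is not shown here. A hypothetical sketch that admits a growing fraction of the lowest-loss samples as training progresses (the threshold schedule is an assumption, not the repo's code):

import numpy as np

def compute_sample_weight(losses, epoch, epochs):
    frac = min(1.0, float(epoch + 1) / epochs)   # fraction of samples to keep
    lam = np.percentile(losses, 100.0 * frac)    # per-epoch loss threshold
    return (losses <= lam).astype(np.float32)    # 1 = easy sample, 0 = excluded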
Example #25
def get_normalized_nmi_weight(input_fixed_img, input_moving_img):
    nmi = metrics.nmi(input_fixed_img, input_moving_img)
    return nmi - 1.00591886842159
Example #26
    def fit(self,
            x,
            y=None,
            maxiter=2e4,
            batch_size=256,
            tol=1e-3,
            update_interval=140,
            save_dir='./results/temp'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 2: deep clustering
        # logging file
        import csv
        logfile = open(save_dir + '/dec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   loss=loss)
                    logwriter.writerow(logdict)
                    print(
                        'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' %
                        (ite, acc, nmi, ari), ' ; loss=', loss)

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            # if index == 0:
            #     np.random.shuffle(index_array)
            idx = index_array[index * batch_size:min((index + 1) *
                                                     batch_size, x.shape[0])]
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

            # save intermediate model
            if ite % save_interval == 0:
                print('saving model to:',
                      save_dir + '/DEC_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/DEC_model_' + str(ite) +
                                        '.h5')

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/DEC_model_final.h5')
        self.model.save_weights(save_dir + '/DEC_model_final.h5')

        return y_pred
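Note: the layer fetched with get_layer(name='clustering') is a custom Keras layer holding the trainable cluster centers. A sketch of the standard DEC-Keras implementation, which this repo is assumed to follow:

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class ClusteringLayer(Layer):
    # soft-assigns each embedded sample to the trainable cluster centers
    def __init__(self, n_clusters, alpha=1.0, **kwargs):
        super().__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha

    def build(self, input_shape):
        self.clusters = self.add_weight(
            shape=(self.n_clusters, int(input_shape[1])),
            initializer='glorot_uniform', name='clusters')
        super().build(input_shape)

    def call(self, inputs):
        # Student's t kernel between samples and centers, rows normalized to 1
        q = 1.0 / (1.0 + K.sum(
            K.square(K.expand_dims(inputs, axis=1) - self.clusters),
            axis=2) / self.alpha)
        q **= (self.alpha + 1.0) / 2.0
        return K.transpose(K.transpose(q) / K.sum(q, axis=1))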
Example #27
    def fit(self,
            x_train,
            x_val,
            x_test,
            model_name,
            outdir,
            df_columns,
            y=None,
            epoch=500,
            batch_size=256,
            update_interval=5,
            early_stopping=20,
            tol=0.01):

        print('Update interval', update_interval)
        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        encoder_out = self.encoder.predict(x_train)
        y_pred = kmeans.fit_predict(encoder_out)
        # y_pred = kmeans.fit_predict(x_train)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans: acc = %.5f, nmi = %.5f, ari = %.5f' %
                  (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # for ite in range(int(epoch)):
        #     if ite % update_interval == 0:
        #         q,_,_ = self.model.predict(x_train, verbose=0)
        #         p = self.target_distribution(q)  # update the auxiliary target distribution p
        #     y0 = np.zeros_like(x_train)
        #     self.model.fit(x=x_train, y=[p, y0, x_train], batch_size=batch_size)

        # Step 2: deep clustering
        index = 0
        index_array_train = np.arange(x_train.shape[0])
        index_array_val = np.arange(x_val.shape[0])
        cost_val = []
        cost_train = []
        for ite in range(int(epoch)):
            if ite % update_interval == 0:
                q, _, _ = self.model.predict(x_train, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                print("delta label:{}".format(delta_label))
                y_pred_last = np.copy(y_pred)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    print('acc = %.5f, nmi = %.5f, ari = %.5f' %
                          (acc, nmi, ari))
                print(np.bincount(y_pred))
                if ite > update_interval and delta_label < tol:
                    # and np.mean(cost_val[-(early_stopping + 1):-1]) > \
                    # np.mean(cost_val[-(early_stopping*2 + 1):-(early_stopping + 1)])\
                    # and np.mean(cost_train[-(early_stopping + 1):-1]) < \
                    # np.mean(cost_train[-(early_stopping*2 + 1):-(early_stopping + 1)]):
                    print("Early stopping...")
                    break

            # train on batch
            tot_train_loss = 0.
            tot_sparse_loss = 0.
            tot_mse_loss = 0.
            tot_cluster_loss = 0.
            while True:
                if index == 0:
                    np.random.shuffle(index_array_train)
                idx = index_array_train[index * batch_size:min(
                    (index + 1) * batch_size, x_train.shape[0])]
                y0 = np.zeros_like(x_train[idx])
                # cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch(x=x_train[idx], y=[p[idx], y0, x_train[idx]])
                loss, cluster_loss, sparse_loss, mse_loss = self.model.train_on_batch(
                    x=x_train[idx], y=[p[idx], y0, x_train[idx]])
                index = index + 1 if (
                    index + 2) * batch_size <= x_train.shape[0] else 0
                tot_train_loss += loss * len(idx)
                tot_cluster_loss += cluster_loss * len(idx)
                tot_mse_loss += mse_loss * len(idx)
                tot_sparse_loss += sparse_loss * len(idx)
                if index == 0:
                    break
            avg_train_loss = tot_train_loss / x_train.shape[0]
            avg_cluster_loss = tot_cluster_loss / x_train.shape[0]
            avg_mse_loss = tot_mse_loss / x_train.shape[0]
            avg_sparse_loss = tot_sparse_loss / x_train.shape[0]
            print(
                "epoch {}th train, train_loss :{:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n"
                .format(ite + 1, avg_train_loss, avg_cluster_loss,
                        avg_mse_loss, avg_sparse_loss))
            cost_train.append(avg_train_loss)
            #
            # tot_val_loss = 0.
            # tot_sparse_loss = 0.
            # tot_mse_loss = 0.
            # tot_cluster_loss = 0.
            # while True:
            #     if index == 0:
            #         np.random.shuffle(index_array_val)
            #     idx = index_array_val[index * batch_size: min((index+1) * batch_size, x_val.shape[0])]
            #     y0 = np.zeros_like(x_val[idx])
            #     loss, cluster_loss, sparse_loss, mse_loss = self.model.test_on_batch(x=x_val[idx], y=[p[idx], y0, x_val[idx]])
            #     index = index + 1 if (index + 2) * batch_size <= x_val.shape[0] else 0
            #     tot_cluster_loss += cluster_loss *len(idx)
            #     tot_mse_loss += mse_loss *len(idx)
            #     tot_sparse_loss += sparse_loss *len(idx)
            #     tot_val_loss += loss * len(idx)
            #     if index==0:
            #         break
            # avg_val_loss = tot_val_loss / x_val.shape[0]
            # avg_cluster_loss = tot_cluster_loss / x_val.shape[0]
            # avg_mse_loss = tot_mse_loss / x_val.shape[0]
            # avg_sparse_loss = tot_sparse_loss / x_val.shape[0]
            # print("epoch {}th validate, loss: {:.6f}, cluster_loss: {:.6f}, mse_loss: {:.6f}, sparse_loss: {:.6f}\n".format(ite + 1,
            #                                                                                      avg_val_loss, avg_cluster_loss,
            #                                                                                      avg_mse_loss,
            #                                                                                      avg_sparse_loss))
            # cost_val.append(avg_val_loss)

        print('training time: ', time() - t1)
        # save the trained model

        print("saving predict data...")
        encoder_out = self.encoder.predict(x_test)
        q, decoder_out, _ = self.model.predict(x_test)
        y_pred = q.argmax(1)
        if y is not None:
            print("orginal cluster proportion: {}".format(np.bincount(y)))
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari))
            X_embedded = TSNE(n_components=2).fit_transform(encoder_out)
            plt.figure(figsize=(12, 10))
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
            plt.colorbar()
            plt.show()
        print(np.bincount(y_pred))

        y_pred = kmeans.fit_predict(encoder_out)
        if y is not None:
            acc = np.round(metrics.acc(y, y_pred), 5)
            nmi = np.round(metrics.nmi(y, y_pred), 5)
            ari = np.round(metrics.ari(y, y_pred), 5)
            print('kmeans : acc = %.5f, nmi = %.5f, ari = %.5f' %
                  (acc, nmi, ari))
        print(np.bincount(y_pred))

        decoder_sub = decoder_out + x_test
        df = pd.DataFrame(decoder_out, columns=df_columns)
        df_replace = pd.DataFrame(decoder_sub, columns=df_columns)

        outDir = os.path.join(outdir, model_name)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        outPath = os.path.join(outDir,
                               "{}.{}.complete".format(model_name, ite))

        df.to_csv(outPath, index=None, float_format='%.4f')
        df_replace.to_csv(outPath.replace(".complete", ".complete.sub"),
                          index=None,
                          float_format='%.4f')
        pd.DataFrame(encoder_out).to_csv(outPath.replace(
            ".complete", ".encoder.out"),
                                         float_format='%.4f')
        print("saving done!")
Example #28
    def fit(self,
            x,
            y=None,
            maxiter=2e4,
            batch_size=256,
            tol=1e-3,
            update_interval=140,
            save_dir='./results/temp',
            aug_cluster=False):
        print('Begin clustering:', '-' * 60)
        print('Update interval', update_interval)
        save_interval = int(maxiter)  # only save the initial and final model
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        features = self.encoder.predict(x)
        y_pred = kmeans.fit_predict(features)
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 2: deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.predict(x)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                avg_loss = loss / update_interval
                loss = 0.
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   loss=avg_loss)
                    logwriter.writerow(logdict)
                    logfile.flush()
                    print('Iter %d: acc=%.5f, nmi=%.5f, ari=%.5f; loss=%.5f' %
                          (ite, acc, nmi, ari, avg_loss))

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # save intermediate model
            if ite % save_interval == 0:
                print('saving model to:',
                      save_dir + '/model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/model_' + str(ite) +
                                        '.h5')

            # train on batch
            idx = index_array[index * batch_size:min((index + 1) *
                                                     batch_size, x.shape[0])]
            x_batch = self.random_transform(x[idx]) if aug_cluster else x[idx]
            loss += self.train_on_batch(x=x_batch, y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/model_final.h5')
        self.model.save_weights(save_dir + '/model_final.h5')
        print('Clustering time: %ds' % (time() - t1))
        print('End clustering:', '-' * 60)

        return y_pred
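Both this loop and `run_clustering` in the next example rely on a `target_distribution` helper that is not shown. In DEC, the soft assignment q_ij (a Student's-t kernel between embedding z_i and cluster center mu_j) is sharpened into an auxiliary target p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'), where f_j = sum_i q_ij is the soft cluster frequency. A minimal sketch of this standard formulation:

import numpy as np

def target_distribution(q):
    """Square the soft assignments, normalize by soft cluster frequency,
    then renormalize each row to a probability distribution."""
    weight = q ** 2 / q.sum(axis=0)           # q_ij^2 / f_j
    return (weight.T / weight.sum(axis=1)).T  # rows sum to 1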
Ejemplo n.º 29
0
import os

import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.cluster import KMeans

import metrics  # acc/nmi/ari helpers, as in the previous examples
from multiEmbedding import metaembedding_load_search_snippet2, metaembedding_load_stackoverflow, load_tweet89, load_20ngnews
# HybridModel and target_distribution are assumed to be defined elsewhere in this project

# from Tfidf import tf, tfidf

filename = 'data/20ngnews/20ngnews.txt'
x,y = load_20ngnews()

# x_tf = tf(filename)
# x_tfidf = tfidf(filename)


# print("x.shape: ",x.shape)
# print("x_tf.shape: ",x_tfidf.shape)
# print("x_tfidf.shape: ",x_tfidf.shape)
clusternum = len(set(y))
print("clusternum:",clusternum)
kmeans = KMeans(n_clusters= clusternum, n_init= 100)
y_pred = kmeans.fit_predict(x)
acc = np.round(metrics.acc(y, y_pred), 5)
# nmi值
nmi = np.round(metrics.nmi(y, y_pred), 5)

print('acc = %.5f, nmi = %.5f' % ( acc, nmi))







def run_clustering(doc_embeddings,
                   dims,
                   batch_size=16,
                   n_epochs=1,
                   update_interval=80,
                   tol=0.001,
                   y_real=None,
                   device="cpu"):

    inputs = torch.from_numpy(doc_embeddings).to(device)
    dataset = TensorDataset(inputs)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=False)

    model = HybridModel(dims)
    # checkpoint keys carry a two-character prefix; strip it so the encoder/decoder
    # sub-modules can load the matching weights (strict=False skips non-matching keys)
    enc_dec_model = {k[2:]: v for k, v in torch.load("enc_dec_model").items()}
    model.encoder.load_state_dict(enc_dec_model, strict=False)
    model.decoder.load_state_dict(enc_dec_model, strict=False)
    model = model.to(device)

    if os.path.exists("clustering_model"):
        model.load_state_dict(torch.load("clustering_model"))
        print("clustering model load from ckpt")

    model.train()

    optimizer = Adam(model.parameters(), lr=1e-3)

    criterion1 = nn.KLDivLoss(reduction="batchmean")
    criterion2 = nn.SmoothL1Loss()

    y_pred_last = np.zeros([doc_embeddings.shape[0]])

    is_end = False
    bst_model_acc = 0.0
    for epoch in range(n_epochs):
        if is_end:
            break
        batch_num = 1
        train_loss = 0.0
        for data in dataloader:

            if (batch_num - 1) % update_interval == 0:
                model.eval()
                with torch.no_grad():
                    _, q = model(inputs)
                    p = torch.Tensor(target_distribution(
                        q.cpu().numpy())).to(device)
                y_pred = q.cpu().numpy().argmax(1)

                if y_real is not None:
                    acc = np.round(metrics.acc(y_real, y_pred), 5)
                    nmi = np.round(metrics.nmi(y_real, y_pred), 5)
                    ari = np.round(metrics.ari(y_real, y_pred), 5)
                    print(
                        'Epoch %d, Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f'
                        % ((epoch + 1), batch_num, acc, nmi, ari))
                    if acc > bst_model_acc:
                        torch.save(model.state_dict(), "clustering_model")
                        bst_model_acc = acc

                # check stop criterion - model convergence
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                # print("delta_label: {}".format(delta_label))
                y_pred_last = np.copy(y_pred)
                model.train()

                if delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    is_end = True
                    break

            x_batch = data[0]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            y_hat_dec_batch, y_hat_clu_batch = model(x_batch)
            y_batch = p[((batch_num - 1) * batch_size):(batch_num * batch_size), :]
            loss1 = 1e-1 * criterion1(torch.log(y_hat_clu_batch),
                                      y_batch)  # weighted KL clustering loss
            loss2 = criterion2(y_hat_dec_batch, x_batch)
            loss = loss1 + loss2
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            if (batch_num - 1) % update_interval == 0:
                print("kld loss: {}, mse loss: {}".format(loss1.item(), loss2.item()))
                print("step loss: {}".format(train_loss / update_interval))
                train_loss = 0.0
            batch_num += 1

    torch.save(model.state_dict(), "clustering_model")

    model.eval()
    with torch.no_grad():
        _, q = model(inputs)
        q = q.cpu().numpy()
    return q.argmax(1)
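A hypothetical driver for `run_clustering`; the embedding file, the `dims` layer sizes, and the epoch budget are illustrative assumptions, not taken from the snippet:

import numpy as np
import torch

doc_embeddings = np.load("doc_embeddings.npy").astype(np.float32)  # assumed artifact
dims = [doc_embeddings.shape[1], 500, 500, 2000, 20]  # illustrative autoencoder sizes
device = "cuda" if torch.cuda.is_available() else "cpu"
y_pred = run_clustering(doc_embeddings, dims, batch_size=16, n_epochs=50,
                        update_interval=80, tol=0.001, y_real=y, device=device)
print(np.bincount(y_pred))  # predicted cluster proportions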