Example no. 1
    def print_stats(self,
                    x,
                    y,
                    x_test,
                    y_test,
                    loss,
                    epoch,
                    logwriter,
                    prefix,
                    stats_path=None):
        q, _ = self.model.predict(x, verbose=0)
        # evaluate the clustering performance
        y_pred = q.argmax(1)

        acc = np.round(cluster_acc(y, y_pred), 5)
        nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
        ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
        loss = np.round(loss, 5)
        logdict = dict(iter=epoch,
                       acc=acc,
                       nmi=nmi,
                       ari=ari,
                       L=loss[0],
                       Lc=loss[1],
                       Lr=loss[2])
        logwriter.writerow(logdict)

        # compute constraint satisfaction; the must-link pairs (ml_ind1,
        # ml_ind2) and cannot-link pairs (cl_ind1, cl_ind2) are assumed to be
        # defined in an enclosing scope (they are neither parameters nor
        # attributes here)
        sat = 0.0
        if ml_ind1 is not None and cl_ind1 is not None and len(ml_ind1) + len(
                cl_ind1) > 0:
            for i in range(len(ml_ind1)):
                if y_pred[ml_ind1[i]] == y_pred[ml_ind2[i]]:
                    sat += 1.0
            for i in range(len(cl_ind1)):
                if y_pred[cl_ind1[i]] != y_pred[cl_ind2[i]]:
                    sat += 1.0
            sat /= float(len(ml_ind1) + len(cl_ind1))
        # default the test metrics so the summary print below cannot raise a
        # NameError when no test set is supplied
        acc_test = nmi_test = ari_test = float('nan')
        if x_test is not None and y_test is not None:
            q_test, _ = self.model.predict(x_test, verbose=0)
            # evaluate the clustering performance on the test set
            y_pred_test = q_test.argmax(1)

            acc_test = np.round(cluster_acc(y_test, y_pred_test), 5)
            nmi_test = np.round(
                metrics.normalized_mutual_info_score(y_test, y_pred_test), 5)
            ari_test = np.round(
                metrics.adjusted_rand_score(y_test, y_pred_test), 5)
        print(prefix, ' sat: ', sat, 'ari:', ari, 'acc:', acc, 'nmi:', nmi,
              '   ###   ari_test:', ari_test, 'acc_test:', acc_test,
              'nmi_test:', nmi_test)

        if stats_path is not None:
            with open(stats_path, "a+") as file:
                content = ';'.join(
                    str(v) for v in (self.dataset_name, prefix,
                                     self.save_suffix, sat, ari, acc, nmi,
                                     ari_test, acc_test, nmi_test)) + '\n'
                file.write(content)
        return sat
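
All of these snippets call cluster_acc without defining it. A minimal sketch of the usual DEC-keras implementation, assuming scipy is available (the projects' own versions may differ):

import numpy as np
from scipy.optimize import linear_sum_assignment

def cluster_acc(y_true, y_pred):
    # unsupervised clustering accuracy: best one-to-one mapping between
    # cluster indices and true labels via the Hungarian algorithm
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(-w)  # maximise matched counts
    return w[row_ind, col_ind].sum() / float(y_pred.size)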
Example no. 2
def train_dec(x, y, n_clusters, save_dir):
    batch_size = 256
    lr = 0.01
    momentum = 0.9
    tol = 0.001
    maxiter = 3e4
    update_interval = 1e3

    dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10],
              n_clusters=n_clusters,
              batch_size=batch_size)

    dec.initialize_model(optimizer=SGD(lr=lr, momentum=momentum),
                         ae_weights='../DEC-keras/ae_weights_snh.h5',
                         x=x)

    try:
        # reuse a previously trained model if final weights already exist;
        # the original referenced an undefined `dec_snh` here
        dec.load_weights(save_dir + '/DEC_model_final.h5')
        y_pred = dec.predict_clusters(x)
    except IOError:
        t0 = time()
        y_pred = dec.clustering(x,
                                y=y,
                                tol=tol,
                                maxiter=maxiter,
                                update_interval=update_interval,
                                save_dir=save_dir)
        print('clustering time: ', (time() - t0))
    print('acc:', cluster_acc(y, y_pred))
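
predict_clusters, used here and in the last example, is assumed to take the argmax of the soft assignment q, as in DEC-keras (for the two-output IDEC model of the later examples, the first output would be unpacked first):

    def predict_clusters(self, x):
        # hard cluster assignments from the soft distribution q (sketch of
        # the DEC-keras method; these snippets assume something equivalent)
        q = self.model.predict(x, verbose=0)
        return q.argmax(1)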
Example no. 3
def idec(dataset="mnist",
         gamma=0.1,
         maxiter=2e4,
         update_interval=20,
         tol=0.00001,
         batch_size=256):
    ae_weights = ("ae_weights/" + dataset + "_ae_weights/" + dataset +
                  "_ae_weights.h5")

    optimizer = SGD(lr=0.01, momentum=0.9)
    from datasets import load_mnist, load_usps, load_stl, load_cifar
    if dataset == 'mnist':  # recommended: n_clusters=10, update_interval=140
        x, y = load_mnist('./data/mnist/mnist.npz')
        update_interval = 140
    elif dataset == 'usps':  # recommended: n_clusters=10, update_interval=30
        x, y = load_usps('data/usps')
        update_interval = 30
    elif dataset == "stl":
        x, y = load_stl()
        update_interval = 20
    elif dataset == "cifar_10":
        x, y = load_cifar()
        update_interval = 140
    batch_size = 120  # note: unconditionally overrides the batch_size argument
    print(gamma, dataset)
    try:
        count = Counter(y)
    except TypeError:  # y is 2-d, so its rows are unhashable; count column 0
        count = Counter(y[:, 0])
    n_clusters = len(count)
    save_dir = 'results/idec_dataset:' + dataset + " gamma:" + str(gamma)

    # prepare the IDEC model
    idec = IDEC(dims=[x.shape[-1], 500, 500, 2000, 10],
                n_clusters=n_clusters,
                batch_size=batch_size)
    idec.initialize_model(ae_weights=ae_weights,
                          gamma=gamma,
                          optimizer=optimizer)
    plot_model(idec.model, to_file='idec_model.png', show_shapes=True)
    idec.model.summary()

    # begin clustering; timing does not include the pretraining part
    t0 = time()
    y_pred = idec.clustering(x,
                             y=y,
                             tol=tol,
                             maxiter=maxiter,
                             update_interval=update_interval,
                             save_dir=save_dir)
    print('acc:', cluster_acc(y, y_pred))
    print('clustering time: ', (time() - t0))
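
The load_* helpers come from the projects' datasets modules and are not shown here. As a rough sketch, load_mnist typically concatenates the train and test splits, flattens, and rescales (the exact scaling varies between the DEC and IDEC repositories):

import numpy as np
from keras.datasets import mnist

def load_mnist():
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x = np.concatenate((x_train, x_test))
    y = np.concatenate((y_train, y_test))
    x = x.reshape((x.shape[0], -1)).astype('float32') / 255.
    return x, y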
Example no. 4
def main():
    n_clusters = 10  # this is chosen based on prior knowledge of classes in the data set.
    batch_size = 256
    lr = 0.01  # learning rate
    momentum = 0.9
    # tolerance: clustering stops if less than this fraction of the data
    # changes cluster assignment between update intervals
    tol = 0.001

    maxiter = 2e4
    update_interval = 140
    save_dir = './results/dec'

    x, y = load_mnist()

    #training_set_sizes = [100]
    training_set_sizes = [500, 1000, 5000, 10000, 50000]
    # prepare the DEC model
    dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10],
              n_clusters=n_clusters,
              batch_size=batch_size)

    for training_set_size in training_set_sizes:
        x_train = x[:training_set_size]
        y_train = y[:training_set_size]
        ae_weights = './ae_weights_m%d.h5' % training_set_size
        dec.initialize_model(optimizer=SGD(lr=lr, momentum=momentum),
                             ae_weights=ae_weights,
                             x=x_train)
        t0 = time()
        y_pred = dec.clustering(x_train,
                                y=y_train,
                                tol=tol,
                                maxiter=maxiter,
                                update_interval=update_interval,
                                save_dir=save_dir + '/%d' % training_set_size)

        print('clustering time: ', (time() - t0))
        print('acc:', cluster_acc(y_train, y_pred))
Example no. 5
    def clustering(self,
                   x,
                   y=None,
                   tol=1e-3,
                   update_interval=140,
                   maxiter=2e4,
                   save_dir='./results/idec'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / self.batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # initialize cluster centers using k-means
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = y_pred
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/idec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if y is not None:
                    acc = np.round(cluster_acc(y, y_pred), 5)
                    nmi = np.round(
                        metrics.normalized_mutual_info_score(y, y_pred), 5)
                    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite,
                                   acc=acc,
                                   nmi=nmi,
                                   ari=ari,
                                   L=loss[0],
                                   Lc=loss[1],
                                   Lr=loss[2])
                    logwriter.writerow(logdict)
                    print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari',
                          ari, '; loss=', loss)

                # check stop criterion
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            if (index + 1) * self.batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * self.batch_size::],
                    y=[
                        p[index * self.batch_size::],
                        x[index * self.batch_size::]
                    ])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * self.batch_size:(index + 1) * self.batch_size],
                    y=[
                        p[index * self.batch_size:(index + 1) *
                          self.batch_size],
                        x[index * self.batch_size:(index + 1) *
                          self.batch_size]
                    ])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                # save IDEC model checkpoints
                print('saving model to:',
                      save_dir + '/IDEC_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/IDEC_model_' + str(ite) +
                                        '.h5')


        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/IDEC_model_final.h5')
        self.model.save_weights(save_dir + '/IDEC_model_final.h5')

        return y_pred
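
The auxiliary target distribution p used above comes from the DEC paper (Xie et al., 2016); in DEC/IDEC-keras it is implemented essentially as:

import numpy as np

def target_distribution(q):
    # p_ij = (q_ij^2 / f_j) / sum_j' (q_ij'^2 / f_j'), with soft cluster
    # frequencies f_j = sum_i q_ij; squaring sharpens q so that training
    # emphasises high-confidence assignments
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T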
Example no. 6
    # assumption: give `optimizer` a default so the usps/reuters branches do
    # not leave it undefined (SGD matches the other examples)
    optimizer = SGD(lr=0.01, momentum=0.9)
    if args.dataset == 'mnist':  # recommended: n_clusters=10, update_interval=140
        x, y = load_mnist()
        optimizer = 'adam'
    elif args.dataset == 'usps':  # recommended: n_clusters=10, update_interval=30
        x, y = load_usps('data/usps')
    elif args.dataset == 'reutersidf10k':  # recommended: n_clusters=4, update_interval=3
        x, y = load_reuters('data/reuters')

    # prepare the IDEC model
    idec = IDEC(dims=[x.shape[-1], 500, 500, 2000, 10],
                n_clusters=args.n_clusters,
                batch_size=args.batch_size)
    idec.initialize_model(ae_weights=args.ae_weights,
                          gamma=args.gamma,
                          optimizer=optimizer)
    plot_model(idec.model, to_file='idec_model.png', show_shapes=True)
    idec.model.summary()

    # begin clustering; timing does not include the pretraining part
    t0 = time()
    y_pred = idec.clustering(x,
                             y=y,
                             tol=args.tol,
                             maxiter=args.maxiter,
                             update_interval=args.update_interval,
                             save_dir=args.save_dir)
    print('acc:', cluster_acc(y, y_pred))
    print('clustering time: ', (time() - t0))
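
The 'clustering' layer whose weights are set to the k-means centres is, in DEC/IDEC-keras, a custom layer that turns embeddings into soft assignments q with a Student's t kernel. A condensed sketch (imports follow recent Keras 2; older versions import from keras.engine.topology):

from keras import backend as K
from keras.layers import Layer, InputSpec

class ClusteringLayer(Layer):
    # soft assignment q_ij of sample i to cluster j using a Student's
    # t-distribution kernel with one degree of freedom, as in the DEC paper

    def __init__(self, n_clusters, **kwargs):
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters

    def build(self, input_shape):
        self.input_spec = InputSpec(dtype=K.floatx(), ndim=2)
        self.clusters = self.add_weight(name='clusters',
                                        shape=(self.n_clusters,
                                               input_shape[1]),
                                        initializer='glorot_uniform')
        self.built = True

    def call(self, inputs, **kwargs):
        q = 1.0 / (1.0 + K.sum(
            K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2))
        return K.transpose(K.transpose(q) / K.sum(q, axis=1))

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.n_clusters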
Example no. 7
    def fit(self,
            x,
            y=None,
            batch_size=256,
            maxiter=2e4,
            tol=1e-3,
            update_interval=140,
            ae_weights=None,
            save_dir='./results/idec'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        save_interval = 50  # hard-coded override of the value computed above
        print('Save interval', save_interval)

        # Step 1: pretrain
        if not self.pretrained and ae_weights is None:
            print(
                '...pretraining autoencoders using default hyper-parameters:')
            print('   optimizer=\'adam\';   epochs=200')
            self.pretrain(x, batch_size)
            self.pretrained = True
        elif ae_weights is not None:
            self.autoencoder.load_weights(ae_weights)
            print('ae_weights is loaded successfully.')

        # Step 2: initialize cluster centers using k-means
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=4)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        # Step 3: deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/idec_log.csv', 'w')
        logwriter = csv.DictWriter(
            logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(
                    q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(cluster_acc(y, self.y_pred), 5)
                    nmi = np.round(
                        metrics.normalized_mutual_info_score(y, self.y_pred),
                        5)
                    ari = np.round(metrics.adjusted_rand_score(y, self.y_pred),
                                   5)
                    loss = np.round(loss, 5)
                    logwriter.writerow(
                        dict(iter=ite,
                             acc=acc,
                             nmi=nmi,
                             ari=ari,
                             L=loss[0],
                             Lc=loss[1],
                             Lr=loss[2]))
                    print(
                        'Iter-%d: ACC= %.4f, NMI= %.4f, ARI= %.4f;  L= %.5f, Lc= %.5f,  Lr= %.5f'
                        % (ite, acc, nmi, ari, loss[0], loss[1], loss[2]))

                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(
                    np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size::],
                    y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(
                    x=x[index * batch_size:(index + 1) * batch_size],
                    y=[
                        p[index * batch_size:(index + 1) * batch_size],
                        x[index * batch_size:(index + 1) * batch_size]
                    ])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                # save IDEC model checkpoints
                print('saving model to: ' + save_dir + '/IDEC_model_' +
                      str(ite) + '.h5')
                self.model.save_weights(save_dir + '/IDEC_model_' + str(ite) +
                                        '.h5')


        # save the trained model
        logfile.close()
        print('saving model to: ' + save_dir + '/IDEC_model_final.h5')
        self.model.save_weights(save_dir + '/IDEC_model_final.h5')

        return self.y_pred
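
Examples 5 and 7 unpack q, _ = self.model.predict(x) and train with y=[p, x]: the IDEC model has two outputs, the soft assignments and the autoencoder reconstruction. A sketch of the usual wiring, assuming encoder and autoencoder share the same input tensor (the function name is hypothetical):

from keras.models import Model

def build_idec_model(autoencoder, encoder, n_clusters):
    # attach the clustering layer to the encoder output; the combined model
    # is compiled with losses ['kld', 'mse'] weighted [gamma, 1] (next example)
    q = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
    return Model(inputs=autoencoder.input,
                 outputs=[q, autoencoder.output])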
Example no. 8
    idec.compile(loss=['kld', 'mse'],
                 loss_weights=[args.gamma, 1],
                 optimizer=optimizer)
    idec.fit(x,
             y=y,
             batch_size=args.batch_size,
             tol=args.tol,
             maxiter=args.maxiter,
             update_interval=args.update_interval,
             ae_weights=args.ae_weights,
             save_dir=args.save_dir)

    # Show the final results
    y_pred = idec.y_pred
    print(y_pred)
    print('acc:', cluster_acc(y, y_pred))
    print('clustering time: %d seconds.' % int(time() - t0))
    embed = idec.encoder.predict(x)

    year = 1999
    emd_file = open("deep_embedding", 'w')
    for embedding in embed:
        emd_file.write(str(year) + ':')
        for vals in embedding:
            emd_file.write('\t' + str(vals))
        emd_file.write('\n')
        year += 1
    emd_file.close()

    year = 1999
    cl_file = open("deep_clustering", 'w')
Example no. 9
def main():

    # constants
    batch_size = 256
    lr = 0.01
    momentum = 0.9
    tol = 0.001
    maxiter = 2e4
    update_interval = 140

    n_clusters = 10
    n_classes = 10

    lcolours = ['#D6FF79', '#B0FF92', '#A09BE7', '#5F00BA', '#56CBF9', \
                '#F3C969', '#ED254E', '#CAA8F5', '#D9F0FF', '#46351D']
    labels = [str(i) for i in range(n_clusters)]

    ae_weights = '../../../../DEC-keras/results/mnist/ae_weights.h5'
    dec_weights = '../../../../DEC-keras/results/mnist/%d/DEC_model_final.h5' % n_clusters

    # load mnist data set
    x, y = load_mnist()
    # split the data into training, validation and test sets
    m = x.shape[0]
    m = m - 20000
    sample_frac = 0.01
    split = int(sample_frac * m)
    print(split)
    x_train = x[:split]
    y_train = y[:split]
    x_valid = x[50000:60000]
    y_valid = y[50000:60000]
    x_test = x[60000:]
    y_test = y[60000:]

    # load pretrained DEC model
    dec = load_mnist_dec(x, ae_weights, dec_weights, n_clusters, \
      batch_size, lr, momentum)

    # predict training set cluster assignments
    y_pred = dec.predict_clusters(x_train)

    # inspect the clustering and simulate volunteer labelling of a random
    # sample (the training set)
    cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
      get_cluster_to_label_mapping(y_train, y_pred, n_classes, n_clusters)
    print(cluster_acc(y_train, y_pred))
    y_valid_pred = dec.predict_clusters(x_valid)
    print(cluster_acc(y_valid, y_valid_pred))

    # extract the cluster centres
    cluster_centres = get_cluster_centres(dec)

    # determine current unlabelled samples
    y_plot = np.array(y[:m], dtype='int')
    y_plot[split:] = -1

    # reduce embedding to 2D and plot labelled and unlabelled training set samples
    #pca_plot(dec.encoder, x[:m], cluster_centres, y=y_plot, labels=labels, \
    #           lcolours=lcolours)

    # get siamese training pairs
    im, cc, ls, cluster_to_label_mapping = \
      get_pairs_auto(dec, x_train, y_train, cluster_centres, \
        cluster_to_label_mapping, majority_class_fractions, n_clusters)

    #im, cc, ls, cluster_to_label_mapping = \
    #  get_pairs_auto_with_noise(dec, x_train, y_train, cluster_centres, \
    #    cluster_to_label_mapping, majority_class_fractions, n_clusters)
    """
  mcheckpointer = ModelCheckpoint(filepath='saved_models/weights.best..hdf5', \
                                  verbose=1, save_best_only=True)

  base_network = Model(dec.model.input, \
    dec.model.get_layer('encoder_%d' % (dec.n_stacks - 1)).output)
  fcheckpointer = FrameDumpCallback(base_network, x, cluster_centres, \
    './video', y=y_plot, labels=labels, lcolours=lcolours)
  """
    #callbacks = [mcheckpointer, fcheckpointer]
    callbacks = []

    model, base_network = train_siamese(dec, cluster_centres, im, cc, ls, \
      epochs=5, split_frac=0.75, callbacks=callbacks)
    #model, base_network = train_siamese_online(dec, x, cluster_centres, im, cc, ls, \
    #  epochs=1, split_frac=0.75, callbacks=[])

    y_pred = dec.predict_clusters(x_valid)

    cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
      get_cluster_to_label_mapping(y_valid, y_pred, n_classes, n_clusters)
    print(cluster_acc(y_valid, y_pred))
    #pca_plot(dec.encoder, x_valid, cluster_centres, y=y_valid, labels=labels, \
    #           lcolours=lcolours)

    y_pred = dec.predict_clusters(x[:m])
    print(np.argmin(majority_class_fractions))

    for j in range(1, 6):
        selection = np.where(
            y_pred[j * split:(j + 1) *
                   split] == np.argmin(majority_class_fractions))
        x_train = np.concatenate(
            (x_train, x[:m][j * split:(j + 1) * split][selection]))
        y_train = np.concatenate(
            (y_train, y[:m][j * split:(j + 1) * split][selection]))

        im, cc, ls, cluster_to_label_mapping = \
          get_pairs_auto(dec, x_train, y_train, cluster_centres, \
            cluster_to_label_mapping, majority_class_fractions, n_clusters)

        callbacks = []

        model, base_network = train_siamese(dec, cluster_centres, im, cc, ls, \
          epochs=1, split_frac=0.75, callbacks=callbacks)

        #x_train = x[:2*split]
        #y_train = y[:2*split]
        #y_pred = dec.predict_clusters(x_train)

        #cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
        #  get_cluster_to_label_mapping(y_train, y_pred, n_classes, n_clusters)

        y_pred = dec.predict_clusters(x_valid)

        cluster_to_label_mapping, n_assigned_list, majority_class_fractions = \
          get_cluster_to_label_mapping(y_valid, y_pred, n_classes, n_clusters)
        print(cluster_acc(y_valid, y_pred))
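
get_cluster_to_label_mapping is specific to this project and not shown; judging from how its three return values are used, it plausibly maps each cluster to its majority true class. A hypothetical sketch (names and exact return format are assumptions):

import numpy as np

def get_cluster_to_label_mapping(y, y_pred, n_classes, n_clusters):
    cluster_to_label_mapping = []
    n_assigned_list = []
    majority_class_fractions = []
    for c in range(n_clusters):
        # true labels of the samples assigned to cluster c
        members = np.asarray(y)[np.asarray(y_pred) == c].astype(int)
        n_assigned_list.append(members.size)
        if members.size == 0:
            cluster_to_label_mapping.append(-1)
            majority_class_fractions.append(np.nan)
            continue
        counts = np.bincount(members, minlength=n_classes)
        cluster_to_label_mapping.append(int(counts.argmax()))
        majority_class_fractions.append(counts.max() / float(counts.sum()))
    return cluster_to_label_mapping, n_assigned_list, majority_class_fractions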