Example #1
# Imports needed by this snippet (Python 2, hence cPickle); K (number of
# clusters) and trials (number of repeated runs) are assumed to be defined
# elsewhere.
import gzip
import cPickle
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.manifold import SpectralEmbedding
from cluster_acc import acc

filename = 'pendigits.pkl.gz'
path = '/home/bo/Data/Pendigits/'
dataset = path + filename

# Perform k-means (KM) on the raw features, repeated over several trials

with gzip.open(dataset, 'rb') as f:
    train_x, train_y = cPickle.load(f)

km_model = KMeans(n_clusters=K, n_init=1)
results_KM = np.zeros((trials, 3))
for i in range(trials):
    ypred = km_model.fit_predict(train_x)
    nmi = metrics.adjusted_mutual_info_score(train_y, ypred)  # adjusted MI, stored as 'nmi'
    ari = metrics.adjusted_rand_score(train_y, ypred)
    ac = acc(ypred, train_y)
    results_KM[i] = np.array([nmi, ari, ac])

KM_mean = np.mean(results_KM, axis=0)
KM_std = np.std(results_KM, axis=0)
# Perform spectral clustering (SC): spectral embedding followed by k-means in the embedded space
print('SC started...')
results_SC = np.zeros((trials, 3))
se_model = SpectralEmbedding(n_components=K, affinity='rbf', gamma=0.1)
se_vec = se_model.fit_transform(train_x)
for i in range(trials):
    ypred = km_model.fit_predict(se_vec)
    nmi = metrics.adjusted_mutual_info_score(train_y, ypred)
    ari = metrics.adjusted_rand_score(train_y, ypred)
    ac = acc(ypred, train_y)
    results_SC[i] = np.array([nmi, ari, ac])
Example #2
import numpy as np
from sklearn import metrics
from cluster_acc import acc

# Evaluate the inferred cluster assignments my_dp.z against the ground-truth
# labels train_label (both assumed to be defined earlier).
nmi = metrics.normalized_mutual_info_score(train_label, my_dp.z)
ari = metrics.adjusted_rand_score(train_label, my_dp.z)
label_uni = len(np.unique(my_dp.z))
label_true_uni = len(np.unique(train_label))
# acc() asserts that the number of predicted clusters matches the ground
# truth, so only call it when the predicted cluster count does not exceed
# the true one.
if label_uni < label_true_uni + 1:
    acc_value = acc(my_dp.z, train_label)
else:
    acc_value = 0
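Both examples above call acc from cluster_acc, which is not included in the snippets. Below is a minimal sketch of what it is assumed to compute: unsupervised clustering accuracy under the best one-to-one matching between predicted and true labels, found with the Hungarian algorithm (here via scipy.optimize.linear_sum_assignment). The actual implementation in cluster_acc may differ.

import numpy as np
from scipy.optimize import linear_sum_assignment

def acc(ypred, y):
    # Unsupervised clustering accuracy: find the cluster-to-label matching
    # that maximizes the number of correctly assigned samples.
    ypred = np.asarray(ypred).astype(np.int64)
    y = np.asarray(y).astype(np.int64)
    # mirrors the AssertionError caught in the examples when the number of
    # predicted clusters does not match the ground truth
    assert len(np.unique(ypred)) == len(np.unique(y))
    D = max(ypred.max(), y.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(len(ypred)):
        w[ypred[i], y[i]] += 1
    row_ind, col_ind = linear_sum_assignment(-w)  # maximize the matched counts
    return w[row_ind, col_ind].sum() / float(len(ypred))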
Example #3
# Imports assumed by this example (Python 2). SdC, load_data_shared, and
# batch_km are project-specific helpers (the stacked autoencoder model, a
# shared-variable dataset loader, and the mini-batch k-means update) that
# are not shown here.
import os
import sys
import gzip
import timeit
import cPickle
import numpy
import theano
import theano.tensor as T
from sklearn.cluster import KMeans
from sklearn import metrics
from cluster_acc import acc


def test_SdC(Init='', lbd=.01, output_dir='MNIST_results', save_file='', beta=1,
             finetune_lr=.005, mu=0.9, pretraining_epochs=50,
             pretrain_lr=.001, training_epochs=150,
             dataset='toy.pkl.gz', batch_size=20, nClass=4,
             hidden_dim=[100, 50, 2], diminishing=True):
    """
    :type Init: string
    :param Init: filename of a saved network to load as the initialization of
                 the network. Leave it as an empty string if no saved network
                 is available. If the specified file cannot be found, the
                 network is initialized randomly.
                 
    :type lbd: float
    :param lbd: tuning parameter multiplying the reconstruction error, i.e. the
                larger lbd, the larger the weight on minimizing the
                reconstruction error.
                
    :type output_dir: string
    :param output_dir: the directory in which to save the trained network
    
    :type save_file: string
    :param save_file: the filename under which to save the trained network
    
    :type beta: float
    :param beta: weight of the clustering term; set to 0 if a pure SAE
                 (without the clustering regularization) is intended.
                
    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage
    (factor for the stochastic gradient)
    
    :type mu: float
    :param mu: extrapolation parameter used for implementing Nesterov-type acceleration
    
    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs of pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training
    
    :type training_epochs: int
    :param training_epochs: number of epochs of fine-tuning (optimization)

    :type dataset: string
    :param dataset: path of the pickled dataset
    
    :type batch_size: int
    :param batch_size: number of data samples in one minibatch
    
    :type nClass: int
    :param nClass: number of clusters
    
    :type hidden_dim: array
    :param hidden_dim: number of neurons in each hidden layer of the forward
                       network; the reconstruction part has a mirror-image
                       structure

    :type diminishing: boolean
    :param diminishing: whether to reduce the learning rate during
                        optimization; if True, the learning rate is halved
                        every 10 epochs.
    """    
    datasets = load_data_shared(dataset, batch_size)  
    
    working_dir = os.getcwd()
    train_set_x,  train_set_y  = datasets[0]    
    inDim = train_set_x.get_value().shape[1]
    label_true = numpy.squeeze(numpy.int32(train_set_y.get_value(borrow=True)))
    
    index = T.lscalar() 
    x = T.matrix('x')
    
    # compute the number of minibatches for training
    n_train_samples = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = n_train_samples // batch_size

    # numpy random generator
    # start-snippet-3
    numpy_rng = numpy.random.RandomState(89677)
#    numpy_rng = numpy.random.RandomState()
    print '... building the model'
    try:
        os.chdir(output_dir)
    except OSError:
        os.mkdir(output_dir)
        os.chdir(output_dir)
    # construct the stacked denoising autoencoder class
    if Init == '':
        sdc = SdC(
            numpy_rng=numpy_rng,
            n_ins=inDim,
            lbd = lbd, 
            beta = beta,
            input=x,
            hidden_layers_sizes= hidden_dim
        )
    else:
        try:
            with gzip.open(Init, 'rb') as f:
                saved_params = cPickle.load(f)['network']
            sdc = SdC(
                    numpy_rng=numpy_rng,
                    n_ins=inDim,
                    lbd = lbd, 
                    beta = beta,
                    input=x,
                    hidden_layers_sizes= hidden_dim,
                    Param_init = saved_params
                )
            print '... loading saved network succeeded'
        except IOError:
            print >> sys.stderr, ('Cannot find the specified saved network, using random initializations.')
            sdc = SdC(
                    numpy_rng=numpy_rng,
                    n_ins=inDim,
                    lbd = lbd, 
                    beta = beta,
                    input=x,
                    hidden_layers_sizes= hidden_dim
                )           
        
    #########################
    # PRETRAINING THE MODEL #
    #########################
    if pretraining_epochs == 0 or Init != '':
        print '... skipping pretraining'
    else:       
        print '... getting the pretraining functions'
        pretraining_fns = sdc.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size, mu = mu)

        print '... pre-training the model'
        start_time = timeit.default_timer()
        ## Pre-train layer-wise
        corruption_levels = 0*numpy.ones(len(hidden_dim), dtype = numpy.float32)
        
        pretrain_lr_shared = theano.shared(numpy.asarray(pretrain_lr,
                                                       dtype='float32'),
                                         borrow=True)
        for i in xrange(sdc.n_layers):
            # go through pretraining epochs
            iter = 0
            for epoch in xrange(pretraining_epochs):
                # go through the training set  
                c = []  
                for batch_index in xrange(n_train_batches):
                    iter = (epoch) * n_train_batches + batch_index 
                    pretrain_lr_shared.set_value( numpy.float32(pretrain_lr) )
                    cost = pretraining_fns[i](index=batch_index,
                             corruption=corruption_levels[i],
                             lr=pretrain_lr_shared.get_value())                         
                    c.append(cost)
                    
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)

        end_time = timeit.default_timer()

        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
        
        network = [param.get_value() for param in sdc.params]    
        package = {'network': network}            
        with gzip.open('deepclus_'+str(nClass)+ '_pretrain.pkl.gz', 'wb') as f:
            cPickle.dump(package, f, protocol=cPickle.HIGHEST_PROTOCOL)

    ########################
    # FINETUNING THE MODEL #
    ########################
    
    
    km = KMeans(n_clusters = nClass)   
    
    out = sdc.get_output()
    out_sdc = theano.function(
        [index],
        outputs = out,
        givens = {x: train_set_x[index * batch_size: (index + 1) * batch_size]}
    )  
    hidden_val = [] 
    for batch_index in xrange(n_train_batches):
         hidden_val.append(out_sdc(batch_index))
    
    hidden_array  = numpy.asarray(hidden_val)
    hidden_size = hidden_array.shape        
    hidden_array = numpy.reshape(hidden_array, (hidden_size[0] * hidden_size[1], hidden_size[2] ))

    hidden_zero = numpy.zeros_like(hidden_array)
    
    zeros_count = numpy.sum(numpy.equal(hidden_array, hidden_zero), axis = 0)       
    
    # Do a k-means clustering on the hidden representations to get the initial centers
    km_idx = km.fit_predict(hidden_array)
    centers = km.cluster_centers_.astype(numpy.float32)     
    center_shared =  theano.shared(numpy.zeros((batch_size, hidden_dim[-1]) ,
                                                   dtype='float32'),
                                     borrow=True)
    nmi = metrics.normalized_mutual_info_score(label_true, km_idx)
    print >> sys.stderr, ('Initial NMI for deep clustering: %.2f' % (nmi))
    
    ari = metrics.adjusted_rand_score(label_true, km_idx)
    print >> sys.stderr, ('ARI for deep clustering: %.2f' % (ari))
    
    try:
        ac = acc(km_idx, label_true)
    except AssertionError:
        ac = 0
        print('Number of predicted clusters does not match the ground truth.')
        
    print >> sys.stderr, ('ACC for deep clustering: %.2f' % (ac))                              
    lr_shared = theano.shared(numpy.asarray(finetune_lr,
                                                   dtype='float32'),
                                     borrow=True)

    print '... getting the finetuning functions'   
       
    train_fn = sdc.build_finetune_functions(
        datasets=datasets,
        centers=center_shared ,
        batch_size=batch_size,
        mu = mu,
        learning_rate=lr_shared
    )

    print '... fine-tuning the model'

    start_time = timeit.default_timer()
    done_looping = False
    epoch = 0
    
    res_metrics = numpy.zeros((training_epochs/5 + 1, 3), dtype = numpy.float32)
    res_metrics[0] = numpy.array([nmi, ari, ac])
    
    count = 100*numpy.ones(nClass, dtype = numpy.int)
    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1    
        c = [] # total cost
        d = [] # cost of reconstruction    
        e = [] # cost of clustering 
        f = [] # learning_rate
        g = []
        # count the number of data samples assigned to each cluster;
        # re-initialize a centroid at random if an empty cluster occurs
        count_samples = numpy.zeros((nClass)) 
        for minibatch_index in xrange(n_train_batches):
            # calculate the stepsize
            iter = (epoch - 1) * n_train_batches + minibatch_index 
            lr_shared.set_value( numpy.float32(finetune_lr) )
            center_shared.set_value(centers[km_idx[minibatch_index * batch_size: (minibatch_index +1 ) * batch_size]])
#            lr_shared.set_value( numpy.float32(finetune_lr/numpy.sqrt(epoch)) )
            cost = train_fn(minibatch_index)
            hidden_val = out_sdc(minibatch_index) # get the hidden value, to update KM
            # Perform mini-batch KM
            temp_idx, centers, count = batch_km(hidden_val, centers, count)
#            for i in range(nClass):
#                count_samples[i] += temp_idx.shape[0] - numpy.count_nonzero(temp_idx - i)             
#            center_shared.set_value(numpy.float32(temp_center))
            km_idx[minibatch_index * batch_size: (minibatch_index +1 ) * batch_size] = temp_idx
            c.append(cost[0])
            d.append(cost[1])
            e.append(cost[2])
            f.append(cost[3])
            g.append(cost[4])

        # check whether an empty cluster occurred; if so, re-initialize its centroid at random
#        for i in range(nClass):
#            if count_samples[i] == 0:
#                rand_idx = numpy.random.randint(low = 0, high = n_train_samples)
#                # modify the centroid
#                centers[i] = out_single(rand_idx)                
        
        print 'Fine-tuning epoch %d ++++ \n' % (epoch), 
        print ('Total cost: %.5f, '%(numpy.mean(c)) + 'Reconstruction: %.5f, ' %(numpy.mean(d)) 
            + "Clustering: %.5f, " %(numpy.mean(e)) )
#        print 'Learning rate: %.6f' %numpy.mean(f)
        
        # halve the learning rate every 10 epochs
        if epoch % 10 == 0 and diminishing:
            finetune_lr /= 2
            
        # evaluate the clustering performance every 5 epochs
        if epoch % 5 == 0:            
            nmi = metrics.normalized_mutual_info_score(label_true, km_idx)                
            ari = metrics.adjusted_rand_score(label_true, km_idx)                
            try:
                ac = acc(km_idx, label_true)
            except AssertionError:
                ac = 0
                print('Number of predicted clusters does not match the ground truth.')
            res_metrics[epoch/5] = numpy.array([nmi, ari, ac])

    # get the hidden values, to make a plot
    hidden_val = [] 
    for batch_index in xrange(n_train_batches):
         hidden_val.append(out_sdc(batch_index))    
    hidden_array  = numpy.asarray(hidden_val)
    hidden_size = hidden_array.shape        
    hidden_array = numpy.reshape(hidden_array, (hidden_size[0] * hidden_size[1], hidden_size[2] ))
        
    err = numpy.mean(d)
    print >> sys.stderr, ('Average squared 2-D reconstruction error: %.4f' %err)
    end_time = timeit.default_timer()
    ypred = km_idx
    
    nmi = metrics.normalized_mutual_info_score(label_true, ypred)
    print >> sys.stderr, ('NMI for deep clustering: %.2f' % (nmi))

    ari = metrics.adjusted_rand_score(label_true, ypred)
    print >> sys.stderr, ('ARI for deep clustering: %.2f' % (ari))
    
    try:
        ac = acc(ypred, label_true)
    except AssertionError:
        ac = 0
        print('Number of predicted clusters does not match the ground truth.')
        
    print >> sys.stderr, ('ACC for deep clustering: %.2f' % (ac))
    
    config = {'lbd': lbd,   
              'beta': beta,
              'pretraining_epochs': pretraining_epochs,
              'pretrain_lr': pretrain_lr, 
              'mu': mu,
              'finetune_lr': finetune_lr, 
              'training_epochs': training_epochs,
              'dataset': dataset, 
              'batch_size': batch_size, 
              'nClass': nClass, 
              'hidden_dim': hidden_dim}
    results = {'result': res_metrics}
    network = [param.get_value() for param in sdc.params]
    
    package = {'config': config,
               'results': results,
               'network': network}
    with gzip.open(save_file, 'wb') as f:          
        cPickle.dump(package, f, protocol=cPickle.HIGHEST_PROTOCOL)
        
    os.chdir(working_dir)    
    print >> sys.stderr, ('The training code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))   
    
    return res_metrics             
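The fine-tuning loop above alternates gradient steps with a mini-batch k-means update through batch_km, which is not included in the snippet. Below is a minimal sketch of the kind of update it is assumed to perform, a Sculley-style mini-batch k-means step with a per-centroid step size of 1/count; the actual batch_km used by the example may differ.

import numpy as np

def batch_km(data, centers, count):
    # Mini-batch k-means step: assign each sample to its nearest centroid,
    # then move that centroid toward the sample with step size 1/count,
    # so centroids move less as they accumulate more samples.
    idx = np.zeros(data.shape[0], dtype=np.int32)
    for i in range(data.shape[0]):
        dist = np.sum((centers - data[i]) ** 2, axis=1)
        j = np.argmin(dist)
        idx[i] = j
        count[j] += 1
        eta = 1.0 / count[j]
        centers[j] = (1.0 - eta) * centers[j] + eta * data[i]
    return idx, centers, count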