Example #1
0
def compute_alc(valid_repr, test_repr):
    """
    Returns the ALC of the valid set VS test set
    Note: This proxy won't work in the case of transductive learning
    (This is an assumption) but it seems to be a good proxy in the
    normal case (i.e only train on training set)
    """

    # Concatenate the sets, and give different one hot labels for valid and test
    n_valid = valid_repr.shape[0]
    n_test = test_repr.shape[0]

    _labvalid = numpy.hstack((numpy.ones((n_valid, 1)),
                              numpy.zeros((n_valid, 1))))
    _labtest = numpy.hstack((numpy.zeros((n_test, 1)),
                             numpy.ones((n_test, 1))))

    dataset = numpy.vstack((valid_repr, test_repr))
    label = numpy.vstack((_labvalid, _labtest))

    print '... computing the ALC'
    return embed.score(dataset, label)
Example #2
0
def eval_ALC_test_val(dataset, save_dir_model, save_dir_plot,
    normalize_on_the_fly = False, do_pca = False, type = 'both'):
    """
    Returns the ALC of the valid set VS test set
    Note: This proxy won't work in the case of transductive learning
    (This is an assumption) but it seems to be a good proxy in the
    normal case (i.e only train on training set)
    * type can be erick or yann or both
    so you can check those two ways of computing the ALC get the same results
    """

    if type not in set(['yann', 'erick', 'both']):
        raise('type should be in [yann, erick, both]')
    alc_erick, alc_yann = 10000, 10000

    # load the dataset
    datasets = load_data(dataset, not normalize_on_the_fly, normalize_on_the_fly)
    valid_set_x = datasets[1]
    test_set_x = datasets[2]

    # load the model
    da = dA()
    da.load(save_dir_model)

    # theano functions to get representations of the dataset learned by the model
    index = T.lscalar()    # index to a [mini]batch
    x = theano.tensor.matrix('input')

    get_rep_valid = theano.function([index], da.get_hidden_values(x), updates = {},
        givens = {x:valid_set_x},
        name = 'get_rep_valid')
    get_rep_test = theano.function([index], da.get_hidden_values(x), updates = {},
        givens = {x:test_set_x},
        name = 'get_rep_test')

    # valid and test representations
    valid_rep1 = get_rep_valid(0)
    test_rep1 = get_rep_test(0)

    print numpy.histogram(valid_rep1)
    # TODO: Create submission for *both* PCA'd and non-PCA'd representations?
    if do_pca:
        pca_block = pca.PCA()
        pca_block.load(save_dir_model)
        valid_rep1 = pca_block(valid_rep1)

        pca_block = pca.PCA()
        pca_block.load(save_dir_model)
        test_rep1 = pca_block(test_rep1)

    if type == 'yann' or 'both':
        alc_yann = hebbian_learner(valid_rep1, test_rep1)
        print 'ALC computed by Yann', alc_yann 
    # build the whole dataset and give a one different one hot for each sample
    #from the valid [1,0] VS test [0,1]   
    n_val  = valid_rep1.shape[0]
    n_test = test_rep1.shape[0]

    _labval = numpy.hstack((numpy.ones((n_val,1)), numpy.zeros((n_val,1))))
    _labtest = numpy.hstack((numpy.zeros((n_test,1)), numpy.ones((n_test,1))))
     
    dataset = numpy.vstack((valid_rep1, test_rep1))
    label = numpy.vstack((_labval,_labtest))
    print '... computing the ALC'

    if type == 'erick' or 'both':
        alc_erick = score(dataset, label)
        print 'ALC computed by Erick', alc_erick   

    return alc_erick, alc_yann
Example #3
0
def eval_ALC_test_val(dataset,
                      save_dir_model,
                      save_dir_plot,
                      normalize_on_the_fly=False,
                      do_pca=False,
                      type='both'):
    """
    Returns the ALC of the valid set VS test set
    Note: This proxy won't work in the case of transductive learning
    (This is an assumption) but it seems to be a good proxy in the
    normal case (i.e only train on training set)
    * type can be erick or yann or both
    so you can check those two ways of computing the ALC get the same results
    """

    if type not in set(['yann', 'erick', 'both']):
        raise ('type should be in [yann, erick, both]')
    alc_erick, alc_yann = 10000, 10000

    # load the dataset
    datasets = load_data(dataset, not normalize_on_the_fly,
                         normalize_on_the_fly)
    valid_set_x = datasets[1]
    test_set_x = datasets[2]

    # load the model
    da = dA()
    da.load(save_dir_model)

    # theano functions to get representations of the dataset learned by the model
    index = T.lscalar()  # index to a [mini]batch
    x = theano.tensor.matrix('input')

    get_rep_valid = theano.function([index],
                                    da.get_hidden_values(x),
                                    updates={},
                                    givens={x: valid_set_x},
                                    name='get_rep_valid')
    get_rep_test = theano.function([index],
                                   da.get_hidden_values(x),
                                   updates={},
                                   givens={x: test_set_x},
                                   name='get_rep_test')

    # valid and test representations
    valid_rep1 = get_rep_valid(0)
    test_rep1 = get_rep_test(0)

    print numpy.histogram(valid_rep1)
    # TODO: Create submission for *both* PCA'd and non-PCA'd representations?
    if do_pca:
        pca_block = pca.PCA()
        pca_block.load(save_dir_model)
        valid_rep1 = pca_block(valid_rep1)

        pca_block = pca.PCA()
        pca_block.load(save_dir_model)
        test_rep1 = pca_block(test_rep1)

    if type == 'yann' or 'both':
        alc_yann = hebbian_learner(valid_rep1, test_rep1)
        print 'ALC computed by Yann', alc_yann
    # build the whole dataset and give a one different one hot for each sample
    #from the valid [1,0] VS test [0,1]
    n_val = valid_rep1.shape[0]
    n_test = test_rep1.shape[0]

    _labval = numpy.hstack((numpy.ones((n_val, 1)), numpy.zeros((n_val, 1))))
    _labtest = numpy.hstack((numpy.zeros((n_test, 1)), numpy.ones(
        (n_test, 1))))

    dataset = numpy.vstack((valid_rep1, test_rep1))
    label = numpy.vstack((_labval, _labtest))
    print '... computing the ALC'

    if type == 'erick' or 'both':
        alc_erick = score(dataset, label)
        print 'ALC computed by Erick', alc_erick

    return alc_erick, alc_yann
Example #4
0
def main_train(epochs, batchsize, solution="", sparse_penalty=0, sparsityTarget=0, sparsityTargetPenalty=0):

    # Experiment specific arguments
    conf_dataset = {
        "dataset": "avicenna",
        "expname": "dummy",  # Used to create the submission file
        "transfer": True,
        "normalize": True,  # (Default = True)
        "normalize_on_the_fly": False,  # (Default = False)
        "randomize_valid": True,  # (Default = True)
        "randomize_test": True,  # (Default = True)
        "saving_rate": 0,  # (Default = 0)
        "savedir": "./outputs",
    }

    # First layer = PCA-75 whiten
    pca_layer = {
        "name": "1st-PCA",
        "num_components": 75,
        "min_variance": -50,
        "whiten": True,
        "pca_class": "CovEigPCA",
        # Training properties
        "proba": [1, 0, 0],
        "savedir": "./outputs",
    }

    # Load the dataset
    data = utils.load_data(conf_dataset)

    if conf_dataset["transfer"]:
        # Data for the ALC proxy
        label = data[3]
        data = data[:3]

    # First layer : train or load a PCA
    pca = create_pca(conf_dataset, pca_layer, data, model=pca_layer["name"])
    data = [utils.sharedX(pca.function()(set.get_value(borrow=True)), borrow=True) for set in data]
    """
    if conf_dataset['transfer']:
        data_train, label_train = utils.filter_labels(data[0], label)
      
        alc = embed.score(data_train, label_train)
        print '... resulting ALC on train (for PCA) is', alc
    """

    nvis = utils.get_constant(data[0].shape[1]).item()

    conf = {
        "corruption_level": 0.1,
        "nhid": 200,
        "nvis": nvis,
        "anneal_start": 100,
        "base_lr": 0.001,
        "tied_weights": True,
        "act_enc": "sigmoid",
        "act_dec": None,
        #'lr_hb': 0.10,
        #'lr_vb': 0.10,
        "tied_weights": True,
        "solution": solution,
        "sparse_penalty": sparse_penalty,
        "sparsityTarget": sparsityTarget,
        "sparsityTargetPenalty": sparsityTargetPenalty,
        "irange": 0,
    }

    # A symbolic input representing your minibatch.
    minibatch = tensor.matrix()

    # Allocate a denoising autoencoder with binomial noise corruption.
    corruptor = GaussianCorruptor(conf["corruption_level"])
    da = DenoisingAutoencoder(
        corruptor,
        conf["nvis"],
        conf["nhid"],
        conf["act_enc"],
        conf["act_dec"],
        conf["tied_weights"],
        conf["solution"],
        conf["sparse_penalty"],
        conf["sparsityTarget"],
        conf["sparsityTargetPenalty"],
    )

    # Allocate an optimizer, which tells us how to update our model.
    # TODO: build the cost another way
    cost = SquaredError(da)(minibatch, da.reconstruct(minibatch)).mean()
    trainer = SGDOptimizer(da, conf["base_lr"], conf["anneal_start"])
    updates = trainer.cost_updates(cost)

    # Finally, build a Theano function out of all this.
    train_fn = theano.function([minibatch], cost, updates=updates)

    # Suppose we want minibatches of size 10
    proba = utils.getboth(conf, pca_layer, "proba")
    iterator = BatchIterator(data, proba, batchsize)

    # Here's a manual training loop. I hope to have some classes that
    # automate this a litle bit.
    final_cost = 0
    for epoch in xrange(epochs):
        c = []
        for minibatch_data in iterator:
            minibatch_err = train_fn(minibatch_data)
            c.append(minibatch_err)
        final_cost = numpy.mean(c)
        print "epoch %d, cost : %f" % (epoch, final_cost)

    print "############################## Fin de l'experience ############################"
    print "Calcul de l'ALC : "
    if conf_dataset["transfer"]:
        data_train, label_train = utils.filter_labels(data[0], label)
        alc = embed.score(data_train, label_train)

        print "Solution : ", solution
        print "sparse_penalty = ", sparse_penalty
        print "sparsityTarget = ", sparsityTarget
        print "sparsityTargetPenalty = ", sparsityTargetPenalty
        print "Final denoising error is : ", final_cost
        print "... resulting ALC on train is", alc
        return (alc, final_cost)
Example #5
0
def main_train(epochs, batchsize, solution='',sparse_penalty=0,sparsityTarget=0,sparsityTargetPenalty=0):
    
    # Experiment specific arguments
    conf_dataset = {'dataset' : 'avicenna',
                    'expname' : 'dummy', # Used to create the submission file
                    'transfer' : True,
                    'normalize' : True, # (Default = True)
                    'normalize_on_the_fly' : False, # (Default = False)
                    'randomize_valid' : True, # (Default = True)
                    'randomize_test' : True, # (Default = True)
                    'saving_rate': 0, # (Default = 0)
                    'savedir' : './outputs',
                   }
                   
    # First layer = PCA-75 whiten
    pca_layer = {'name' : '1st-PCA',
                 'num_components': 75,
                 'min_variance': -50,
                 'whiten': True,
                 'pca_class' : 'CovEigPCA',
                 # Training properties
                 'proba' : [1, 0, 0],
                 'savedir' : './outputs',
                }
                                                                                                               
                                                                                                                    
    # Load the dataset
    data = utils.load_data(conf_dataset)
        
    if conf_dataset['transfer']:
    # Data for the ALC proxy
        label = data[3]
        data = data[:3]
        
  
                                    
    # First layer : train or load a PCA
    pca = create_pca(conf_dataset, pca_layer, data, model=pca_layer['name'])
    data = [utils.sharedX(pca.function()(set.get_value(borrow=True)),borrow=True) for set in data]  
    '''
    if conf_dataset['transfer']:
        data_train, label_train = utils.filter_labels(data[0], label)
      
        alc = embed.score(data_train, label_train)
        print '... resulting ALC on train (for PCA) is', alc
    '''                     
                         
                                                                                   
    nvis = utils.get_constant(data[0].shape[1]).item()
  
    conf = {
        'corruption_level': 0.1,
        'nhid': 200,
        'nvis': nvis,
        'anneal_start': 100,
        'base_lr': 0.001, 
        'tied_weights': True,
        'act_enc': 'sigmoid',
        'act_dec': None,
        #'lr_hb': 0.10,
        #'lr_vb': 0.10,
        'tied_weights': True ,
        'solution': solution,
        'sparse_penalty': sparse_penalty,
        'sparsityTarget': sparsityTarget ,
        'sparsityTargetPenalty': sparsityTargetPenalty,
        'irange': 0,
    }

    # A symbolic input representing your minibatch.
    minibatch = tensor.matrix()

    # Allocate a denoising autoencoder with binomial noise corruption.
    corruptor = GaussianCorruptor(conf['corruption_level'])
    da = DenoisingAutoencoder(corruptor, conf['nvis'], conf['nhid'],
                              conf['act_enc'], conf['act_dec'], conf['tied_weights'], conf['solution'], conf['sparse_penalty'],
                              conf['sparsityTarget'], conf['sparsityTargetPenalty'])

    # Allocate an optimizer, which tells us how to update our model.
    # TODO: build the cost another way
    cost = SquaredError(da)(minibatch, da.reconstruct(minibatch)).mean()
    trainer = SGDOptimizer(da, conf['base_lr'], conf['anneal_start'])
    updates = trainer.cost_updates(cost)

    # Finally, build a Theano function out of all this.
    train_fn = theano.function([minibatch], cost, updates=updates)

    # Suppose we want minibatches of size 10
    proba = utils.getboth(conf, pca_layer, 'proba')    
    iterator = BatchIterator(data, proba, batchsize)
    
    # Here's a manual training loop. I hope to have some classes that
    # automate this a litle bit.
    final_cost = 0
    for epoch in xrange(epochs):
        c = []
        for minibatch_data in iterator:
            minibatch_err = train_fn(minibatch_data)
            c.append(minibatch_err)
        final_cost = numpy.mean(c)
        print "epoch %d, cost : %f" % (epoch , final_cost)
        

    print '############################## Fin de l\'experience ############################'
    print 'Calcul de l\'ALC : '
    if conf_dataset['transfer']:
        data_train, label_train = utils.filter_labels(data[0], label)
        alc = embed.score(data_train, label_train)
        
        print 'Solution : ',solution
        print 'sparse_penalty = ',sparse_penalty
        print 'sparsityTarget = ',sparsityTarget
        print 'sparsityTargetPenalty = ',sparsityTargetPenalty
        print 'Final denoising error is : ',final_cost 
        print '... resulting ALC on train is', alc    
        return (alc,final_cost)
Example #6
0
def main_train(epochs,
               batchsize,
               solution='',
               sparse_penalty=0,
               sparsityTarget=0,
               sparsityTargetPenalty=0):

    # Experiment specific arguments
    conf_dataset = {
        'dataset': 'avicenna',
        'expname': 'dummy',  # Used to create the submission file
        'transfer': True,
        'normalize': True,  # (Default = True)
        'normalize_on_the_fly': False,  # (Default = False)
        'randomize_valid': True,  # (Default = True)
        'randomize_test': True,  # (Default = True)
        'saving_rate': 0,  # (Default = 0)
        'savedir': './outputs',
    }

    # First layer = PCA-75 whiten
    pca_layer = {
        'name': '1st-PCA',
        'num_components': 75,
        'min_variance': -50,
        'whiten': True,
        'pca_class': 'CovEigPCA',
        # Training properties
        'proba': [1, 0, 0],
        'savedir': './outputs',
    }

    # Load the dataset
    data = utils.load_data(conf_dataset)

    if conf_dataset['transfer']:
        # Data for the ALC proxy
        label = data[3]
        data = data[:3]

    # First layer : train or load a PCA
    pca = create_pca(conf_dataset, pca_layer, data, model=pca_layer['name'])
    data = [
        utils.sharedX(pca.function()(set.get_value(borrow=True)), borrow=True)
        for set in data
    ]
    '''
    if conf_dataset['transfer']:
        data_train, label_train = utils.filter_labels(data[0], label)
      
        alc = embed.score(data_train, label_train)
        print '... resulting ALC on train (for PCA) is', alc
    '''

    nvis = utils.get_constant(data[0].shape[1]).item()

    conf = {
        'corruption_level': 0.1,
        'nhid': 200,
        'nvis': nvis,
        'anneal_start': 100,
        'base_lr': 0.001,
        'tied_weights': True,
        'act_enc': 'sigmoid',
        'act_dec': None,
        #'lr_hb': 0.10,
        #'lr_vb': 0.10,
        'tied_weights': True,
        'solution': solution,
        'sparse_penalty': sparse_penalty,
        'sparsityTarget': sparsityTarget,
        'sparsityTargetPenalty': sparsityTargetPenalty,
        'irange': 0,
    }

    # A symbolic input representing your minibatch.
    minibatch = tensor.matrix()

    # Allocate a denoising autoencoder with binomial noise corruption.
    corruptor = GaussianCorruptor(conf['corruption_level'])
    da = DenoisingAutoencoder(corruptor, conf['nvis'], conf['nhid'],
                              conf['act_enc'], conf['act_dec'],
                              conf['tied_weights'], conf['solution'],
                              conf['sparse_penalty'], conf['sparsityTarget'],
                              conf['sparsityTargetPenalty'])

    # Allocate an optimizer, which tells us how to update our model.
    # TODO: build the cost another way
    cost = SquaredError(da)(minibatch, da.reconstruct(minibatch)).mean()
    trainer = SGDOptimizer(da, conf['base_lr'], conf['anneal_start'])
    updates = trainer.cost_updates(cost)

    # Finally, build a Theano function out of all this.
    train_fn = theano.function([minibatch], cost, updates=updates)

    # Suppose we want minibatches of size 10
    proba = utils.getboth(conf, pca_layer, 'proba')
    iterator = BatchIterator(data, proba, batchsize)

    # Here's a manual training loop. I hope to have some classes that
    # automate this a litle bit.
    final_cost = 0
    for epoch in xrange(epochs):
        c = []
        for minibatch_data in iterator:
            minibatch_err = train_fn(minibatch_data)
            c.append(minibatch_err)
        final_cost = numpy.mean(c)
        print "epoch %d, cost : %f" % (epoch, final_cost)

    print '############################## Fin de l\'experience ############################'
    print 'Calcul de l\'ALC : '
    if conf_dataset['transfer']:
        data_train, label_train = utils.filter_labels(data[0], label)
        alc = embed.score(data_train, label_train)

        print 'Solution : ', solution
        print 'sparse_penalty = ', sparse_penalty
        print 'sparsityTarget = ', sparsityTarget
        print 'sparsityTargetPenalty = ', sparsityTargetPenalty
        print 'Final denoising error is : ', final_cost
        print '... resulting ALC on train is', alc
        return (alc, final_cost)
Example #7
0
        if saving_rate != 0:
            saving_counter += 1
            if saving_counter % saving_rate == 0:
                rbm.save(
                    os.path.join(savedir,
                                 layer['name'] + '-epoch-%02d.pkl' % epoch))

                ## Yes, this is a hack
                if label is not None:
                    # Compute ALC on train
                    data_train_repr = utils.minibatch_map(
                        rbm.function(),
                        layer['batch_size'],
                        data_train,
                        output_width=layer['nhid'])
                    alc = embed.score(data_train_repr, label_train)
                    print '... train ALC at epoch %d: %f' % (epoch, alc)

    end_time = time.clock()
    layer['training_time'] = (end_time - start_time) / 60.
    print '... training ended after %f min' % layer['training_time']

    # Compute reconstruction error for valid and test data sets
    error_fn = theano.function([minibatch], proxy_cost, name='error_fn')
    layer['error_valid'] = error_fn(data[1].get_value(borrow=True)).item()
    layer['error_test'] = error_fn(data[2].get_value(borrow=True)).item()
    print '... final error with valid is', layer['error_valid']
    print '... final error with test  is', layer['error_test']

    # Save model parameters
    rbm.save(filename)
Example #8
0
        # Saving intermediate models
        if saving_rate != 0:
            saving_counter += 1
            if saving_counter % saving_rate == 0:
                rbm.save(os.path.join(savedir,
                    layer['name'] + '-epoch-%02d.pkl' % epoch))

                ## Yes, this is a hack
                if label is not None:
                    # Compute ALC on train
                    data_train_repr = utils.minibatch_map(
                            rbm.function(),
                            layer['batch_size'],
                            data_train,
                            output_width=layer['nhid'])
                    alc = embed.score(data_train_repr, label_train)
                    print '... train ALC at epoch %d: %f' % (epoch, alc)

    end_time = time.clock()
    layer['training_time'] = (end_time - start_time) / 60.
    print '... training ended after %f min' % layer['training_time']

    # Compute reconstruction error for valid and test data sets
    error_fn = theano.function([minibatch], proxy_cost, name='error_fn')
    layer['error_valid'] = error_fn(data[1].get_value(borrow=True)).item()
    layer['error_test'] = error_fn(data[2].get_value(borrow=True)).item()
    print '... final error with valid is', layer['error_valid']
    print '... final error with test  is', layer['error_test']

    # Save model parameters
    rbm.save(filename)