load_params_pr = False
	

song_size = 2459

#filepath for saving parameters
savefilename = '/vega/stats/users/sl3368/rnn_code/saves/params/neural/dual_single/'+brain_region+'_'+str(held_out_song)+'_'+str(neuron)+'.save'

neurons_savefilename ='/vega/stats/users/sl3368/rnn_code/saves/params/neural/dual_single/'+brain_region+'_'+str(held_out_song)+'_'+str(neuron)+'.save_neurons'

psth_savefilename ='/vega/stats/users/sl3368/rnn_code/saves/params/psth/dual_single/'+brain_region+'_'+str(held_out_song)+'_'+str(neuron)+'.psth'

################################################
# Load Data
################################################
dataset_info = load_all_data()
stim = dataset_info[0]
data_set_x = theano.shared(stim, borrow=True)

n_batches = data_set_x.shape[0].eval()/song_size
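# each song spans song_size (2459) time bins, so the number of songs in the
# loaded stimulus matrix is its row count divided by song_size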

n_train_batches = n_batches 

print 'Number of songs in single matlab chunk: '+str(n_train_batches)

print 'Getting neural data...'

neural_data = load_neural_data()

ntrials = theano.shared(neural_data[brain_region_index],borrow=True)
responses = theano.shared(neural_data[brain_region_index+1],borrow=True)
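# neural_data appears to store a (trial counts, responses) pair per region,
# so brain_region_index selects the trial counts and the next entry the responses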
song_size = 2459

#filepath for saving parameters
savefilename = '/vega/stats/users/sl3368/CLDNN/saves/neural/shallow/' + brain_region + '_' + str(
    held_out_song) + '.save'

neurons_savefilename = '/vega/stats/users/sl3368/CLDNN/saves/neural/shallow/' + brain_region + '_' + str(
    held_out_song) + '.save_neurons'

psth_savefilename = '/vega/stats/users/sl3368/CLDNN/saves/psth/shallow/' + brain_region + '_' + str(
    held_out_song) + '.psth'

################################################
# Load Data
################################################
dataset_info = load_all_data()
stim = dataset_info[0]
data_set_x = theano.shared(stim, borrow=True)

n_batches = data_set_x.shape[0].eval() / song_size

n_train_batches = n_batches

print 'Number of songs in single matlab chunk: ' + str(n_train_batches)

print 'Getting neural data...'

neural_data = load_neural_data()

ntrials = theano.shared(neural_data[brain_region_index], borrow=True)
responses = theano.shared(neural_data[brain_region_index + 1], borrow=True)
Example #3
def SGD_training(learning_rate=1, n_epochs=1000):
    """
    stochastic gradient descent optimization

   """
    dataset_info = load_all_data()

    data_set_x = dataset_info[0]

    maxBatchSize = numpy.int_(dataset_info[1])

    batch_size = maxBatchSize
    n_train_batches = 28
    #n_valid_batches = 1
    #n_test_batches = 1
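    # batches 0-27 are used for training; batch 28 serves as the validation
    # batch and batch 29 as the test batch (see validate_model / test_model below)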

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
    x = clip_gradient(x,1.0)     
    y = T.matrix('y')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
    
    is_train = T.iscalar('is_train') # pseudo boolean for switching between training and prediction
    
    rng = numpy.random.RandomState(1234)

    ################################################
    # Architecture: input --> LSTM --> predict one-ahead
    ################################################

    # The linear regression output layer takes the hidden units of the LSTM
    # as its input
    d_input = Dropout(rng, is_train, x)
    n_hidden = 100
    lstm_1 = LSTM(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden)
    #lstm_1 = RNN(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #vanilla rnn
    d_lstm_1 = Dropout(rng, is_train, lstm_1.output)
    output = LinearRegression(input=d_lstm_1.output, n_in=n_hidden, n_out=data_set_x.get_value(borrow=True).shape[1])
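    # data flow: input frame -> dropout -> LSTM (n_hidden units) -> dropout ->
    # linear readout back to the input dimensionality, i.e. a prediction of the
    # next frame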

    #######################
    # Objective function
    #######################
    print '... defining objective and compiling test and validate'

    # the cost we minimize during training is the negative log likelihood of
    # the model 
    cost = T.mean(output.negative_log_likelihood(y))

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    # use cost or errors(y,tc,md) as output?
    test_model = theano.function(inputs=[index],
            outputs=[cost, output.E_y_given_x],
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})
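    # in the givens above, x covers rows index*batch_size .. (index+1)*batch_size - 2
    # and y covers the same window shifted forward by one row, so each target row
    # is the next input frame (one-step-ahead prediction)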

    # wanted to use below indexes and have different sized batches, but this didn't work
    #[int(batchBreaks[index]-1):int(batchBreaks[(index+1)]-1)]

    validate_model = theano.function(inputs=[index],
            outputs=cost,
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})

    #######################
    # Parameters and gradients
    #######################
    print '... parameters and gradients'

    # create a list (concatenated) of all model parameters to be fit by gradient descent
    #order: [self.W, self.b]
    params = lstm_1.params + output.params
    params_helper = lstm_1.params_helper + output.params_helper
    params_helper2 = lstm_1.params_helper2 + output.params_helper2

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where each
    # element is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #iter_count = theano.shared(1)
    #L1_penalized = []
    #larger_stepsize = []
    #enforce_positive = [2, 3] #if recurrent
    #enforce_positive = []
    #zero_stepsize = []
    param_index = 0
    #rho = 1e-6
    #for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams):
        #updates.append((param_helper, param_helper + gparam ** 2)) #need sum of squares for learning rate
        #updates.append((param_helper2, param_helper2 + gparam)) #need sum of gradients for L1 thresholding
    
    #vanilla SGD
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #    param_index += 1

    #adadelta updates
    rho = .95
    eps_big = 1e-6
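    # Adadelta keeps two decaying averages per parameter:
    #   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2           (param_helper)
    #   dx_t      = -sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2          (param_helper2)
    # so the step size adapts per parameter without a hand-tuned learning rate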
    for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams):
        updates.append((param_helper,rho * param_helper + (1. - rho) * (gparam ** 2))) #update decaying sum of previous gradients
        dparam = - T.sqrt((param_helper2 + eps_big) / (rho * param_helper + (1. - rho) * (gparam ** 2) + eps_big)) *gparam # calculate step size
        updates.append((param_helper2, rho * param_helper2 + (1. - rho) * (dparam ** 2))) #update decaying sum of previous step sizes
        updates.append((param, param + dparam))
    
    #updates.append((iter_count, iter_count + 1))

    print '... compiling train'
    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the model parameters according to the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})
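    # note: is_train is fixed to 0 in train_model as well, so the Dropout layers
    # run in their test-time (no-drop) mode during training, assuming the Dropout
    # class drops units only when is_train is 1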

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    #patience = train_set_x.get_value(borrow=True).shape[0] * n_epochs #no early stopping
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.99  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
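    # with n_train_batches = 28 and patience = 5000, validation_frequency = 28,
    # so validation runs once per epoch; patience only grows (to iter * 2) when
    # the validation loss improves by more than 1%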

    #best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    #test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            print minibatch_avg_cost
 
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute absolute error loss on validation set
                validation_losses = [validate_model(i) for i
                                     in [28]]
                this_validation_loss = numpy.mean(validation_losses) #mean over batches
                print('epoch %i, minibatch %i, validation error %f' %
                     (epoch, minibatch_index + 1,
                      this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #test_losses = [test_model(i) for i
                    #               in [29]]
                    #test_score = numpy.mean(test_losses)
                    test_cost, test_pred = test_model(29)
                    #test_cost, test_costs_separate, test_pred_separate, test_actual_separate = test_model(29)

                    print(('     epoch %i, minibatch %i, test error of '
                           'best model %f') %
                          (epoch, minibatch_index + 1,
                           numpy.sum(test_cost)))

            if patience <= iter:
                    done_looping = True
                    break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f '
           'obtained at iteration %i, with test performance %f') %
          (best_validation_loss, best_iter + 1, numpy.sum(test_cost)))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
	
    #store data
    f = file('results/params.save', 'wb')
    for obj in [params + [test_cost] + [test_pred]]:
        cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
Example #4
def SGD_training(learning_rate=1, n_epochs=1000):
    """
    stochastic gradient descent optimization

   """
    dataset_info = load_all_data()

    data_set_x = dataset_info[0]

    maxBatchSize = numpy.int_(dataset_info[1])

    batch_size = maxBatchSize
    n_train_batches = 28
    #n_valid_batches = 1
    #n_test_batches = 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix(
        'x'
    )  # the data is presented as a vector of inputs with many exchangeable examples of this vector
    x = clip_gradient(x, 1.0)
    y = T.matrix(
        'y'
    )  # the data is presented as a vector of inputs with many exchangeable examples of this vector

    is_train = T.iscalar(
        'is_train'
    )  # pseudo boolean for switching between training and prediction

    rng = numpy.random.RandomState(1234)

    ################################################
    # Architecture: input --> LSTM --> predict one-ahead
    ################################################

    # The linear regression output layer takes the hidden units of the LSTM
    # as its input
    d_input = Dropout(rng, is_train, x)
    n_hidden = 100
    lstm_1 = LSTM(rng,
                  d_input.output,
                  n_in=data_set_x.get_value(borrow=True).shape[1],
                  n_out=n_hidden)
    #lstm_1 = RNN(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #vanilla rnn
    d_lstm_1 = Dropout(rng, is_train, lstm_1.output)
    output = LinearRegression(input=d_lstm_1.output,
                              n_in=n_hidden,
                              n_out=data_set_x.get_value(borrow=True).shape[1])

    #######################
    # Objective function
    #######################
    print '... defining objective and compiling test and validate'

    # the cost we minimize during training is the negative log likelihood of
    # the model
    cost = T.mean(output.negative_log_likelihood(y))

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    # use cost or errors(y,tc,md) as output?
    test_model = theano.function(
        inputs=[index],
        outputs=[cost, output.E_y_given_x],
        givens={
            x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
            y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
            is_train: numpy.cast['int32'](0)
        })

    # wanted to use below indexes and have different sized batches, but this didn't work
    #[int(batchBreaks[index]-1):int(batchBreaks[(index+1)]-1)]

    validate_model = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
            y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
            is_train: numpy.cast['int32'](0)
        })

    #######################
    # Parameters and gradients
    #######################
    print '... parameters and gradients'

    # create a list (concatenated) of all model parameters to be fit by gradient descent
    #order: [self.W, self.b]
    params = lstm_1.params + output.params
    params_helper = lstm_1.params_helper + output.params_helper
    params_helper2 = lstm_1.params_helper2 + output.params_helper2

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where each
    # element is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #iter_count = theano.shared(1)
    #L1_penalized = []
    #larger_stepsize = []
    #enforce_positive = [2, 3] #if recurrent
    #enforce_positive = []
    #zero_stepsize = []
    param_index = 0
    #rho = 1e-6
    #for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams):
    #updates.append((param_helper, param_helper + gparam ** 2)) #need sum of squares for learning rate
    #updates.append((param_helper2, param_helper2 + gparam)) #need sum of gradients for L1 thresholding

    #vanilla SGD
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #    param_index += 1

    #adadelta updates
    rho = .95
    eps_big = 1e-6
    for param, param_helper, param_helper2, gparam in zip(
            params, params_helper, params_helper2, gparams):
        updates.append(
            (param_helper, rho * param_helper + (1. - rho) *
             (gparam**2)))  #update decaying sum of previous gradients
        dparam = -T.sqrt(
            (param_helper2 + eps_big) /
            (rho * param_helper + (1. - rho) *
             (gparam**2) + eps_big)) * gparam  # calculate step size
        updates.append(
            (param_helper2, rho * param_helper2 + (1. - rho) *
             (dparam**2)))  #update decaying sum of previous step sizes
        updates.append((param, param + dparam))

    #updates.append((iter_count, iter_count + 1))

    print '... compiling train'
    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the model parameters according to the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
            y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
            is_train: numpy.cast['int32'](0)
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    #patience = train_set_x.get_value(borrow=True).shape[0] * n_epochs #no early stopping
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.99  # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    #best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    #test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            print minibatch_avg_cost

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute absolute error loss on validation set
                validation_losses = [validate_model(i) for i in [28]]
                this_validation_loss = numpy.mean(
                    validation_losses)  #mean over batches
                print('epoch %i, minibatch %i, validation error %f' %
                      (epoch, minibatch_index + 1, this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #test_losses = [test_model(i) for i
                    #               in [29]]
                    #test_score = numpy.mean(test_losses)
                    test_cost, test_pred = test_model(29)
                    #test_cost, test_costs_separate, test_pred_separate, test_actual_separate = test_model(29)

                    print(('     epoch %i, minibatch %i, test error of '
                           'best model %f') %
                          (epoch, minibatch_index + 1, numpy.sum(test_cost)))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f '
           'obtained at iteration %i, with test performance %f') %
          (best_validation_loss, best_iter + 1, numpy.sum(test_cost)))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    #store data
    f = file('results/params.save', 'wb')
    for obj in [params + [test_cost] + [test_pred]]:
        cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
def generate_visualization_on_section(region,heldout,neuron_no,training_iterations,hidden,song_section,previous):
    
    region_dict = {'L1':0,'L2':2,'L3':4,'NC':6,'MLd':8}
    held_out_song = heldout
    brain_region = region
    brain_region_index = region_dict[brain_region]
    neuron = neuron_no
    prev_trained_size = previous.shape[0]
    n_epochs= training_iterations
    n_hidden = hidden
    song_depth = song_section
    
    print 'Running CV for held out song '+str(held_out_song)+' for brain region '+brain_region+' index at '+str(brain_region_index)+' iteration:'+str(song_depth)
    
    #Filepath for printing results
    results_filename='/vega/stats/users/sl3368/rnn_code/results/neural/dual_'+str(n_hidden)+'/visualizations/'+brain_region+'_'+str(held_out_song)+'_'+str(neuron)+'.out'
    #check if exists already, then load or not load 
    load_params_pr_filename = '/vega/stats/users/sl3368/rnn_code/saves/params/neural/dual_'+str(n_hidden)+'/'+brain_region+'_'+str(held_out_song)+'.save'
    if path.isfile(load_params_pr_filename):
        #print 'Will load previous regression parameters...'
        load_params_pr = True
    else:
        load_params_pr = False
    	
    song_size = 2459
    
    #filepath for saving parameters
    savefilename = '/vega/stats/users/sl3368/rnn_code/saves/params/neural/dual_'+str(n_hidden)+'/visualizations/'+brain_region+'_'+str(held_out_song)+'_'+str(neuron)+'.visualization'
    
    if path.isfile(savefilename):
        load_visualize = True
    else:
        load_visualize = False
    
    
    ################################################
    # Load Data
    ################################################
    dataset_info = load_all_data()
    stim = dataset_info[0]
    data_set_x = theano.shared(stim, borrow=True)
    
    #print 'Getting neural data...'
    
    neural_data = load_neural_data()
    
    ntrials = theano.shared(neural_data[brain_region_index],borrow=True)
    responses = theano.shared(neural_data[brain_region_index+1],borrow=True)
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    
    #print 'building the model...'
    
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    section = T.lscalar()
    
    init = numpy.zeros((song_depth,60)).astype('f')
    init[:prev_trained_size] = previous
    x = shared(init,borrow=True)
    
    y = T.matrix('y')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
    trial_no = T.matrix('trial_no')
    
    rng = numpy.random.RandomState(1234)
    
    # Architecture: input --> LSTM --> predict one-ahead
     
    lstm_1 = LSTM(rng, x, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden)
   
    output = PoissonRegression(input=lstm_1.output, n_in=n_hidden, n_out=1)
    
    pred = output.E_y_given_x.T * trial_no[:,neuron]
    nll = output.negative_log_likelihood(y[:,neuron],trial_no[:,neuron],single=True)
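    # the LSTM output is mapped to a single Poisson firing rate; pred scales the
    # expected rate by the trial counts for this neuron, and nll is the Poisson
    # negative log likelihood of the recorded responses for that neuron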
    
    ################################
    # Objective function and GD
    ################################
    
    #print 'defining cost, parameters, and learning function...'
    
    # the cost we minimize during training is the negative log likelihood of
    # the model 
    cost = T.mean(nll)
    
    #Defining params
    params = [x]
    
    # updates from ADAM
    updates = Adam(cost, params)
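    # only the input x (the stimulus being visualized) is optimized here; the
    # LSTM and regression weights are loaded below and held fixed, so Adam
    # shapes the stimulus to best predict this neuron's recorded responses
    # under the fixed model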
    
    #######################
    # Objective function
    #######################
    
    #print 'compiling train....'
    
    train_model = theano.function(inputs=[index, section], outputs=cost,
            updates=updates,
            givens={
                trial_no: ntrials[index * song_size:((index * song_size) + section)],
                y: responses[index * song_size:((index * song_size) + section)]})
    
    #validate_model = theano.function(inputs=[index,section],
    #        outputs=[cost,nll.shape,pred.shape],
    #        givens={
    #	        trial_no: ntrials[index * song_size:((index * song_size)+section)],
    #            y: responses[index * song_size:((index * song_size)+section)]})
    
    #######################
    # Parameters and gradients
    #######################
    #print 'parameters and gradients...'
    
    if load_params_pr:
        #print 'loading LSTM parameters from file...'
        f = open(load_params_pr_filename)
        old_p = cPickle.load(f)
        lstm_1.W_i.set_value(old_p[0].get_value(), borrow=True)
        lstm_1.W_f.set_value(old_p[1].get_value(), borrow=True)
        lstm_1.W_c.set_value(old_p[2].get_value(), borrow=True)
        lstm_1.W_o.set_value(old_p[3].get_value(), borrow=True)
        lstm_1.U_i.set_value(old_p[4].get_value(), borrow=True)
        lstm_1.U_f.set_value(old_p[5].get_value(), borrow=True)
        lstm_1.U_c.set_value(old_p[6].get_value(), borrow=True)
        lstm_1.U_o.set_value(old_p[7].get_value(), borrow=True)
        lstm_1.V_o.set_value(old_p[8].get_value(), borrow=True)
        lstm_1.b_i.set_value(old_p[9].get_value(), borrow=True)
        lstm_1.b_f.set_value(old_p[10].get_value(), borrow=True)
        lstm_1.b_c.set_value(old_p[11].get_value(), borrow=True)
        lstm_1.b_o.set_value(old_p[12].get_value(), borrow=True)
        f.close()
    
    if load_params_pr:
        #print 'loading PR parameters from file...'
        f = open(load_params_pr_filename)
        old_p = cPickle.load(f)
        output.W.set_value(old_p[13].get_value(), borrow=True)
        output.b.set_value(old_p[14].get_value(), borrow=True)
        f.close()
    
    #if load_visualize:
    #    print 'loading visualization from file...'
    #    f = open(savefilename)
    #    old_v = cPickle.load(f)
    #    x.set_value(old_v, borrow=True)
    #    f.close()
    
    ###############
    # TRAIN MODEL #
    ###############
    #print 'training...'
    
    best_validation_loss = numpy.inf
    epoch = 0
    
    last_e = time.time()
    
    r_log=open(results_filename,'a')
    r_log.write('Starting training...\n')
    r_log.close()
    
    while (epoch < n_epochs):
        #print str(epoch)+' epoch took: '+str(time.time()-last_e)
        #r_log=open(results_filename, 'a')
        #r_log.write(str(epoch)+ ' epoch took: '+str(time.time()-last_e)+'\n')
        #r_log.close()
    
        last_e = time.time()
        epoch = epoch + 1
    
        mb_costs = []
    
        heldout = 0
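        # song indices 0-13 and 24-29 are scanned below; only the minibatch whose
        # running counter matches held_out_song is used to update the input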
    
        for minibatch_index in xrange(14):
            if heldout == held_out_song:
                minibatch_avg_cost = train_model(minibatch_index, song_depth)
                #print minibatch_avg_cost
                mb_costs.append(minibatch_avg_cost)
            heldout = heldout + 1

        for minibatch_index in xrange(24, 30):
            if heldout == held_out_song:
                minibatch_avg_cost = train_model(minibatch_index, song_depth)
                #print minibatch_avg_cost
                mb_costs.append(minibatch_avg_cost)
            heldout = heldout + 1
    
        avg_cost = numpy.mean(mb_costs)
    
        
        # if we got the best validation score until now
        if avg_cost < best_validation_loss:

            best_validation_loss = avg_cost
            visualization = numpy.asarray(x.eval())

            #store data
            f = file(savefilename, 'wb')
            for obj in [visualization]:
                cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
            f.close()
            
    print('epoch %i, training error %f' % (epoch, avg_cost))
    
    r_log=open(results_filename, 'a')
    r_log.write('epoch %i, training error %f\n' % (epoch, avg_cost))
    r_log.close()
    
    return numpy.asarray(x.eval())