def main():
    # Hyper parameters.
    optimizer = 'lbfgs'  # or use: gd, ncg, rmsprop, rprop
    batch_size = 10000

    # tmpl, loss and d_loss_wrt_pars are assumed to be defined at module level
    # (see the sketch below).
    flat, (w, b) = climin.util.empty_with_views(tmpl)
    climin.initialize.randomize_normal(flat, 0, 0.1)

    # Load data.
    datafile = 'mnist.pkl.gz'
    with gzip.open(datafile, 'rb') as f:
        train_set, val_set, test_set = cPickle.load(f)

    X, Z = train_set
    VX, VZ = val_set
    TX, TZ = test_set

    def one_hot(arr):
        result = np.zeros((arr.shape[0], 10))
        result[xrange(arr.shape[0]), arr] = 1.
        return result

    Z = one_hot(Z)
    VZ = one_hot(VZ)
    TZ = one_hot(TZ)

    if batch_size is None:
        args = itertools.repeat(([X, Z], {}))
        batches_per_pass = 1
    else:
        args = climin.util.iter_minibatches([X, Z], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        batches_per_pass = X.shape[0] // batch_size

    if optimizer == 'gd':
        opt = climin.GradientDescent(flat, d_loss_wrt_pars, steprate=0.1,
                                     momentum=.95, args=args)
    elif optimizer == 'lbfgs':
        opt = climin.Lbfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'ncg':
        opt = climin.NonlinearConjugateGradient(flat, loss, d_loss_wrt_pars,
                                                args=args)
    elif optimizer == 'rmsprop':
        opt = climin.RmsProp(flat, d_loss_wrt_pars, steprate=1e-4, decay=0.9,
                             args=args)
    elif optimizer == 'rprop':
        opt = climin.Rprop(flat, d_loss_wrt_pars, args=args)
    else:
        print 'unknown optimizer'
        return 1

    for info in opt:
        if info['n_iter'] % batches_per_pass == 0:
            print '%i/%i validation loss: %g' % (
                info['n_iter'], batches_per_pass * 10, loss(flat, VX, VZ))
        if info['n_iter'] >= 10 * batches_per_pass:
            break
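# The snippet above assumes that tmpl, loss and d_loss_wrt_pars exist at module
# level. They are not part of the original source; a minimal sketch of what
# they could look like for this softmax regression on MNIST (one-hot targets,
# w of shape (784, 10), b of shape (10,)) is:
import numpy as np

tmpl = [(28 * 28, 10), 10]  # assumed template: weight matrix and bias

def predict(parameters, inputs):
    # Split the flat parameter vector into w and b and apply a softmax.
    w = parameters[:28 * 28 * 10].reshape((28 * 28, 10))
    b = parameters[28 * 28 * 10:]
    scores = np.dot(inputs, w) + b
    scores -= scores.max(axis=1, keepdims=True)  # numerical stability
    e = np.exp(scores)
    return e / e.sum(axis=1, keepdims=True)

def loss(parameters, inputs, targets):
    # Mean cross-entropy against one-hot targets.
    p = predict(parameters, inputs)
    return -(targets * np.log(p + 1e-12)).sum(axis=1).mean()

def d_loss_wrt_pars(parameters, inputs, targets):
    # Gradient of the cross-entropy, flattened in the same order as tmpl.
    p = predict(parameters, inputs)
    d_w = np.dot(inputs.T, p - targets) / inputs.shape[0]
    d_b = (p - targets).mean(axis=0)
    return np.concatenate([d_w.ravel(), d_b])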
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=500,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500,
             optimizer='gd', activation=T.tanh):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, n_hidden), n_hidden, (n_hidden, 10), 10]
    flat, (Weights_1, bias_1, Weights_2, bias_2) = climin.util.empty_with_views(tmpl)

    # Initialize weights with a uniform distribution, following the tutorial.
    rng = numpy.random.RandomState(1234)
    Weights_1_init = rng.uniform(low=-numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 high=numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 size=(28 * 28, n_hidden))
    Weights_2_init = rng.uniform(low=-numpy.sqrt(6. / (n_hidden + 10)),
                                 high=numpy.sqrt(6. / (n_hidden + 10)),
                                 size=(n_hidden, 10))
    bias_1_init = numpy.zeros((n_hidden, ), dtype=theano.config.floatX)
    bias_2_init = numpy.zeros((10, ), dtype=theano.config.floatX)

    if activation == T.nnet.sigmoid:
        Weights_1_init *= 4
        Weights_2_init *= 4

    def initialize_in_place(array, values):
        # Copy the values into the climin view without rebinding it.
        for j in range(0, len(values)):
            array[j] = values[j]

    initialize_in_place(Weights_1, Weights_1_init)
    initialize_in_place(Weights_2, Weights_2_init)
    initialize_in_place(bias_1, bias_1_init)
    initialize_in_place(bias_2, bias_2_init)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y],
                                         batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')
    rng = numpy.random.RandomState(1234)

    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10,
                     Weights_1=theano.shared(value=Weights_1, name='W', borrow=True),
                     bias_1=theano.shared(value=bias_1, name='b', borrow=True),
                     Weights_2=theano.shared(value=Weights_2, name='W', borrow=True),
                     bias_2=theano.shared(value=bias_2, name='b', borrow=True),
                     activation=activation)

    # Cost with regularisation terms.
    regularized_cost = (classifier.negative_log_likelihood(y)
                        + L1_reg * classifier.L1
                        + L2_reg * classifier.L2_sqr)
    cost = theano.function(inputs=[x, y],
                           outputs=regularized_cost,
                           allow_input_downcast=True)

    # Gradients of the regularised cost with respect to all parameters.
    gradients = theano.function(
        inputs=[x, y],
        outputs=[T.grad(regularized_cost, classifier.hiddenLayer.W),
                 T.grad(regularized_cost, classifier.hiddenLayer.b),
                 T.grad(regularized_cost, classifier.logRegressionLayer.W),
                 T.grad(regularized_cost, classifier.logRegressionLayer.b)],
        allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W_1, g_b_1, g_W_2, g_b_2 = gradients(inputs, targets)
        return numpy.concatenate([g_W_1.flatten(), g_b_1,
                                  g_W_2.flatten(), g_b_2])

    zero_one_loss = theano.function(inputs=[x, y],
                                    outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars,
                                  step_rate=learning_rate, momentum=.95,
                                  args=args)
    elif optimizer == 'bfgs':
        print('... using quasi-Newton BFGS')
        opt = cli.Bfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'lbfgs':
        print('... using quasi-Newton L-BFGS')
        opt = cli.Lbfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'nlcg':
        print('... using nonlinear conjugate gradient')
        opt = cli.NonlinearConjugateGradient(flat, loss, d_loss_wrt_pars,
                                             min_grad=1e-03, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9,
                          args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002,
                       decay=0.99999999, decay_mom1=0.1, decay_mom2=0.001,
                       momentum=0, offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training')

    # Early stopping parameters.
    if batch_size is None:
        patience = 250
    else:
        patience = 10000  # look at this many samples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []

    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))

            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   validation_loss * 100, iter, patience))

            # If we got the best validation score until now...
            if validation_loss < best_validation_loss:
                # ...improve patience if the improvement is good enough.
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss

                # Test it on the test set.
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print(' epoch %i, minibatch %i/%i, test error of best model %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       test_loss * 100))

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete. Best validation score of %f %% with test performance %f %%' %
          (best_validation_loss * 100., test_loss * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
def sgd_optimization_mnist(learning_rate=0.01, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600,
                           optimizer='gd'):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, 10), 10]
    flat, (Weights, bias) = climin.util.empty_with_views(tmpl)
    cli.initialize.randomize_normal(flat, 0, 1)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y],
                                         batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(
        input=x,
        n_in=28 * 28,
        n_out=10,
        W=theano.shared(value=Weights, name='W', borrow=True),
        b=theano.shared(value=bias, name='b', borrow=True))

    gradients = theano.function(
        inputs=[x, y],
        outputs=[T.grad(classifier.negative_log_likelihood(y), classifier.W),
                 T.grad(classifier.negative_log_likelihood(y), classifier.b)],
        allow_input_downcast=True)

    cost = theano.function(inputs=[x, y],
                           outputs=classifier.negative_log_likelihood(y),
                           allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W, g_b = gradients(inputs, targets)
        return np.concatenate([g_W.flatten(), g_b])

    zero_one_loss = theano.function(inputs=[x, y],
                                    outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars,
                                  step_rate=learning_rate, momentum=.95,
                                  args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9,
                          args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002,
                       decay=0.99999999, decay_mom1=0.1, decay_mom2=0.001,
                       momentum=0, offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training the model')

    # Early stopping parameters.
    if batch_size is None:
        patience = 250
    else:
        patience = 5000  # look at this many samples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []

    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            # Compute the zero-one loss on the validation set.
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))

            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   validation_loss * 100, iter, patience))

            # If we got the best validation score until now...
            if validation_loss < best_validation_loss:
                # ...improve patience if the improvement is good enough.
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss

                # Test it on the test set.
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print(' epoch %i, minibatch %i/%i, test error of best model %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       test_loss * 100))

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete with best validation score of %f %%, with test performance %f %%' %
          (best_validation_loss * 100., test_loss * 100.))
    print('The code ran for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
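# A short usage sketch (not part of the original source): train the logistic
# regression with two different climin optimizers and compare the recorded
# error curves returned by sgd_optimization_mnist.
if __name__ == '__main__':
    classifier_gd, losses_gd = sgd_optimization_mnist(optimizer='gd')
    classifier_rms, losses_rms = sgd_optimization_mnist(optimizer='rmsprop')
    train_gd, valid_gd, test_gd = losses_gd
    train_rms, valid_rms, test_rms = losses_rms
    print('final validation error: gd %.4f, rmsprop %.4f' %
          (valid_gd[-1], valid_rms[-1]))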
def run_devise(image_vecs, image_labels, word_vecs, n_epochs, checkpoint_file,
               iters_per_checkpoint, iters_per_eval, validation_inds, logfile,
               step_rate=1e-4, decay=0.9, dm_thresh=0.1):
    # TODO:
    n_samples = len(image_labels)
    n_minibatch = 1
    n_iters = int(np.ceil(n_epochs * n_samples / n_minibatch))
    word_dim = word_vecs.shape[1]
    image_dim = image_vecs.shape[1]

    # Initialize M.
    m_flat = np.random.randn(word_dim * image_dim)
    # m_flat = np.zeros(word_dim * image_dim)

    # Beware momentum, as it can cause nonconvergence.
    devise_args = make_minibatch_iterator(image_vecs, image_labels, word_vecs,
                                          n_minibatch=n_minibatch)
    # opt = climin.RmsProp(m_flat, devise_loss_one_sample, step_rate=step_rate,
    #                      decay=decay, args=devise_args)
    opt = climin.GradientDescent(m_flat, devise_loss_one_sample,
                                 step_rate=step_rate, momentum=.95,
                                 args=devise_args)

    old_m_flat = np.copy(m_flat)
    last_validation_loss = np.nan
    lf = open(logfile, 'w')

    for info in opt:
        if info["n_iter"] % iters_per_checkpoint == 0:
            save.save(checkpoint_file, info=info, m_flat=m_flat,
                      last_validation_loss=last_validation_loss)

        # No validation set yet.
        if info["n_iter"] % iters_per_eval == 0:
            dm = np.linalg.norm(m_flat - old_m_flat, 1)
            if dm < dm_thresh:
                print("Optimization converged at %d iters: dm = %g < %g." %
                      (info["n_iter"], dm, dm_thresh))
                M = np.reshape(m_flat, (word_dim, image_dim))
                lf.close()
                return (M, info)
            old_m_flat = np.copy(m_flat)
            last_validation_loss = validation_loss(m_flat, image_vecs,
                                                   image_labels, word_vecs,
                                                   validation_inds)
            print("Iter %d, dM (1-norm) = %g, validation loss = %g" %
                  (info["n_iter"], dm, last_validation_loss))
            lf.write("Iter %d, dM (1-norm) = %g, validation loss = %g\n" %
                     (info["n_iter"], dm, last_validation_loss))
            lf.flush()

        if info["n_iter"] == n_iters:
            M = np.reshape(m_flat, (word_dim, image_dim))
            lf.close()
            return (M, info)
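# climin optimizers draw their per-iteration data from the args iterable, which
# must yield (positional_args, kwargs) pairs; the gradient function is then
# called as devise_loss_one_sample(m_flat, *positional_args, **kwargs). A sketch
# of what make_minibatch_iterator could look like under that protocol (assumed,
# not the original implementation):
import itertools
import numpy as np

def make_minibatch_iterator(image_vecs, image_labels, word_vecs, n_minibatch=1):
    n_samples = len(image_labels)
    rng = np.random.RandomState(0)
    for _ in itertools.count():
        # Sample a random minibatch forever; climin stops the loop itself.
        idx = rng.randint(0, n_samples, size=n_minibatch)
        # Each element is ((positional args for the gradient fn), {kwargs}).
        yield ((image_vecs[idx], image_labels[idx], word_vecs), {})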
def climin_wrapper(oracle, w0, train_points, train_targets, options,
                   method='AdaDelta'):
    default_options = {'maxiter': 1000, 'print_freq': 1, 'verbose': False,
                       'g_tol': 1e-5, 'batch_size': 10, 'step_rate': 0.1}
    if options is not None:
        default_options.update(options)
        if 'print_freq' in options:
            default_options['verbose'] = True
    options = default_options

    w = w0.copy()
    data = ((i, {}) for i in iter_minibatches([train_points, train_targets],
                                              options['batch_size'], [1, 0]))

    if method == 'AdaDelta':
        opt = climin.Adadelta(wrt=w, fprime=oracle, args=data,
                              step_rate=options['step_rate'])
    elif method == 'SG':
        opt = climin.GradientDescent(wrt=w, fprime=oracle, args=data,
                                     step_rate=options['step_rate'])
    else:
        raise ValueError('Unknown optimizer')

    w_lst = [w.copy()]
    time_lst = [0.]
    start = time.time()
    n_epochs = options['maxiter']
    n_iterations = int(n_epochs * train_targets.size / options['batch_size'])
    print_freq = int(options['print_freq'] * train_targets.size /
                     options['batch_size'])

    if options['verbose']:
        print('Using ' + method + ' optimizer')

    for info in opt:
        i = info['n_iter']
        if i > n_iterations:
            break
        if not (i % print_freq) and options['verbose']:
            grad = info['gradient']
            print("Iteration ",
                  int(i * options['batch_size'] / train_targets.size), ":")
            print("\tGradient norm", np.linalg.norm(grad))
        if not i % int(train_targets.size / options['batch_size']):
            w_lst.append(w.copy())
            time_lst.append(time.time() - start)

    return w.copy(), w_lst, time_lst
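# A toy usage sketch (assumed, not from the original source): least-squares
# regression where the oracle returns the stochastic gradient of the squared
# error. Note that iter_minibatches is called with sample dims [1, 0] above,
# so the points are expected with samples along axis 1, i.e. shape (d, n).
import numpy as np

def squared_error_grad(w, points, targets):
    # points: (d, batch), targets: (batch,);
    # gradient of 0.5 * ||points^T w - targets||^2 / batch w.r.t. w.
    residual = points.T.dot(w) - targets
    return points.dot(residual) / targets.size

d, n = 5, 200
rng = np.random.RandomState(0)
X = rng.randn(d, n)
w_true = rng.randn(d)
y = X.T.dot(w_true) + 0.01 * rng.randn(n)

w_final, w_history, times = climin_wrapper(
    squared_error_grad, np.zeros(d), X, y,
    options={'maxiter': 20, 'batch_size': 20, 'step_rate': 0.5},
    method='AdaDelta')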
def run_dA(learning_rate=0.1, n_epochs=5, optimizer='gd', n_hidden=500,
           dataset='mnist.pkl.gz', batch_size=20, n_in=28 * 28,
           corruption=0.0, l1_penalty=0.0, print_reconstructions=False,
           print_filters=False):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.shape[0] // batch_size

    x = T.matrix('x')
    rng = np.random.RandomState(1234)
    theano_rng = RandomStreams(rng.randint(2**30))

    print('...building model')

    dims = [(n_in, n_hidden), n_hidden, n_in]
    flat, (vis_W, hidden_b, vis_b) = climin.util.empty_with_views(dims)

    # Initialize with values.
    Weights_1_init = rng.uniform(low=-4 * np.sqrt(6. / (n_hidden + n_in)),
                                 high=4 * np.sqrt(6. / (n_hidden + n_in)),
                                 size=(n_in, n_hidden))
    bias_1_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
    bias_2_init = np.zeros((n_in, ), dtype=theano.config.floatX)

    def initialize_in_place(array, values):
        # Copy the values into the climin view without rebinding it.
        for j in range(0, len(values)):
            array[j] = values[j]

    initialize_in_place(vis_W, Weights_1_init)
    initialize_in_place(hidden_b, bias_1_init)
    initialize_in_place(vis_b, bias_2_init)

    params = [theano.shared(value=vis_W, name='W', borrow=True),
              theano.shared(value=hidden_b, name='b', borrow=True),
              theano.shared(value=vis_b, name='b_prime', borrow=True)]

    da = dA(numpy_rng=rng,
            parameters=params,
            theano_rng=theano_rng,
            input=x,
            n_visible=n_in,
            n_hidden=n_hidden,
            corruption=corruption,
            l1_penalty=l1_penalty)

    def d_loss(parameters, inputs, targets):
        g_W, g_hidden_b, g_vis_b = da.gradients(inputs)
        return np.concatenate([g_W.flatten(), g_hidden_b, g_vis_b])

    if not batch_size:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
    else:
        args = ((i, {}) for i in climin.util.iter_minibatches(
            [train_set_x, train_set_y], batch_size, [0, 0]))

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = climin.GradientDescent(flat, d_loss, step_rate=learning_rate,
                                     momentum=0.95, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = climin.rmsprop.RmsProp(flat, d_loss, step_rate=0.01, args=args)
    else:
        print('unknown optimizer')
        return

    print('...encoding')

    epoch = 0
    start_time = timeit.default_timer()
    for info in opt:
        iter = info['n_iter']
        if iter % n_train_batches == 1:
            epoch += 1
            this_loss = da.loss(train_set_x)
            print('\nTraining epoch %d, cost ' % epoch, this_loss)
            if epoch >= n_epochs:
                break

    end_time = timeit.default_timer()
    training_time = end_time - start_time

    if print_filters:
        print(('The no corruption code for file ' + os.path.split(__file__)[1] +
               ' ran for %.2fm' % (training_time / 60.)), file=sys.stderr)
        image = Image.fromarray(
            tile_raster_images(X=da.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(int(math.sqrt(n_hidden)),
                                           int(math.sqrt(n_hidden))),
                               tile_spacing=(1, 1)))
        image.save('filters_' + optimizer + ' n_hidden=' + str(n_hidden) +
                   ' corruption=' + str(corruption) +
                   ' and l1_pen=' + str(l1_penalty) + '.png', dpi=(300, 300))

    if print_reconstructions:
        data = train_set_x[:100]
        reconstruction = da.reconstructed_input(data)
        image = Image.fromarray(
            tile_raster_images(X=reconstruction,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('reconstructions of first 100_' + optimizer +
                   ' n_hidden=' + str(n_hidden) +
                   ' corruption=' + str(corruption) +
                   ' and l1_pen=' + str(l1_penalty) + '.png', dpi=(300, 300))
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=200,
             batch_size=100, n_hidden=300, optimizer='GradientDescent',
             activation=T.tanh, a=(1, -0.98), b=(1, -1)):
    #---- Configure ----
    participant = 1
    series = 1
    no_series = 1
    datatype = 'eeg'
    trials_from = 1
    trials_to = 'end'
    normalize_data = False
    normalize_per_trial = True
    keep_test_unshuffled = False
    #-------------------

    """error_lists = [None, None]
    for i in [1]:
        if i == 0:
            normalize_data = True
            normalize_per_trial = False
        else:
            normalize_data = False
            normalize_per_trial = True"""

    # Get data.
    ws = get_ws(participant=participant, series=series)
    windows = ws.get('win')
    (data, trials, led) = get_data(windows,
                                   datatype=datatype,
                                   trials_from=trials_from,
                                   trials_to=trials_to,
                                   normalize_per_trial=normalize_per_trial)
    for i in range(no_series - 1):
        ws = get_ws(participant=participant, series=series + i + 1)
        windows = ws.get('win')
        (data_temp, trials_temp, led_temp) = get_data(
            windows,
            datatype=datatype,
            trials_from=trials_from,
            trials_to=trials_to,
            normalize_per_trial=normalize_per_trial)
        data = np.vstack((data, data_temp))
        trials = np.concatenate((trials, trials_temp + trials[-1]))
        led = np.concatenate((led, led_temp))

    # Convert the led vector to contain 0 for LEDoff and 1 for LEDon.
    led_temp = np.zeros((data.shape[0], ))
    led_temp[led] = 1
    led = led_temp

    # For classifying LEDon / LEDoff use the following line.
    trials = led + 1

    # Filtering
    # a = (1, -0.98)
    # b = (1, -1)
    # data = signal.filtfilt(b, a, data)

    n = data.shape[0]
    n_train = 4 * n // 9
    n_valid = 2 * n // 9
    n_test = n - n_train - n_valid

    if normalize_data:
        data[...] = normalize(data)

    if keep_test_unshuffled:
        (temp, undo_shuffle) = shuffle(np.c_[data[:n_train + n_valid],
                                             trials[:n_train + n_valid] - 1])
        test_set_x, test_set_y = [data[n_train + n_valid:],
                                  trials[n_train + n_valid:] - 1]
    else:
        (temp, undo_shuffle) = shuffle(np.c_[data, trials - 1])
        test_set_x, test_set_y = (temp[n_train + n_valid:, :data.shape[1]],
                                  temp[n_train + n_valid:, data.shape[1]:])

    train_set_x, train_set_y = (temp[:n_train, :data.shape[1]],
                                temp[:n_train, data.shape[1]:])
    valid_set_x, valid_set_y = (temp[n_train:n_train + n_valid, :data.shape[1]],
                                temp[n_train:n_train + n_valid, data.shape[1]:])
    # Use the following line for NOT shuffled test data:
    # test_set_x, test_set_y = (data[n_train + n_valid:, :data.shape[1]],
    #                           data[n_train + n_valid:, data.shape[1]:])

    # Reshape the labels from (n, 1) to (n,).
    train_set_y = train_set_y.reshape(train_set_y.shape[0], )
    valid_set_y = valid_set_y.reshape(valid_set_y.shape[0], )
    test_set_y = test_set_y.reshape(test_set_y.shape[0], )

    n_train_batches = train_set_x.shape[0] // batch_size

    print('Building the Model...')

    x = T.matrix('x')
    y = T.ivector('y')
    rng = np.random.RandomState(1234)

    n_in = data.shape[1]
    n_out = np.unique(trials).shape[0]

    dims = [(n_in, n_hidden), n_hidden, (n_hidden, n_out), n_out]
    flat, (hidden_W, hidden_b, logreg_W, logreg_b) = climin.util.empty_with_views(dims)
    climin.initialize.randomize_normal(flat, loc=0, scale=0.1)
    # hidden_W[...] = np.asarray(rng.uniform(low=-4 * np.sqrt(6. / (n_in + n_hidden)),
    #                                        high=4 * np.sqrt(6. / (n_in + n_hidden)),
    #                                        size=(n_in, n_hidden)))

    parameters = [theano.shared(value=hidden_W, name='W', borrow=True),
                  theano.shared(value=hidden_b, name='b', borrow=True),
                  theano.shared(value=logreg_W, name='W', borrow=True),
                  theano.shared(value=logreg_b, name='b', borrow=True)]

    classifier = MLP(rng=rng,
                     input=x,
                     n_in=n_in,
                     n_hidden=n_hidden,
                     n_out=n_out,
                     activation=activation,
                     parameters=parameters)

    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)
    gparams = [T.grad(cost, param) for param in classifier.params]

    """ Theano functions """
    grad_W = theano.function([x, y], gparams, allow_input_downcast=True)

    # print('Setting up Climin...')
    """ Setting up Climin """
    def d_loss(parameters, inputs, targets):
        g_hl_W, g_hl_b, g_lr_W, g_lr_b = grad_W(inputs, targets)
        return np.concatenate([g_hl_W.flatten(), g_hl_b,
                               g_lr_W.flatten(), g_lr_b])

    minibatch = True
    if not minibatch:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
    else:
        args = ((i, {}) for i in climin.util.iter_minibatches(
            [train_set_x, train_set_y], batch_size, [0, 0]))

    if optimizer == 'GradientDescent':
        print('Running GradientDescent')
        opt = climin.GradientDescent(flat, d_loss, step_rate=0.01,
                                     momentum=0.95, args=args)
    elif optimizer == 'RmsProp':
        print('Running RmsProp')
        opt = climin.rmsprop.RmsProp(flat, d_loss, step_rate=0.01, args=args)
    # elif optimizer == 'NonlinearConjugateGradient':
    #     opt = climin.cg.NonlinearConjugateGradient(d_loss, loss, d_loss,
    #                                                min_grad=1e-06, args=args)
    elif optimizer == 'Adadelta':
        print('Running Adadelta')
        opt = climin.adadelta.Adadelta(flat, d_loss, step_rate=0.01, decay=0.9,
                                       momentum=0, offset=0.001, args=args)
    elif optimizer == 'Adam':
        print('Running Adam')
        opt = climin.adam.Adam(flat, d_loss, step_rate=0.001, decay=0.3,
                               decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                               offset=1e-08, args=args)
    elif optimizer == 'Rprop':
        print('Running Rprop')
        opt = climin.rprop.Rprop(flat, d_loss, step_shrink=0.5, step_grow=1.2,
                                 min_step=1e-06, max_step=1, changes_max=0.1,
                                 args=args)
    else:
        print('Optimizer not available!')
        opt = None

    zero_one_loss = theano.function(
        inputs=[x, y],
        outputs=classifier.logRegressionLayer.errors(y),
        allow_input_downcast=True)
    p_y_given_x = theano.function(
        inputs=[x],
        outputs=classifier.logRegressionLayer.p_y_given_x,
        allow_input_downcast=True)

    print('Running Optimization...\n')
    print('Classifying %d classes' % n_out)

    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = n_train_batches  # min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    start_time = timeit.default_timer()
    epoch = 0
    done_looping = False

    train_error_list = []
    valid_error_list = []
    test_error_list = []
    # model = Model(classifier.params)

    train_score = zero_one_loss(train_set_x, train_set_y) * 100
    this_validation_loss = zero_one_loss(valid_set_x, valid_set_y) * 100
    test_score = zero_one_loss(test_set_x, test_set_y) * 100
    train_error_list.append(train_score)
    valid_error_list.append(this_validation_loss)
    test_error_list.append(test_score)

    for info in opt:
        iter = info['n_iter']
        """if (iter % 1) == 0:
            stdout.write("\r%f%% of Epoch %d" %
                         (float(iter * 100) / n_train_batches - epoch * 100, epoch))
            stdout.flush()"""
        if (iter + 1) % validation_frequency == 1:
            epoch += 1
            train_score = zero_one_loss(train_set_x, train_set_y) * 100
            this_validation_loss = zero_one_loss(valid_set_x, valid_set_y) * 100
            test_score = zero_one_loss(test_set_x, test_set_y) * 100
            train_error_list.append(train_score)
            valid_error_list.append(this_validation_loss)
            test_error_list.append(test_score)
            print('\nEpoch %i, Validation Error:\t %f%%' %
                  (epoch, this_validation_loss))

            if this_validation_loss < best_validation_loss:
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = this_validation_loss
                best_test_score = test_score
                best_iter = iter
                print('Epoch %i, Test Error:\t %f%% \t NEW MODEL' %
                      (epoch, test_score))
                p_LEDon = p_y_given_x(test_set_x)[:, 1]
                # with open('model.pkl', 'wb') as f:
                #     print('Dump Model')
                #     pickle.dump(model, f)

            if (epoch >= n_epochs) or done_looping:
                break
            print('')

        if patience <= iter:
            done_looping = True
            break

    # scores = Scores(train_error_list, valid_error_list, test_error_list,
    #                 [best_validation_loss, test_score])
    # with open('scores.pkl', 'wb') as f:
    #     pickle.dump(scores, f)

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss, best_iter + 1, best_test_score))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)

    # error_lists[i] = (train_error_list, valid_error_list, test_error_list)
    return ((train_error_list, valid_error_list, test_error_list),
            (best_validation_loss, best_test_score),
            (test_set_y, p_LEDon))
def build_and_train_rbf(self, X, Y):
    y_onehot = self.class_to_onehot(Y)
    n_dims = y_onehot.shape[1]
    centers = self.compute_centers(X)

    x = T.dmatrix()
    y = T.imatrix()

    # bias, centers, sigmas, weights
    template = [n_dims, centers.shape, self.l1_size, (self.l1_size, n_dims)]

    # Initialize and train the RBF network.
    model = theano_rbfnet(input=x, n_cents=self.l1_size, centers=centers,
                          n_dims=n_dims, reg=self.penalty)
    cost = model.neg_log_likelihood(y)
    g_b = T.grad(cost, model.b)
    g_c = T.grad(cost, model.c)
    g_s = T.grad(cost, model.s)
    g_w = T.grad(cost, model.w)
    g_params = T.concatenate(
        [g_b.flatten(), g_c.flatten(), g_s.flatten(), g_w.flatten()])

    getcost = theano.function([x, y], outputs=cost)
    getdcost = theano.function([x, y], outputs=g_params)

    def cost_fcn(params, inputs, targets):
        model.set_params(params, template)
        return getcost(inputs, targets)

    def cost_grad(params, inputs, targets):
        model.set_params(params, template)
        return getdcost(inputs, targets)

    args = climin.util.iter_minibatches([X, y_onehot], self.batch_size, [0, 0])
    args = ((i, {}) for i in args)
    batch_args = itertools.repeat(([X, y_onehot], {}))

    init_params = model.get_params(template)
    opt_sgd = climin.GradientDescent(init_params, cost_grad, steprate=0.1,
                                     momentum=0.99, args=args,
                                     momentum_type="nesterov")
    opt_ncg = climin.NonlinearConjugateGradient(init_params, cost_fcn,
                                                cost_grad, args=batch_args)
    opt_lbfgs = climin.Lbfgs(init_params, cost_fcn, cost_grad, args=batch_args)

    # Choose the optimizer.
    if self.optimizer == 'sgd':
        optimizer = opt_sgd
    elif self.optimizer == 'ncg':
        optimizer = opt_ncg
    else:
        optimizer = opt_lbfgs

    # Do the actual training.
    costs = []
    for itr_info in optimizer:
        if itr_info['n_iter'] > self.max_iters:
            break
        costs.append(itr_info['loss'])

    model.set_params(init_params, template)
    return model, costs
def build_and_train_nnet(self, X, Y):
    y_onehot = self.class_to_onehot(Y)
    n_in = X.shape[1]
    n_nodes = self.l1_size
    n_out = y_onehot.shape[1]

    x = T.dmatrix()
    y = T.imatrix()

    # bias1, bias2, weights1, weights2
    template = [(n_nodes, ), (n_out, ), (n_in, n_nodes), (n_nodes, n_out)]

    # Initialize the nnet.
    model = nnet(input=x, n_in=n_in, n_nodes=n_nodes, n_out=n_out)
    cost = model.neg_log_likelihood(y)
    g_b1 = T.grad(cost, model.b1)
    g_b2 = T.grad(cost, model.b2)
    g_w1 = T.grad(cost, model.w1)
    g_w2 = T.grad(cost, model.w2)
    g_params = T.concatenate(
        [g_b1.flatten(), g_b2.flatten(), g_w1.flatten(), g_w2.flatten()])

    getcost = theano.function([x, y], outputs=cost)
    getdcost = theano.function([x, y], outputs=g_params)

    def cost_fcn(params, inputs, targets):
        model.set_params(params, template)
        return getcost(inputs, targets)

    def cost_grad(params, inputs, targets):
        model.set_params(params, template)
        return getdcost(inputs, targets)

    args = climin.util.iter_minibatches([X, y_onehot], self.batch_size, [0, 0])
    args = ((i, {}) for i in args)
    batch_args = itertools.repeat(([X, y_onehot], {}))

    init_params = model.get_params(template)
    opt_sgd = climin.GradientDescent(init_params, cost_grad, steprate=0.01,
                                     momentum=0.99, args=args,
                                     momentum_type="nesterov")
    opt_ncg = climin.NonlinearConjugateGradient(init_params, cost_fcn,
                                                cost_grad, args=batch_args)
    opt_lbfgs = climin.Lbfgs(init_params, cost_fcn, cost_grad, args=batch_args)

    # Choose the optimizer.
    if self.optimizer == 'sgd':
        optimizer = opt_sgd
    elif self.optimizer == 'ncg':
        optimizer = opt_ncg
    else:
        optimizer = opt_lbfgs

    # Do the actual training.
    costs = []
    for itr_info in optimizer:
        if itr_info['n_iter'] > self.max_iters:
            break
        costs.append(itr_info['loss'])

    model.set_params(init_params, template)
    return model, costs
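# Both build_and_train_* methods assume the model exposes get_params(template)
# and set_params(flat, template). A minimal sketch of the flat-vector <->
# shaped-arrays conversion they imply (assumed, not the original implementation):
import numpy as np

def params_to_flat(arrays):
    # Concatenate parameter arrays into one flat vector, in template order.
    return np.concatenate([np.asarray(a).ravel() for a in arrays])

def flat_to_params(flat, template):
    # Split a flat vector back into arrays shaped according to the template,
    # where each template entry is either an int or a shape tuple.
    arrays, offset = [], 0
    for shape in template:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        size = int(np.prod(shape))
        arrays.append(np.asarray(flat[offset:offset + size]).reshape(shape))
        offset += size
    return arrays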
if method == 'adam':
    opt = climin.Adam(model.optimizer_array, model.stochastic_grad,
                      step_rate=0.005, decay_mom1=1 - 0.9,
                      decay_mom2=1 - 0.999)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Adam...')
    info = opt.minimize_until(callback)
elif method == 'sgd':
    opt = climin.GradientDescent(model.optimizer_array, model.stochastic_grad,
                                 step_rate=1e-15, momentum=0.0)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running SGD...')
    info = opt.minimize_until(callback)
elif method == 'adad':
    opt = climin.Adadelta(model.optimizer_array, model.stochastic_grad,
                          step_rate=0.005, momentum=0.9)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
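# A sketch of the callback assumed above (not from the original source): it
# records the ELBO and wall-clock time after each iteration and asks climin's
# minimize_until to stop once a fixed iteration budget is reached.
max_iters = 1000  # assumed budget

def callback(info):
    ELBO.append(model.log_likelihood())
    myTimes.append(time.time())
    # Returning True stops opt.minimize_until(callback).
    return info['n_iter'] >= max_iters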