def test_sae_cost():
    threshold = 1e-9 * (num_samples / 50.0)
    theta = sparse_autoencoder.initialize_params(hidden_size, visible_size)
    sae_cost = partial(base_sae_cost, weight_decay=weight_decay, beta=beta)
    cost, grad, ngrad = check_grad(sae_cost, theta, threshold)

    # test that if gradient is wrong, we fail
    bad_grad = np.array(grad)
    bad_grad[2] = 1000
    assert diff_grad(ngrad, bad_grad) > threshold
    bad_grad2 = 2 * np.array(grad)
    assert diff_grad(ngrad, bad_grad2) > threshold

    # test that weight params actually do something
    if weight_decay > 0:
        noweight_sae_cost = partial(base_sae_cost, weight_decay=0, beta=beta)
        noweight_cost, noweight_grad, _ = check_grad(noweight_sae_cost, theta, threshold)
        print "noweight cost:", noweight_cost
        diff = diff_grad(grad, noweight_grad)
        print "noweight diff:", diff
        assert diff > threshold

    # test that sparsity works
    if beta > 0:
        nosparsity_sae_cost = partial(base_sae_cost, weight_decay=weight_decay, beta=0)
        nosparsity_cost, nosparsity_grad, _ = check_grad(nosparsity_sae_cost, theta, threshold)
        print "nosparsity cost:", nosparsity_cost
        diff = diff_grad(grad, nosparsity_grad)
        print "nosparsity diff:", diff
        assert diff > threshold
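# The helpers `check_grad` and `diff_grad` used by the test above are not shown in
# this section. A minimal sketch of what they might look like, assuming the cost
# function returns a (cost, gradient) pair, the numerical gradient is computed with
# central differences, and the comparison uses the usual normalized difference; the
# epsilon value and exact formulas are assumptions, not the project's actual code.
import numpy as np

def diff_grad(grad_a, grad_b):
    # Normalized difference between two gradient vectors.
    return np.linalg.norm(grad_a - grad_b) / np.linalg.norm(grad_a + grad_b)

def check_grad(cost_fn, theta, threshold, epsilon=1e-4):
    # Analytic cost and gradient at theta.
    cost, grad = cost_fn(theta)
    # Numerical gradient via central differences, one coordinate at a time.
    ngrad = np.zeros_like(theta)
    for i in range(theta.size):
        step = np.zeros_like(theta)
        step[i] = epsilon
        cost_plus, _ = cost_fn(theta + step)
        cost_minus, _ = cost_fn(theta - step)
        ngrad[i] = (cost_plus - cost_minus) / (2.0 * epsilon)
    assert diff_grad(grad, ngrad) < threshold
    return cost, grad, ngrad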
# Network Architecture
visible_size = data.shape[1]
hidden_size = 300

# Training params
weight_decay = 3e-3
sparsity_param = 0.1
beta = 3
max_iter = 500  # Maximum number of iterations of L-BFGS to run

# Get the data
num_samples = data.shape[0]

# set up L-BFGS args
theta = sparse_autoencoder.initialize_params(hidden_size, visible_size)
sae_cost = partial(sparse_autoencoder.cost,
                   visible_size=visible_size,
                   hidden_size=hidden_size,
                   weight_decay=weight_decay,
                   beta=beta,
                   sparsity_param=sparsity_param,
                   data=data.T)

# Train!
trained, cost, d = scipy.optimize.lbfgsb.fmin_l_bfgs_b(sae_cost, theta,
                                                        maxfun=max_iter, m=100,
                                                        factr=1.0, pgtol=1e-100)
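# `trained` comes back from L-BFGS as one flat parameter vector. To inspect or reuse
# the learned features it has to be unpacked into weight matrices and bias vectors.
# A minimal sketch, assuming theta packs W1, W2, b1, b2 in that order (this packing
# convention is an assumption; it is not shown in this section):
def unpack_params(theta, hidden_size, visible_size):
    W1_end = hidden_size * visible_size
    W2_end = 2 * W1_end
    W1 = theta[:W1_end].reshape(hidden_size, visible_size)
    W2 = theta[W1_end:W2_end].reshape(visible_size, hidden_size)
    b1 = theta[W2_end:W2_end + hidden_size]
    b2 = theta[W2_end + hidden_size:]
    return W1, W2, b1, b2

W1, W2, b1, b2 = unpack_params(trained, hidden_size, visible_size)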
softmax_weight_decay = 1e-4
l2_weight_decay = 3e-3
l3_weight_decay = 3e-3
sparsity_param = 0.1
beta = 3
max_iter = 400
num_samples = 1000000

get_data = sample_images.get_mnist_data
train_patches, train_labels = get_data('../data/mnist.pkl.gz', train=True,
                                       num_samples=num_samples)

print 'will train layer 2 model'

# set up L-BFGS args
theta = sparse_autoencoder.initialize_params(hidden_size, visible_size)
sae_cost = partial(sparse_autoencoder.cost,
                   visible_size=visible_size,
                   hidden_size=hidden_size,
                   weight_decay=l2_weight_decay,
                   beta=beta,
                   sparsity_param=sparsity_param,
                   data=train_patches)

# Train!
l2_model, cost, d = scipy.optimize.lbfgsb.fmin_l_bfgs_b(sae_cost, theta,
                                                         maxfun=max_iter, m=1,
                                                         factr=10.0, pgtol=1e-8)
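# The training calls above all go through sparse_autoencoder.cost. A minimal sketch
# of the standard sparse autoencoder objective (sigmoid units, squared-error
# reconstruction, L2 weight decay, and a KL-divergence sparsity penalty), assuming
# `data` holds one example per column and `theta` packs W1, W2, b1, b2 in that
# order. This illustrates the technique; the actual module's implementation may
# differ in details.
import numpy as np

def sketch_sae_cost(theta, visible_size, hidden_size, weight_decay, beta,
                    sparsity_param, data):
    m = data.shape[1]

    # Unpack the flat parameter vector.
    W1_end = hidden_size * visible_size
    W2_end = 2 * W1_end
    W1 = theta[:W1_end].reshape(hidden_size, visible_size)
    W2 = theta[W1_end:W2_end].reshape(visible_size, hidden_size)
    b1 = theta[W2_end:W2_end + hidden_size]
    b2 = theta[W2_end + hidden_size:]

    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

    # Forward pass.
    a2 = sigmoid(W1.dot(data) + b1[:, None])   # hidden activations
    a3 = sigmoid(W2.dot(a2) + b2[:, None])     # reconstruction

    # Average activation of each hidden unit over the batch.
    rho = sparsity_param
    rho_hat = a2.mean(axis=1)

    # Cost = reconstruction error + weight decay + sparsity penalty.
    recon = 0.5 * np.sum((a3 - data) ** 2) / m
    decay = 0.5 * weight_decay * (np.sum(W1 ** 2) + np.sum(W2 ** 2))
    kl = np.sum(rho * np.log(rho / rho_hat)
                + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
    cost = recon + decay + beta * kl

    # Backpropagation, with the sparsity term added to the hidden-layer delta.
    delta3 = (a3 - data) * a3 * (1 - a3)
    sparsity_delta = beta * (-rho / rho_hat + (1 - rho) / (1 - rho_hat))
    delta2 = (W2.T.dot(delta3) + sparsity_delta[:, None]) * a2 * (1 - a2)

    W1_grad = delta2.dot(data.T) / m + weight_decay * W1
    W2_grad = delta3.dot(a2.T) / m + weight_decay * W2
    b1_grad = delta2.sum(axis=1) / m
    b2_grad = delta3.sum(axis=1) / m

    grad = np.concatenate([W1_grad.ravel(), W2_grad.ravel(), b1_grad, b2_grad])
    return cost, grad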
# Sparsity parameter (desired average activation of each hidden layer neuron)
ρ = 0.1
# Weight decay parameter
λ = 3e-3
# Weight of sparsity penalty term
β = 3

############################################################################################
# =============================== Train ================================================== #
############################################################################################

# Initialize parameters of the model
θ = sparse_autoencoder.initialize_params(hidden_size, input_size)

# Declare cost function
J = lambda θ: sparse_autoencoder.calc_cost_n_gradient(θ, input_size, hidden_size,
                                                      λ, ρ, β, images)

# Set training options
options_ = {'maxiter': 1000, 'disp': True}

# Minimize cost function J by modifying parameters θ using L-BFGS-B optimization algo
result = scipy.optimize.minimize(J, θ, method='L-BFGS-B', jac=True, options=options_)

# Get optimized parameter vector
opt_θ = result.x
print(result)
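# With jac=True, scipy.optimize.minimize expects the objective to return a
# (cost, gradient) tuple, which is why calc_cost_n_gradient above must return both.
# A tiny self-contained illustration of that contract using a quadratic objective
# (a hypothetical stand-in for the autoencoder cost, not part of this project):
import numpy as np
import scipy.optimize

def quadratic_cost_and_grad(x):
    cost = 0.5 * np.sum(x ** 2)   # objective value
    grad = x                      # its analytic gradient
    return cost, grad

demo = scipy.optimize.minimize(quadratic_cost_and_grad, np.ones(5),
                               method='L-BFGS-B', jac=True)
print(demo.x)  # should be close to the zero vector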