import pickle
import pstats
import time
from cProfile import Profile

import numpy as np
import pycuda.driver
from pycuda import gpuarray

import hessianfree as hf


def mnist(model_args=None, run_args=None):
    """Test on the MNIST (digit classification) dataset."""

    # download dataset at http://deeplearning.net/data/mnist/mnist.pkl.gz
    with open("mnist.pkl", "rb") as f:
        train, _, test = pickle.load(f)

    if model_args is None:
        ff = hf.FFNet([28 * 28, 1024, 512, 256, 32, 10],
                      layers=([hf.nl.Linear()] + [hf.nl.ReLU()] * 4 +
                              [hf.nl.Softmax()]),
                      use_GPU=True, debug=False)
    else:
        ff = hf.FFNet([28 * 28, 1024, 512, 256, 32, 10],
                      layers=([hf.nl.Linear()] + [hf.nl.ReLU()] * 4 +
                              [hf.nl.Softmax()]),
                      **model_args)

    inputs = train[0]
    # soft targets: 0.91 for the correct digit, 0.01 for the others
    targets = np.zeros((inputs.shape[0], 10), dtype=np.float32)
    targets[np.arange(inputs.shape[0]), train[1]] = 0.9
    targets += 0.01

    tmp = np.zeros((test[0].shape[0], 10), dtype=np.float32)
    tmp[np.arange(test[0].shape[0]), test[1]] = 0.9
    tmp += 0.01
    test = (test[0], tmp)

    if run_args is None:
        ff.run_batches(inputs, targets,
                       optimizer=hf.opt.HessianFree(CG_iter=250,
                                                    init_damping=45),
                       batch_size=7500, test=test, max_epochs=1000,
                       test_err=hf.loss_funcs.ClassificationError(),
                       plotting=True)
    else:
        CG_iter = run_args.pop("CG_iter", 250)
        init_damping = run_args.pop("init_damping", 45)
        ff.run_batches(inputs, targets,
                       optimizer=hf.opt.HessianFree(CG_iter, init_damping),
                       test=test,
                       test_err=hf.loss_funcs.ClassificationError(),
                       **run_args)

    output = ff.forward(test[0], ff.W)
    print("classification error",
          hf.loss_funcs.ClassificationError().batch_loss(output, test[1]))
def threshold_calc_G():
    """Compare GPU vs CPU performance on feedforward curvature calculation.

    This can be used to determine whether it is better to run some target
    network on the CPU or GPU."""

    batch_size = list(range(256, 1025, 256))
    layer_size = [1] + list(range(64, 513, 64))
    reps = 100

    times = np.zeros((len(batch_size), len(layer_size), 2))
    for i, b in enumerate(batch_size):
        inputs = np.random.randn(b, 1).astype(np.float32)
        targets = np.random.randn(b, 1).astype(np.float32)

        for j, n in enumerate(layer_size):
            # time the CPU implementation
            ff = hf.FFNet([1, n, n, 1], use_GPU=False)
            ff.cache_minibatch(inputs, targets)
            v = np.random.randn(ff.W.size).astype(np.float32)

            for _ in range(5):
                # warm up to get rid of any startup overhead
                ff.calc_G(v)

            start = time.time()
            for _ in range(reps):
                ff.calc_G(v)
            times[i, j, 0] = time.time() - start

            # time the GPU implementation
            ff = hf.FFNet([1, n, n, 1], use_GPU=True)
            ff.cache_minibatch(inputs, targets)
            v = gpuarray.to_gpu(v)

            for _ in range(5):
                ff.GPU_calc_G(v)

            start = time.time()
            for _ in range(reps):
                ff.GPU_calc_G(v)
            v = v.get()  # copy back to host (also synchronizes the GPU)
            times[i, j, 1] = time.time() - start

            print("b", b, "n", n, "times", times[i, j])

    print(times[..., 1] - times[..., 0])

    print("batch size (%s) vs layer size (%s)" % (batch_size, layer_size))
    print("  (True indicates GPU is faster)")
    print(times[..., 1] < times[..., 0])
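# A hypothetical helper built on the timing sweep above (a sketch, not part
# of the library): given the `times`, `batch_size`, and `layer_size` values
# computed in threshold_calc_G, report the smallest layer size at which the
# GPU first beats the CPU for each batch size.
def gpu_threshold(times, batch_size, layer_size):
    for i, b in enumerate(batch_size):
        faster = times[i, :, 1] < times[i, :, 0]  # True where GPU won
        if faster.any():
            n = layer_size[np.argmax(faster)]  # first True entry
            print("batch size", b, ": GPU faster from layer size", n)
        else:
            print("batch size", b, ": CPU faster at all tested sizes")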
def test_ff_CG(use_GPU):
    rng = np.random.RandomState(0)
    inputs = rng.randn(100, 1).astype(np.float32)
    targets = rng.randn(100, 1).astype(np.float32)

    ff = hf.FFNet([1, 10, 1], debug=False, use_GPU=use_GPU, rng=rng)
    ff.optimizer = hf.opt.HessianFree()
    ff.cache_minibatch(inputs, targets)

    deltas = ff.optimizer.conjugate_gradient(
        np.zeros(ff.W.size, dtype=np.float32), ff.calc_grad(), iters=20,
        printing=False)

    assert deltas[0][0] == 3
    assert np.allclose(
        deltas[0][1],
        [-0.01693734, 0.00465961, 0.00173045, -0.00414165, -0.03843474,
         0.00636764, 0.01423731, -0.00433618, -0.00335347, 0.00935241,
         0.01242893, -0.00339621, -0.00137015, 0.00311182, 0.02883433,
         -0.00534688, -0.01032545, 0.00328636, 0.00244868, -0.00678817,
         -0.02461342, -0.02293827, -0.00737021, -0.01145663, -0.0116213,
         -0.03512985, -0.02004906, -0.02885171, -0.01596764, -0.02105034,
         -0.03943678],
        atol=1e-5)
def connections():
    """A network with non-standard connectivity between layers."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 5, 1], layers=hf.nl.Tanh(),
                  conns={0: [1, 2], 1: [2, 3], 2: [3]})

    ff.run_epochs(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                  max_epochs=40, plotting=True)

    outputs = ff.forward(inputs)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
def crossentropy():
    """Example of a network using cross-entropy error."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[1, 0], [0, 1], [0, 1], [1, 0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 2],
                  layers=[hf.nl.Linear(), hf.nl.Tanh(), hf.nl.Softmax()],
                  loss_type=hf.loss_funcs.CrossEntropy())

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                   max_epochs=40, plotting=True)

    # using gradient descent (for comparison)
    # ff.run_batches(inputs, targets, optimizer=hf.opt.SGD(l_rate=1),
    #                max_epochs=10000, plotting=True)

    outputs = ff.forward(inputs, ff.W)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
def sparsity():
    """Example of a network with a loss function imposing sparsity on the
    neural activities."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[1, 0], [0, 1], [0, 1], [1, 0]], dtype=np.float32)

    ff = hf.FFNet([2, 8, 2],
                  layers=[hf.nl.Linear(), hf.nl.Logistic(), hf.nl.Softmax()],
                  loss_type=[hf.loss_funcs.CrossEntropy(),
                             hf.loss_funcs.SparseL1(0.1, target=0)])
    # TODO: change this to SparseL2

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=10),
                   max_epochs=100, plotting=True)

    # using gradient descent (for comparison)
    # ff.run_batches(inputs, targets, optimizer=hf.opt.SGD(l_rate=1.0),
    #                max_epochs=10000, plotting=True)

    output = ff.forward(inputs, ff.W)
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", output[-1][i])
        print("activity", np.mean(output[1][i]))
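# Per the TODO above, swapping in the L2 sparsity penalty (used in
# test_sparsity further down) would look like:
# loss_type=[hf.loss_funcs.CrossEntropy(),
#            hf.loss_funcs.SparseL2(0.01, target=0)]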
def test_testerr(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0, 1], [1, 0], [1, 0], [0, 1]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 2],
                  layers=[hf.nl.Linear(), hf.nl.Tanh(), hf.nl.Softmax()],
                  debug=True, loss_type=hf.loss_funcs.CrossEntropy(),
                  use_GPU=use_GPU)

    err = hf.loss_funcs.ClassificationError()
    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=50),
                   max_epochs=100, test_err=err, target_err=-1,
                   print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-4
    print(outputs[-1])
    assert err.batch_loss(outputs, targets) == 0.0
def xor(use_hf=True):
    """Run a basic xor training test.

    :param bool use_hf: if True run example using Hessian-free optimization,
        otherwise use stochastic gradient descent
    """

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1])

    if use_hf:
        ff.run_epochs(inputs, targets,
                      optimizer=hf.opt.HessianFree(CG_iter=2),
                      max_epochs=40, plotting=True)
    else:
        # using gradient descent (for comparison)
        ff.run_epochs(inputs, targets, optimizer=hf.opt.SGD(l_rate=1),
                      max_epochs=10000, plotting=True)

    outputs = ff.forward(inputs)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
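# A minimal usage sketch for the xor demo above; both calls display training
# progress when plotting is available.
# xor(use_hf=True)   # Hessian-free optimization (40 epochs)
# xor(use_hf=False)  # stochastic gradient descent baseline (10000 epochs)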
def connections():
    """Example of a network with non-standard connectivity between layers."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 5, 1], layers=hf.nl.Tanh(),
                  conns={0: [1, 2], 1: [2, 3], 2: [3]})

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                   max_epochs=40, plotting=True)

    # using gradient descent (for comparison)
    # ff.run_batches(inputs, targets, optimizer=hf.opt.SGD(l_rate=1),
    #                max_epochs=10000, plotting=True)

    outputs = ff.forward(inputs, ff.W)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
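# The conns dict maps each layer index to the list of layers it feeds into.
# As a hypothetical variant, a skip connection from the input straight to
# the output layer (alongside the normal feedforward path) would be:
# ff = hf.FFNet([2, 5, 5, 1], layers=hf.nl.Tanh(),
#               conns={0: [1, 3], 1: [2], 2: [3]})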
def test_ff_calc_G(dtype):
    inputs = np.random.randn(1000, 1).astype(dtype)

    ff = hf.FFNet([1, 10, 1], debug=(dtype == np.float64), use_GPU=True)
    ff.cache_minibatch(inputs, inputs)

    v = np.random.randn(ff.W.size).astype(dtype)
    gpu_Gv = ff.GPU_calc_G(v)
    cpu_Gv = ff.calc_G(v)

    assert np.allclose(gpu_Gv, cpu_Gv, rtol=1e-4)
def test_stripped_batch(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1], debug=True, use_GPU=use_GPU)
    W_copy = ff.W.copy()

    ff.run_epochs(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                  max_epochs=20, print_period=None)

    # run the same optimization through the stripped-down per-epoch
    # interface, starting from the same initial weights
    ff2 = hf.FFNet([2, 5, 1], debug=True, use_GPU=use_GPU,
                   load_weights=W_copy)
    ff2.optimizer = hf.opt.HessianFree(CG_iter=2)
    for _ in range(20):
        ff2._run_epoch(inputs, targets)

    assert np.allclose(ff.forward(inputs)[-1], ff2.forward(inputs)[-1])
def test_SGD(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1], debug=False, use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.SGD(l_rate=1),
                   max_epochs=10000, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-3
def test_xor(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1], debug=True, use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                   max_epochs=40, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-5
def test_softlif(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0.1], [1], [1], [0.1]], dtype=np.float32)

    lifs = hf.nl.SoftLIF(sigma=1, tau_ref=0.002, tau_rc=0.02, amp=0.01)
    ff = hf.FFNet([2, 10, 1], layers=lifs, debug=True, use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=50),
                   max_epochs=50, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-5
def test_asym_dact(use_GPU):
    # a nonlinearity whose d_activation is a full (non-diagonal) Jacobian
    class Roll(hf.nl.Nonlinearity):
        def activation(self, x):
            return np.roll(x, 1, axis=-1)

        def d_activation(self, x, _):
            d_act = np.roll(np.eye(x.shape[-1], dtype=x.dtype), 1, axis=0)
            return np.resize(d_act, np.concatenate((x.shape[:-1],
                                                    d_act.shape)))

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1], layers=Roll(), debug=True, use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                   max_epochs=40, print_period=None)
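# By contrast, a custom nonlinearity with the usual diagonal Jacobian can
# just return the elementwise derivative, as the built-in nonlinearities do
# (a hypothetical sketch following the Nonlinearity interface used by Roll
# above; the elementwise return convention is an assumption):
class Cube(hf.nl.Nonlinearity):
    def activation(self, x):
        return x ** 3

    def d_activation(self, x, a):
        # derivative of x**3 with respect to x, same shape as x
        return 3 * x ** 2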
def profile_calc_G(cprofile=True):
    """Run a profiler on the feedforward curvature calculation.

    :param bool cprofile: use True if profiling on the CPU, False if using
        the CUDA profiler
    """

    inputs = np.random.randn(1024, 1).astype(np.float32)
    targets = np.random.randn(1024, 1).astype(np.float32)
    N = 1024

    ff = hf.FFNet([1, N, N, 1], use_GPU=True)
    ff.cache_minibatch(inputs, targets)
    v = np.random.randn(ff.W.size).astype(np.float32)

    for _ in range(5):
        # run it a few times to get rid of any startup overhead
        ff.GPU_calc_G(v)

    if cprofile:
        start = time.time()
        p = Profile()
        p.enable()
    else:
        pycuda.driver.start_profiler()

    for _ in range(500):
        _ = ff.GPU_calc_G(v)

    if cprofile:
        p.disable()
        print("time", time.time() - start)
        ps = pstats.Stats(p)
        ps.strip_dirs().sort_stats('time').print_stats(20)
    else:
        pycuda.driver.stop_profiler()
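# Usage sketch: with cprofile=True the pstats summary prints directly; with
# cprofile=False, run the script under NVIDIA's profiler so the
# start_profiler/stop_profiler markers take effect (module name below is
# hypothetical):
#   nvprof python -c "import demos; demos.profile_calc_G(cprofile=False)"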
def test_sparsity(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 8, 1], debug=True, use_GPU=use_GPU,
                  loss_type=[hf.loss_funcs.SquaredError(),
                             hf.loss_funcs.SparseL2(0.01, target=0)])

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=50),
                   max_epochs=100, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-2
    assert np.mean(outputs[1]) < 0.1
def test_connections(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 5, 1], layers=hf.nl.Tanh(), debug=True,
                  conns={0: [1, 2], 1: [3], 2: [3]}, use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=50),
                   max_epochs=50, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-5
def test_crossentropy(use_GPU):
    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0],
                          [0, 0, 0, 1]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 4],
                  layers=[hf.nl.Linear(), hf.nl.Tanh(), hf.nl.Softmax()],
                  debug=True, loss_type=hf.loss_funcs.CrossEntropy(),
                  use_GPU=use_GPU)

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=50),
                   max_epochs=100, print_period=None)

    outputs = ff.forward(inputs, ff.W)

    assert ff.loss.batch_loss(outputs, targets) < 1e-5
def crossentropy():
    """A network that modifies the layer types and loss function."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[1, 0], [0, 1], [0, 1], [1, 0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 2],
                  layers=[hf.nl.Linear(), hf.nl.Tanh(), hf.nl.Softmax()],
                  loss_type=hf.loss_funcs.CrossEntropy())

    ff.run_epochs(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                  max_epochs=40, plotting=True)

    outputs = ff.forward(inputs)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
def xor():
    """Run a basic xor training test."""

    inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

    ff = hf.FFNet([2, 5, 1])

    ff.run_batches(inputs, targets, optimizer=hf.opt.HessianFree(CG_iter=2),
                   max_epochs=40, plotting=True)

    # using gradient descent (for comparison)
    # ff.run_batches(inputs, targets, optimizer=hf.opt.SGD(l_rate=1),
    #                max_epochs=10000, plotting=True)

    outputs = ff.forward(inputs, ff.W)[-1]
    for i in range(4):
        print("-" * 20)
        print("input", inputs[i])
        print("target", targets[i])
        print("output", outputs[i])
# helper to display the shapes of a list of weight arrays
pshape = lambda a_list: [w.shape for w in a_list]

# define hyperparameters
n_nodes = [42, 24, 12, 1]  # number of units per layer
# all ReLU except linear for the output layer
layers = [hf.nl.ReLU()] * (len(n_nodes) - 1) + [hf.nl.Linear()]
batch_size = 1024

# initialize a Hessian-free model (GPU use is optional); X/ret and
# X_val/ret_val are the training and validation data, defined elsewhere
ff = hf.FFNet(n_nodes, layers=layers,
              loss_type=hf.loss_funcs.SquaredError(),
              W_init_params={"coeff": 1.0, "biases": 1.0,
                             "init_type": 'gaussian'},
              use_GPU=False)

ff.run_epochs(X, ret, test=(X_val, ret_val), minibatch_size=batch_size,
              optimizer=hf.opt.HessianFree(CG_iter=2), max_epochs=50,
              plotting=True, print_period=None)

print('After fitting on the training set for 50 epochs, the trained '
      'weights are stored in ff.W')
def mnist(model_args=None, run_args=None):
    """Test on the MNIST (digit classification) dataset.

    Download dataset at http://deeplearning.net/data/mnist/mnist.pkl.gz

    :param dict model_args: kwargs that will be passed to the
        :class:`.FFNet` constructor
    :param dict run_args: kwargs that will be passed to :meth:`.run_epochs`
    """

    with open("mnist.pkl", "rb") as f:
        try:
            train, _, test = pickle.load(f)
        except UnicodeDecodeError:
            # python 3
            with open("mnist.pkl", "rb") as f2:
                train, _, test = pickle.load(f2, encoding="bytes")

    if model_args is None:
        ff = hf.FFNet([28 * 28, 1024, 512, 256, 32, 10],
                      layers=([hf.nl.Linear()] + [hf.nl.ReLU()] * 4 +
                              [hf.nl.Softmax()]),
                      use_GPU=True, debug=False)
    else:
        ff = hf.FFNet([28 * 28, 1024, 512, 256, 32, 10],
                      layers=([hf.nl.Linear()] + [hf.nl.ReLU()] * 4 +
                              [hf.nl.Softmax()]),
                      **model_args)

    inputs = train[0]
    targets = np.zeros((inputs.shape[0], 10), dtype=np.float32)
    targets[np.arange(inputs.shape[0]), train[1]] = 0.9
    targets += 0.01

    tmp = np.zeros((test[0].shape[0], 10), dtype=np.float32)
    tmp[np.arange(test[0].shape[0]), test[1]] = 0.9
    tmp += 0.01
    test = (test[0], tmp)

    if run_args is None:
        ff.run_epochs(inputs, targets,
                      optimizer=hf.opt.HessianFree(CG_iter=250,
                                                   init_damping=45),
                      minibatch_size=7500, test=test, max_epochs=125,
                      test_err=hf.loss_funcs.ClassificationError(),
                      plotting=True)
    else:
        CG_iter = run_args.pop("CG_iter", 250)
        init_damping = run_args.pop("init_damping", 45)
        ff.run_epochs(inputs, targets,
                      optimizer=hf.opt.HessianFree(CG_iter, init_damping),
                      test=test,
                      test_err=hf.loss_funcs.ClassificationError(),
                      **run_args)

    output = ff.forward(test[0])
    print("classification error",
          hf.loss_funcs.ClassificationError().batch_loss(output, test[1]))
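# Usage sketch for the mnist demo above (argument values are illustrative;
# run_args entries other than CG_iter/init_damping pass through to
# run_epochs):
# mnist(model_args={"use_GPU": False, "debug": False},
#       run_args={"CG_iter": 100, "init_damping": 45,
#                 "minibatch_size": 1000, "max_epochs": 10,
#                 "plotting": False})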