def threshold_rnn_calc_G():
    """Compare GPU vs CPU performance on recurrent curvature calculation.

    This can be used to determine whether it is better to run some target
    network on the CPU or GPU."""

    batch_size = 1024
    layer_size = [1] + range(32, 129, 32)
    sig_len = [1] + range(8, 33, 8)
    reps = 100

    times = np.zeros((len(sig_len), len(layer_size), 2))
    for i, b in enumerate(sig_len):
        inputs = np.random.randn(batch_size, b, 1).astype(np.float32)
        targets = np.random.randn(batch_size, b, 1).astype(np.float32)

        for j, n in enumerate(layer_size):
            rnn = hf.RNNet([1, n, 1], use_GPU=False)
            rnn.cache_minibatch(inputs, targets)

            v = np.random.randn(rnn.W.size).astype(np.float32)

            for _ in range(5):
                # run a few times first to get rid of any startup overhead
                rnn.calc_G(v)

            start = time.time()
            for _ in range(reps):
                rnn.calc_G(v)
            times[i, j, 0] = time.time() - start

            rnn = hf.RNNet([1, n, 1], use_GPU=True)
            rnn.cache_minibatch(inputs, targets)

            v = gpuarray.to_gpu(v)

            for _ in range(5):
                rnn.GPU_calc_G(v)

            start = time.time()
            for _ in range(reps):
                rnn.GPU_calc_G(v)
            # the transfer back to host blocks until all queued GPU work has
            # finished, so the timing includes the full computation
            v = v.get()
            times[i, j, 1] = time.time() - start

            print "b", b, "n", n, "times", times[i, j]

    print times[..., 1] - times[..., 0]
    print "signal length (%s) versus layer size (%s)" % (sig_len, layer_size)
    print " (True indicates GPU is faster)"
    print times[..., 1] < times[..., 0]

def test_asym_dact(use_GPU):
    class Roll(hf.nl.Nonlinearity):
        def activation(self, x):
            return np.roll(x, 1, axis=-1)

        def d_activation(self, x, _):
            d_act = np.roll(np.eye(x.shape[-1], dtype=x.dtype), 1, axis=0)
            return np.resize(d_act, np.concatenate((x.shape[:-1],
                                                    d_act.shape)))

    n_inputs = 3
    sig_len = 5
    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    rnn = hf.RNNet(shape=[1, 5, 1], layers=Roll(), debug=True,
                   use_GPU=use_GPU)

    rnn.run_batches(inputs, targets, optimizer=HessianFree(CG_iter=100),
                    max_epochs=30, print_period=None)

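# A standalone illustration (hypothetical helper, not part of the test
# suite) of why ``Roll`` needs a full Jacobian: the activation permutes the
# last axis, so its derivative is a shifted (asymmetric) identity matrix
# rather than the diagonal produced by an elementwise nonlinearity.
def _roll_jacobian_sketch():
    x = np.array([[1.0, 2.0, 3.0]])
    shifted = np.roll(x, 1, axis=-1)  # -> [[3., 1., 2.]]

    # the Jacobian of the shift is a permuted identity matrix
    d_act = np.roll(np.eye(3), 1, axis=0)

    # applying the Jacobian reproduces the activation
    assert np.allclose(shifted, np.dot(x, d_act.T))
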
def test_continuous(use_GPU):
    n_inputs = 3
    sig_len = 5
    nl = Continuous(Logistic(), tau=np.random.uniform(1, 3, size=5), dt=0.9)
    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    rnn = hf.RNNet(shape=[1, 5, 1], layers=[Linear(), nl, Logistic()],
                   debug=True, use_GPU=use_GPU)

    rnn.run_batches(inputs, targets, optimizer=HessianFree(CG_iter=100),
                    max_epochs=30, print_period=None)

    outputs = rnn.forward(inputs, rnn.W)

    assert rnn.loss.batch_loss(outputs, targets) < 1e-4

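# For intuition, a rough standalone sketch (hypothetical helper) of the
# low-pass dynamics that ``Continuous`` wraps around a base nonlinearity,
# using the standard leaky-integrator form
# s_{t+1} = s_t + (dt / tau) * (f(x_t) - s_t). This is shown only as a
# sketch; see ``hf.nl.Continuous`` for the actual implementation, which may
# differ in details.
def _leaky_integrator_sketch(f, xs, tau, dt, s=0.0):
    states = []
    for x in xs:
        s = s + (dt / tau) * (f(x) - s)  # state decays toward f(x)
        states.append(s)
    return states
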
def test_strucdamping(use_GPU):
    n_inputs = 3
    sig_len = 5
    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    optimizer = HessianFree(CG_iter=100)
    rnn = hf.RNNet(shape=[1, 5, 1],
                   loss_type=[hf.loss_funcs.SquaredError(),
                              hf.loss_funcs.StructuralDamping(
                                  0.1, optimizer=optimizer)],
                   debug=True, use_GPU=use_GPU)

    rnn.run_batches(inputs, targets, optimizer=optimizer, max_epochs=30,
                    print_period=None)

    outputs = rnn.forward(inputs, rnn.W)

    assert rnn.loss.batch_loss(outputs, targets) < 1e-4

def test_rnn_CG(use_GPU):
    rng = np.random.RandomState(0)
    inputs = rng.randn(100, 10, 2).astype(np.float32)
    targets = rng.randn(100, 10, 1).astype(np.float32)

    rnn = hf.RNNet([2, 5, 1], debug=False, use_GPU=use_GPU, rng=rng)
    rnn.optimizer = hf.opt.HessianFree()
    rnn.cache_minibatch(inputs, targets)

    deltas = rnn.optimizer.conjugate_gradient(
        np.zeros(rnn.W.size, dtype=np.float32), rnn.calc_grad(), iters=20,
        printing=False)

    assert deltas[1][0] == 6
    assert np.allclose(
        deltas[1][1],
        [2.88910931e-03, -1.08404364e-02, 6.17342826e-04, -1.85968506e-03,
         1.71574634e-02, 3.08436429e-04, -5.35693355e-02, -2.39962409e-03,
         5.33994753e-03, 3.52956937e-03, 1.83414537e-02, -1.20746918e-01,
         4.14435379e-03, 5.21760620e-03, 7.41007701e-02, -2.86964715e-01,
         -2.21885830e-01, -3.84823292e-01, -2.63742000e-01, -9.64779630e-02,
         -4.55241114e-01, 9.68043320e-03, -5.81301711e-02, 1.87756377e-03,
         3.52657953e-05, 3.19301970e-02, 7.79627683e-03, -4.76030372e-02,
         1.58238632e-03, 1.87149423e-03, 2.43508108e-02, 1.32407937e-02,
         -8.43726397e-02, 2.58994917e-03, 2.43114564e-03, 4.95423339e-02,
         1.13963615e-02, -7.54035711e-02, 2.11156602e-03, 4.81781084e-03,
         4.49908487e-02, 4.63910261e-03, -3.11208423e-02, 1.24892767e-03,
         2.63486174e-03, 1.77674163e-02, 1.60023139e-03, -1.40727460e-02,
         7.28542393e-04, 6.10395044e-04, 1.20819537e-02],
        atol=1e-5)

def test_rnn_calc_G(dtype):
    inputs = np.random.randn(1000, 10, 1).astype(dtype)
    rnn = hf.RNNet([1, 10, 1], debug=(dtype == np.float64), use_GPU=True)
    rnn.cache_minibatch(inputs, inputs)
    rnn.optimizer = hf.opt.HessianFree()

    v = np.random.randn(rnn.W.size).astype(dtype)
    gpu_Gv = rnn.GPU_calc_G(v)
    cpu_Gv = rnn.calc_G(v)

    assert np.allclose(gpu_Gv, cpu_Gv, rtol=1e-4)

def test_truncation(use_GPU):
    n_inputs = 2
    sig_len = 6

    inputs = np.ones((n_inputs, sig_len, 1), dtype=np.float32) * 0.5
    targets = np.ones((n_inputs, sig_len, 1), dtype=np.float32) * 0.5

    rnn = hf.RNNet(shape=[1, 8, 1], debug=True, use_GPU=use_GPU,
                   truncation=(3, 3))

    rnn.run_epochs(inputs, targets, optimizer=HessianFree(CG_iter=100),
                   max_epochs=10, print_period=None)

def profile_rnn_calc_G(cprofile=True):
    """Run a profiler on the recurrent curvature calculation.

    :param bool cprofile: use True if profiling on the CPU, False if using
        the CUDA profiler
    """

    inputs = np.random.randn(1024, 128, 1).astype(np.float32)
    targets = np.random.randn(1024, 128, 1).astype(np.float32)
    N = 128

    rnn = hf.RNNet([1, N, 1], use_GPU=True)
    rnn.optimizer = hf.opt.HessianFree()  # for struc_damping check
    rnn.cache_minibatch(inputs, targets)

    v = np.random.randn(rnn.W.size).astype(np.float32)

    for _ in range(2):
        # run it a few times to get rid of any startup overhead
        rnn.GPU_calc_G(v)

    if cprofile:
        start = time.time()

        p = Profile()
        p.enable()
    else:
        pycuda.driver.start_profiler()

    for _ in range(100):
        _ = rnn.GPU_calc_G(v)

    if cprofile:
        p.disable()

        print "time", time.time() - start

        ps = pstats.Stats(p)
        ps.strip_dirs().sort_stats('time').print_stats(20)
    else:
        pycuda.driver.stop_profiler()

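# Example invocations (a sketch; assumes this module is importable as
# ``profiling``). ``nvprof`` is NVIDIA's command-line CUDA profiler, which
# collects the events emitted between start_profiler/stop_profiler when
# cprofile=False:
#
#   python -c "import profiling; profiling.profile_rnn_calc_G(cprofile=True)"
#   nvprof python -c "import profiling; profiling.profile_rnn_calc_G(False)"
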
def test_truncation(use_GPU):
    n_inputs = 2
    sig_len = 6

    inputs = np.ones((n_inputs, sig_len, 1), dtype=np.float32) * 0.5
    targets = np.ones((n_inputs, sig_len, 1), dtype=np.float32) * 0.5

    rnn = hf.RNNet(shape=[1, 5, 1], debug=True, use_GPU=use_GPU,
                   truncation=(3, 3))

    rnn.run_batches(inputs, targets, optimizer=HessianFree(CG_iter=100),
                    max_epochs=30, print_period=None)

    outputs = rnn.forward(inputs, rnn.W)

    assert rnn.loss.batch_loss(outputs, targets) < 1e-4

def test_integrator(use_GPU):
    n_inputs = 3
    sig_len = 5
    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    rnn = hf.RNNet(shape=[1, 5, 1], debug=True, use_GPU=use_GPU)

    rnn.run_batches(inputs, targets, optimizer=HessianFree(CG_iter=100),
                    max_epochs=30, print_period=None)

    outputs = rnn.forward(inputs, rnn.W)

    assert rnn.loss.batch_loss(outputs, targets) < 1e-4

def plant(plots=True):
    """Example of a network using a dynamic plant as the output layer."""

    n_inputs = 32
    sig_len = 15

    class Plant(hf.nl.Plant):
        # this plant implements a simple dynamic system, with two-dimensional
        # state representing [position, velocity]

        def __init__(self, A, B, targets, init_state):
            super(Plant, self).__init__()

            self.A = np.asarray(A)
            self.B = B

            self.targets = targets
            self.init_state = init_state

            self.shape = [n_inputs, sig_len, len(A)]

            # derivative of output with respect to state (constant, so just
            # compute it once here)
            self.d_output = np.resize(np.eye(self.shape[-1]),
                                      (n_inputs, self.shape[-1],
                                       self.shape[-1], 1))

            self.reset()

        def activation(self, x):
            self.act_count += 1

            # this implements a basic s_{t+1} = A*s_t + B*x dynamic system.
            # but to make things a little more complicated we allow the B
            # matrix to be dynamic, so it's actually
            # s_{t+1} = A*s_t + B(s_t)*x

            self.B_matrix, self.d_B_matrix = self.B(self.state)

            self.state = (np.dot(self.state, self.A) +
                          np.einsum("ij,ijk->ik", x, self.B_matrix))

            return self.state[:x.shape[0]]

        def d_activation(self, x, _):
            self.d_act_count += 1
            assert self.act_count == self.d_act_count

            # derivative of state with respect to input
            d_input = self.B_matrix.transpose((0, 2, 1))[..., None]

            # derivative of state with respect to previous state
            d_state = np.resize(self.A.T, np.concatenate(([x.shape[0]],
                                                          self.A.shape)))
            d_state[:, 1, 0] += x[:, 1] * self.d_B_matrix[:, 1, 1]
            d_state = d_state[..., None]

            return np.concatenate((d_input, d_state, self.d_output), axis=-1)

        def __call__(self, _):
            self.inputs = np.concatenate((self.inputs,
                                          self.state[:, None, :]), axis=1)
            return self.state

        def get_inputs(self):
            return self.inputs

        def get_targets(self):
            return self.targets

        def reset(self, init=None):
            self.act_count = 0
            self.d_act_count = 0
            self.state = (self.init_state.copy() if init is None else
                          init.copy())
            self.inputs = np.zeros((self.shape[0], 0, self.shape[-1]),
                                   dtype=np.float32)
            self.B_matrix = self.d_B_matrix = None

    # static A matrix (converts velocity into a change in position)
    A = [[1, 0],
         [0.2, 1]]

    # dynamic B(s) matrix (converts input into velocity, modulated by current
    # state)
    # note that this dynamic B matrix doesn't really make much sense, it's
    # just here to demonstrate what happens with a plant whose dynamics
    # change over time
    def B(state):
        B = np.zeros((state.shape[0], state.shape[1], state.shape[1]))
        B[:, 1, 1] = np.tanh(state[:, 0])

        d_B = np.zeros((state.shape[0], state.shape[1], state.shape[1]))
        d_B[:, 1, 1] = 1 - np.tanh(state[:, 0]) ** 2

        return B, d_B

    # random initial position and velocity
    init_state = np.random.uniform(-0.5, 0.5, size=(n_inputs, 2))

    # the target will be to end at position 1 with velocity 0
    targets = np.ones((n_inputs, sig_len, 2), dtype=np.float32)
    targets[:, :, 1] = 0
    targets[:, :-1, :] = np.nan

    plant = Plant(A, B, targets, init_state)

    rnn = hf.RNNet(shape=[2, 16, 2],
                   layers=[hf.nl.Linear(), hf.nl.Tanh(), plant],
                   W_init_params={"coeff": 0.1}, W_rec_params={"coeff": 0.1},
                   rng=np.random.RandomState(0))

    rnn.run_batches(plant, None,
                    hf.opt.HessianFree(CG_iter=20, init_damping=10),
                    max_epochs=150, plotting=plots)

    # using gradient descent (for comparison)
    # rnn.run_batches(plant, None, optimizer=SGD(l_rate=0.01),
    #                 batch_size=None, test=test, max_epochs=10000,
    #                 plotting=True)

    if plots:
        outputs = rnn.forward(plant, rnn.W)[-1]

        plt.figure()
        plt.plot(outputs[:, :, 0].squeeze().T)
        plt.title("position")

        plt.figure()
        plt.plot(outputs[:, :, 1].squeeze().T)
        plt.title("velocity")

        plt.show()

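# A minimal standalone sketch (hypothetical helper, independent of the demo
# above) of the plant dynamics with zero control input: with the A matrix
# used above, each step adds 0.2 * velocity to the position while the
# velocity is unchanged.
def _plant_dynamics_sketch(steps=5):
    A = np.array([[1, 0], [0.2, 1]])
    s = np.array([0.0, 1.0])  # position 0, velocity 1
    trajectory = [s.copy()]
    for _ in range(steps):
        s = np.dot(s, A)  # zero input: s_{t+1} = A * s_t
        trajectory.append(s.copy())
    # position grows by 0.2 each step, velocity stays at 1
    return trajectory
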
def adding(T=50, plots=True):
    """The canonical "adding" test of long-range dependency learning for
    RNNs."""

    # set up inputs
    N = 100000
    test_cut = int(N * 0.9)

    vals = np.random.uniform(0, 1, size=(N, T, 1)).astype(np.float32)

    # for each sequence, mark two random timesteps (one in the first tenth
    # of the signal, one between T/10 and T/2) whose values are to be added
    mask = np.zeros((N, T, 1), dtype=np.float32)
    for m in mask:
        m[np.random.randint(T / 10)] = 1
        m[np.random.randint(T / 10, T / 2)] = 1

    inputs = np.concatenate((vals, mask), axis=-1)

    # the target (defined only on the final timestep) is the sum of the two
    # marked values
    tmp = np.zeros_like(vals)
    tmp[mask.astype(np.bool)] = vals[mask.astype(np.bool)]
    targets = np.zeros((N, T, 1), dtype=np.float32)
    targets[:] = np.nan
    targets[:, -1] = np.sum(tmp, axis=1, dtype=np.float32)

    test = (inputs[test_cut:], targets[test_cut:])

    # build network
    optimizer = hf.opt.HessianFree(CG_iter=60, init_damping=20)
    W_init_params = {"coeff": 0.25}
    rnn = hf.RNNet(shape=[2, 32, 64, 1],
                   layers=[hf.nl.Linear(), hf.nl.ReLU(),
                           hf.nl.Continuous(hf.nl.ReLU(), tau=20),
                           hf.nl.ReLU()],
                   W_init_params=W_init_params,
                   loss_type=[hf.loss_funcs.SquaredError(),
                              hf.loss_funcs.StructuralDamping(
                                  1e-4, layers=[2], optimizer=optimizer)],
                   rec_layers=[2], use_GPU=True, debug=False,
                   rng=np.random.RandomState(0))

    # scale spectral radius of recurrent weights
    W, _ = rnn.get_weights(rnn.W, (2, 2))
    W *= 1.0 / np.max(np.abs(np.linalg.eigvals(W)))

    rnn.run_batches(inputs[:test_cut], targets[:test_cut],
                    optimizer=optimizer, batch_size=1024, test=test,
                    max_epochs=50, plotting=plots,
                    test_err=hf.loss_funcs.SquaredError())

    if plots:
        outputs = rnn.forward(inputs[:20], rnn.W)

        plt.figure()
        lines = plt.plot(outputs[-1][:].squeeze().T)
        plt.scatter(np.ones(outputs[-1].shape[0]) * outputs[-1].shape[1],
                    targets[:20, -1],
                    c=[plt.getp(l, "color") for l in lines])
        plt.title("outputs")

        plt.show()

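# A small standalone check (hypothetical helper) of the target construction
# above: the target for each sequence is the sum of exactly the two input
# values flagged by the mask, presented only at the final timestep.
def _adding_target_sketch():
    vals = np.array([0.3, 0.7, 0.1, 0.9])[:, None]
    mask = np.array([1.0, 0.0, 1.0, 0.0])[:, None]
    target = np.sum(vals[mask.astype(bool)])
    assert np.isclose(target, 0.3 + 0.1)
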
def integrator(model_args=None, run_args=None, n_inputs=15, sig_len=10,
               plots=True):
    """Example of a recurrent network, implementing an integrator."""

    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    test = (inputs, targets)

    if model_args is None:
        rnn = hf.RNNet(shape=[1, 10, 1], layers=hf.nl.Logistic(),
                       debug=False, use_GPU=False)
    else:
        rnn = hf.RNNet(**model_args)

    if run_args is None:
        rnn.run_batches(inputs, targets,
                        optimizer=hf.opt.HessianFree(CG_iter=100),
                        test=test, max_epochs=30, plotting=plots)
    else:
        CG_iter = run_args.pop("CG_iter", 100)
        init_damping = run_args.pop("init_damping", 1)
        rnn.run_batches(inputs, targets,
                        optimizer=hf.opt.HessianFree(CG_iter, init_damping),
                        test=test, plotting=plots, **run_args)

    # using gradient descent (for comparison)
    # rnn.run_batches(inputs, targets, optimizer=SGD(l_rate=0.1),
    #                 batch_size=None, test=test, max_epochs=10000,
    #                 plotting=True)

    if plots:
        plt.figure()
        plt.plot(inputs.squeeze().T)
        plt.title("inputs")

        plt.figure()
        plt.plot(targets.squeeze().T)
        plt.title("targets")

        outputs = rnn.forward(inputs, rnn.W)[-1]
        plt.figure()
        plt.plot(outputs.squeeze().T)
        plt.title("outputs")

        plt.show()

def integrator(model_args=None, run_args=None, n_inputs=15, sig_len=10,
               plots=True):
    """A recurrent network implementing an integrator.

    :param dict model_args: kwargs that will be passed to the
        :class:`.RNNet` constructor
    :param dict run_args: kwargs that will be passed to :meth:`.run_epochs`
    :param int n_inputs: size of batch to train on
    :param int sig_len: number of timesteps to run for
    :param bool plots: display plots of trained output
    """

    inputs = np.outer(np.linspace(0.1, 0.9, n_inputs),
                      np.ones(sig_len))[:, :, None]
    targets = np.outer(np.linspace(0.1, 0.9, n_inputs),
                       np.linspace(0, 1, sig_len))[:, :, None]

    inputs = inputs.astype(np.float32)
    targets = targets.astype(np.float32)

    test = (inputs, targets)

    if model_args is None:
        rnn = hf.RNNet(shape=[1, 10, 1], layers=hf.nl.Logistic(),
                       debug=False, use_GPU=False)
    else:
        rnn = hf.RNNet(**model_args)

    if run_args is None:
        rnn.run_epochs(inputs, targets,
                       optimizer=hf.opt.HessianFree(CG_iter=100),
                       test=test, max_epochs=30, plotting=plots)
    else:
        CG_iter = run_args.pop("CG_iter", 100)
        init_damping = run_args.pop("init_damping", 1)
        rnn.run_epochs(inputs, targets,
                       optimizer=hf.opt.HessianFree(CG_iter, init_damping),
                       test=test, plotting=plots, **run_args)

    if plots:
        plt.figure()
        plt.plot(inputs.squeeze().T)
        plt.title("inputs")

        plt.figure()
        plt.plot(targets.squeeze().T)
        plt.title("targets")

        outputs = rnn.forward(inputs)[-1]
        plt.figure()
        plt.plot(outputs.squeeze().T)
        plt.title("outputs")

        plt.show()

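# Example usage (a sketch; the kwargs shown are illustrative values only):
#
#   integrator(model_args={"shape": [1, 15, 1], "use_GPU": False},
#              run_args={"max_epochs": 50, "CG_iter": 50}, plots=False)
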
def test_plant(use_GPU):
    n_inputs = 32
    sig_len = 15

    class Plant(hf.nl.Plant):
        # this plant implements a simple dynamic system, with two-dimensional
        # state representing [position, velocity]

        def __init__(self, A, B, targets, init_state):
            super(Plant, self).__init__(stateful=True)

            self.A = np.asarray(A)
            self.B = B

            self.targets = targets
            self.init_state = init_state

            self.shape = [n_inputs, sig_len, len(A)]

            # derivative of output with respect to state (constant, so just
            # compute it once here)
            self.d_output = np.resize(np.eye(self.shape[-1]),
                                      (n_inputs, self.shape[-1],
                                       self.shape[-1], 1))

            self.reset()

        def activation(self, x):
            self.act_count += 1

            # this implements a basic s_{t+1} = A*s_t + B*x dynamic system.
            # but to make things a little more complicated we allow the B
            # matrix to be dynamic, so it's actually
            # s_{t+1} = A*s_t + B(s_t)*x

            self.B_matrix, self.d_B_matrix = self.B(self.state)

            self.state = (np.dot(self.state, self.A) +
                          np.einsum("ij,ijk->ik", x, self.B_matrix))

            return self.state[:x.shape[0]]

        def d_activation(self, x, _):
            self.d_act_count += 1
            assert self.act_count == self.d_act_count

            # derivative of state with respect to input
            d_input = self.B_matrix.transpose((0, 2, 1))[..., None]

            # derivative of state with respect to previous state
            d_state = np.resize(self.A.T, np.concatenate(([x.shape[0]],
                                                          self.A.shape)))
            d_state[:, 1, 0] += x[:, 1] * self.d_B_matrix[:, 1, 1]
            d_state = d_state[..., None]

            return np.concatenate((d_input, d_state, self.d_output), axis=-1)

        def __call__(self, _):
            self.inputs = np.concatenate((self.inputs,
                                          self.state[:, None, :]), axis=1)
            return self.state

        def get_inputs(self):
            return self.inputs

        def get_targets(self):
            return self.targets

        def reset(self, init=None):
            self.act_count = 0
            self.d_act_count = 0
            self.state = (self.init_state.copy() if init is None else
                          init.copy())
            self.inputs = np.zeros((self.shape[0], 0, self.shape[-1]),
                                   dtype=np.float32)
            self.B_matrix = self.d_B_matrix = None

    # static A matrix (converts velocity into a change in position)
    A = [[1, 0],
         [0.2, 1]]

    # dynamic B(s) matrix (converts input into velocity, modulated by current
    # state)
    # note that this dynamic B matrix doesn't really make much sense, it's
    # just here to demonstrate what happens with a plant whose dynamics
    # change over time
    def B(state):
        B = np.zeros((state.shape[0], state.shape[1], state.shape[1]))
        B[:, 1, 1] = np.tanh(state[:, 0])

        d_B = np.zeros((state.shape[0], state.shape[1], state.shape[1]))
        d_B[:, 1, 1] = 1 - np.tanh(state[:, 0]) ** 2

        return B, d_B

    # initial position
    init_state = np.zeros((n_inputs, 2))
    init_state[:, 0] = np.linspace(-1, 1, n_inputs)

    # the target will be to end at position 1 with velocity 0
    targets = np.ones((n_inputs, sig_len, 2), dtype=np.float32)
    targets[:, :, 1] = 0
    targets[:, :-1, :] = np.nan

    plant = Plant(A, B, targets, init_state)

    rnn = hf.RNNet(shape=[2, 16, 2], layers=[Linear(), Tanh(), plant],
                   W_init_params={"coeff": 0.1}, W_rec_params={"coeff": 0.1},
                   use_GPU=use_GPU, rng=np.random.RandomState(0),
                   debug=False)

    rnn.run_batches(plant, None, HessianFree(CG_iter=20, init_damping=10),
                    max_epochs=150, plotting=True, print_period=None)

    outputs = rnn.forward(plant, rnn.W)

    try:
        assert rnn.loss.batch_loss(outputs, targets) < 1e-2
    except AssertionError:
        plt.figure()
        plt.plot(outputs[-1][:, :, 0].squeeze().T)
        plt.plot(outputs[-1][:, :, 1].squeeze().T)
        plt.title("outputs")
        plt.savefig("test_plant_outputs.png")
        raise