def grad_y(self, Y):
    # Gradient of each sub-nonlinearity wrt its own output block, assembled
    # block-wise into a single array.
    ANS = g.zeros((len(Y), self.input_size))
    for (f, (ir0, ir1), (or0, or1)) in zip(self.fns, self.input_ranges,
                                           self.output_ranges):
        ANS[:, ir0:ir1] = f.grad_y(Y[:, ir0:ir1])
    return ANS
def __init__(self, *args):
    # args is a flat sequence of (fn, (input_size, output_size), weight)
    # triples, one per sub-nonlinearity.
    assert len(args) % 3 == 0
    fns, sizes, weights = args[::3], args[1::3], args[2::3]

    from ersatz.mrnn.pylab import array
    input_sizes = array([x for (x, y) in sizes])
    output_sizes = array([y for (x, y) in sizes])

    self.fns = fns
    self.input_sizes = input_sizes
    self.output_sizes = output_sizes
    self.weights = array(weights)

    self.input_ranges = []
    ptr = 0
    for s in input_sizes:
        a, b = ptr, ptr + s
        self.input_ranges.append((a, b))
        ptr = b
    assert ptr == input_sizes.sum()

    self.output_ranges = []
    ptr = 0
    for s in output_sizes:
        a, b = ptr, ptr + s
        self.output_ranges.append((a, b))
        ptr = b
    assert ptr == output_sizes.sum()

    self.input_size = input_sizes.sum()
    self.output_size = output_sizes.sum()
    self.loss_acc = g.zeros(len(fns))
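# A minimal usage sketch for the constructor above. The enclosing class name
# (CompositeNonlin here) is an assumption; only the (fn, (in_size, out_size),
# weight) triple layout is taken from the code itself.
#
#   composite = CompositeNonlin(nonlin.Tanh, (10, 10), 1.0,
#                               nonlin.Lin,  (5, 5),   0.5)
#   composite.input_size    # 15
#   composite.input_ranges  # [(0, 10), (10, 15)]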
def __init__(
        self,
        v, h, f, o,                        # the sizes
        hid_nonlin,                        # the hidden nonlinearity
        out_nonlin,                        # the output nonlinearity
        struct_damp_nonlin=nonlin.Tanh,    # the structural damping nonlinearity
        init=True,
        number_of_timesteps_to_use=99999999):
    """
    v                   int       number of features
    h                   int       number of hidden units
    f                   int       number of factored units
    o                   int       number of output units
    hid_nonlin          function  hidden layer nonlinearity
    out_nonlin          function  output nonlinearity
    struct_damp_nonlin  function  structural damping nonlinearity
    init                bool      whether to initialize arrays to random data
    """
    self.v = v
    self.h = h
    self.f = f
    self.o = o
    self.hid_nonlin = hid_nonlin if hid_nonlin is not None else nonlin.Tanh
    self.out_nonlin = out_nonlin if out_nonlin is not None else nonlin.Lin
    self.struct_damp_nonlin = struct_damp_nonlin
    self.number_of_timesteps_to_use = number_of_timesteps_to_use

    if init:
        self.h_init = g.randn(1, h)
        self.W_hf = g.randn(h, f)
        self.W_fh = g.randn(f, h)
        # self.W_hh = g.randn(h, h)   # replaced by the factored W_hf/W_fh pair
        self.f_bias = g.zeros((1, f))
        self.W_vh = g.randn(v, h)
        self.W_vf = g.randn(v, f)
        self.W_ho = g.randn(h, o)
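# The recurrence these weights implement, reconstructed from backward_pass
# further below (the actual forward_pass method is not shown here, so treat
# this as an inference rather than the source's code):
#
#   A_t  = H_{t-1} @ W_hf                      # factors driven by the previous hidden state
#   B_t  = tanh(V_t @ W_vf)                    # factors driven by the current input
#   HX_t = (A_t * (B_t + f_bias)) @ W_fh + V_t @ W_vh
#   H_t  = hid_nonlin(HX_t)
#   OX_t = H_t @ W_ho
#
# i.e. the usual dense W_hh recurrence is replaced by the input-gated factored
# product W_hf @ diag(B_t + f_bias) @ W_fh, which is the MRNN construction.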
def __call__(self, X):
    # X packs the two parameter blocks side by side: X = [b | a_].
    _, y = X.shape
    assert y % 2 == 0
    half = y // 2
    b = X[:, :half]
    a_ = X[:, half:]
    a = self.f(a_)
    ans = g.zeros(X.shape)
    ans[:, :half] = b / a
    ans[:, half:] = a
    return ans
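# What the two halves appear to mean (an inference from this method and grad
# below, not stated explicitly in the source): a = f(a_) plays the role of a
# precision and m = b / a of a mean, so each output unit carries the pair
# (mean, precision) of a Gaussian whose negative log-likelihood is
#
#   0.5 * a * (O - m)**2 - 0.5 * log(a) + const.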
def grad(self, XP, O, M):
    _, y = XP.shape
    assert y % 2 == 0
    half = y // 2
    b = XP[:, :half]
    a_ = XP[:, half:]
    a = self.f(a_)
    a_d = self.f_prime_y(a)
    G = g.zeros(XP.shape)
    m = b / a
    # Gradient wrt b: the (prediction - target) residual.
    G[:, :half] = -(O - m)
    # Gradient wrt a_: difference between the observed and expected second
    # moment, chained through f.
    s = 1. / a
    E_OO = s + m * m
    G[:, half:] = (O * O - E_OO) * 0.5 * a_d
    return G * M
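# A quick NumPy finite-difference check of the formulas above, assuming
# f = exp as the positivity map (the source's self.f is not shown, so that
# choice is an assumption of this sketch, not the source's).
import numpy as np

def nll(b, a_, O):
    # Negative log-likelihood of O under a Gaussian with precision a = exp(a_)
    # and mean m = b / a, dropping the constant term.
    a = np.exp(a_)
    m = b / a
    return 0.5 * a * (O - m) ** 2 - 0.5 * np.log(a)

b, a_, O, eps = 0.7, -0.3, 1.2, 1e-6
a = np.exp(a_)
m = b / a
s = 1.0 / a
grad_b = -(O - m)
grad_a_ = (O * O - (s + m * m)) * 0.5 * a      # a_d = exp(a_) = a when f = exp
fd_b = (nll(b + eps, a_, O) - nll(b - eps, a_, O)) / (2 * eps)
fd_a_ = (nll(b, a_ + eps, O) - nll(b, a_ - eps, O)) / (2 * eps)
assert np.allclose([grad_b, grad_a_], [fd_b, fd_a_], atol=1e-5)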
def __call__(self, x):
    """
    Return batch number x.

    x: int (negative values select the test set)
    return: lists of V, O, M arrays, one entry per timestep
    """
    if x < 0:
        mode = 'test'
    else:
        mode = 'train'
    batch_id = x
    batch = self.create_batch(mode, batch_id)
    T = batch.shape[0]
    Vs = []
    Os = []
    Ms = []
    for t, timestep in enumerate(batch):
        num_features = timestep.shape[1]
        # Drop the all-NaN padding rows, keeping only the samples that still
        # have data at this timestep.
        timestep = timestep[~np.isnan(timestep)].reshape((-1, num_features))
        if timestep.size == 0:
            # If the timestep contains only NaNs, every sample in this batch
            # has already ended; stop processing further timesteps.
            break
        batch_size = timestep.shape[0]
        v = timestep[:, :-self.O]
        o = timestep[:, -self.O:]
        if mode == 'train' and t < T - self.true_T:
            m = g.zeros((batch_size, 1))
        else:
            m = g.ones((batch_size, 1))
        Vs.append(g.garray(v))
        Os.append(g.garray(o))
        Ms.append(m)
    return Vs, Os, Ms
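# A minimal illustration of the NaN filtering used above: sequences shorter
# than the longest one in the batch are padded with all-NaN rows, and the
# mask-and-reshape keeps exactly the rows that are still "alive".
import numpy as np

timestep = np.array([[1.0, 2.0, 0.5],
                     [np.nan, np.nan, np.nan],   # this sample has already ended
                     [3.0, 4.0, 1.5]])
num_features = timestep.shape[1]
alive = timestep[~np.isnan(timestep)].reshape((-1, num_features))
# alive is [[1.0, 2.0, 0.5], [3.0, 4.0, 1.5]]
assert alive.shape == (2, 3)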
def H_prod(self, X, P, M=1, H_damping=1):
    # Product of the per-unit curvature blocks with X, evaluated at P.
    _, y = P.shape
    assert y % 2 == 0
    half = y // 2
    m = P[:, :half]    # m comes first
    a = P[:, half:]
    a_d = self.f_prime_y(a)
    s = 1. / a
    R_b = X[:, :half]
    R_a_ = X[:, half:]
    A_00 = s
    A_01 = A_10 = (-s * m) * a_d
    A_11 = (s * m ** 2 + .5 * s ** 2) * a_d ** 2
    ANS = g.zeros(P.shape)
    ANS[:, :half] = R_b * A_00 + R_a_ * A_01
    ANS[:, half:] = R_b * A_10 + R_a_ * (A_11 + max(H_damping, self.min_H_damping))
    return ANS * M
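# Per output unit, H_prod above is a symmetric 2x2 (Gauss-Newton) matrix
# applied to the vector (R_b, R_a_). A small NumPy restatement of the same
# formulas for a single unit; the numeric values of m, a, a_d and the damping
# are made up for illustration:
import numpy as np

m, a, a_d = 0.9, 0.74, 0.74      # current mean, precision, and d a / d a_
R_b, R_a_ = 0.3, -0.2            # the vector being multiplied
damping = 1.0                    # stands in for max(H_damping, self.min_H_damping)
s = 1.0 / a
G = np.array([[s,            -s * m * a_d],
              [-s * m * a_d, (s * m ** 2 + 0.5 * s ** 2) * a_d ** 2 + damping]])
out = G @ np.array([R_b, R_a_])
# out[0] and out[1] correspond to ANS[:, :half] and ANS[:, half:] for that unit.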
def backward_pass(self, state, dOX, R_state=None, mu=0., compute_grad2=False):
    # Backpropagation through time.
    if R_state is None:
        R_HX, R_OX = None, None
    else:
        R_HX, R_OX = R_state

    V, A, B, H, OX = state
    # Pad the time axis so that index t runs from 1 to T.
    if V[0] is not None:
        V = [None] + V
    # if A[0] is not None:
    #     A = [None] + A
    # if B[0] is not None:
    #     B = [None] + B
    # if H[0] is not None:
    #     H = [None] + H
    if OX[0] is not None:
        OX = [None] + OX
    if dOX[0] is not None:
        dOX = [None] + dOX
    T = len(V) - 1

    grad = self.unpack(self.pack() * 0)
    if compute_grad2:
        grad2 = self.unpack(self.pack() * 0)
    else:
        grad2 = None

    dH_1t = g.zeros(H[T].shape)
    prev_batch_size = H[T].shape[0]
    for t in reversed(range(1, T + 1)):
        dH_t = g.dot(dOX[t], self.W_ho.T)
        dH_t[:prev_batch_size, :] += dH_1t

        grad.W_ho += g.dot(H[t].T, dOX[t])
        if compute_grad2:
            grad2.W_ho += g.dot((H[t] * H[t]).T, dOX[t] * dOX[t])

        ## Backpropagate through the nonlinearity: at this point dHX_t, the
        ## gradient wrt the total inputs to H_t, is correct.
        dHX_t = dH_t * self.hid_nonlin.grad_y(H[t])

        ## Add the structural damping term at this point.
        if R_HX is not None:
            dHX_t += float(mu) * self.struct_damp_nonlin.H_prod(R_HX[t], H[t], M=1)

        ## (The W_hh gradient, g.dot(H[t-1].T, dHX_t), used to be accumulated here.)
        B_t_f = (B[t] + self.f_bias)
        AB = A[t] * B_t_f
        grad.W_fh += g.dot(AB.T, dHX_t)
        grad.W_vh += g.dot(V[t].T, dHX_t)
        if compute_grad2:
            _dHX2 = dHX_t * dHX_t
            grad2.W_fh += g.dot((AB * AB).T, _dHX2)
            grad2.W_vh += g.dot((V[t] * V[t]).T, _dHX2)

        ## Backpropagate through the intermediate factored layer:
        dAB = g.dot(dHX_t, self.W_fh.T)
        dB = dAB * A[t]
        grad.f_bias += dB.sum(0)
        dBB = dB * (1 - B[t] * B[t])
        grad.W_vf += g.dot(V[t].T, dBB)

        dA = dAB * B_t_f
        Ht_minus_one = H[t - 1][:dA.shape[0], :]
        grad.W_hf += g.dot(Ht_minus_one.T, dA)
        if compute_grad2:
            grad2.f_bias += (dB * dB).sum(0)
            grad2.W_vf += g.dot((V[t] * V[t]).T, dBB * dBB)
            grad2.W_hf += g.dot((Ht_minus_one * Ht_minus_one).T, dA * dA)

        dH_1t = g.dot(dA, self.W_hf.T)
        prev_batch_size = V[t].shape[0]

    grad.h_init += dH_1t.sum(0)
    if compute_grad2:
        grad2.h_init += (dH_1t * dH_1t).sum(0)
    return grad, grad2
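# Why the grad2 terms above work: each appears to accumulate the sum over the
# batch of elementwise-squared per-example gradients. For a rank-one
# per-example gradient h_i d_i^T, summing its elementwise square over i is
# exactly (H*H)^T (D*D). A small NumPy check of that identity:
import numpy as np

rng = np.random.default_rng(0)
H = rng.standard_normal((5, 3))   # per-example hidden rows
D = rng.standard_normal((5, 2))   # per-example output-gradient rows
explicit = sum(np.outer(H[i], D[i]) ** 2 for i in range(5))
vectorised = (H * H).T @ (D * D)
assert np.allclose(explicit, vectorised)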