def __init__(self, l, f=lambda x: x):
    self.n = 0
    self.mu = 0
    self.m2 = 0
    self.sd = 0
    self.lo = Vars().lo
    self.hi = Vars().hi
    self.some = Sample()
    self.w = 1
    for i in l:
        self.numInc(f(i))

class Sequential(ParametrizedBlock):
    """Chains several layers one on top of the other."""

    def __init__(self, layers):
        self.layers = layers

        params = {}
        grads = {}
        for i, layer in enumerate(layers):
            if isinstance(layer, ParametrizedBlock):
                for param_name in layer.params:
                    key = "%.2d__%s" % (i, param_name, )
                    params[key] = layer.params[param_name]
                    grads[key] = layer.grads[param_name]

        self.parametrize(Vars(**params), Vars(**grads))

    def forward(self, (x, )):
        yaux = []
        last_y = x
        for layer in self.layers:
            ((y, ), y_aux) = layer.forward((last_y, ))
            yaux.append(y_aux)
            last_y = y

        aux = Vars(yaux=yaux)

        return ((last_y, ), aux)

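
# Usage sketch (not part of the original source): chaining the LinearLayer and
# Tanh blocks defined below into a small feed-forward model. Assumes numpy is
# imported as np and that the repo's Normal initializer and ParametrizedBlock
# base class are available, as elsewhere in this module.
def _example_sequential():
    model = Sequential([
        LinearLayer(4, 8),
        Tanh(),
        LinearLayer(8, 2),
    ])
    x = np.random.randn(5, 4)            # batch of 5 examples, 4 features each
    ((y, ), aux) = model.forward((x, ))  # y has shape (5, 2)
    return y
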
class Tanh(Block):
    @classmethod
    def forward(self, (x, )):
        y = np.tanh(x)
        aux = Vars(y=y)
        return ((y, ), aux)

def parametrize_from_layers(self, layers, layer_names):
    params = {}
    grads = {}
    for layer_name, layer in zip(layer_names, layers):
        if isinstance(layer, ParametrizedBlock):
            for param_name in layer.params:
                key = "%s__%s" % (layer_name, param_name, )
                params[key] = layer.params[param_name]
                grads[key] = layer.grads[param_name]
        else:
            assert False, "Layer is not a ParametrizedBlock. Perhaps an error?"

    self.parametrize(Vars(**params), Vars(**grads))

class Embeddings(ParametrizedBlock):
    """Embedding layer.

    Takes a tensor of integers as input and returns a tensor one order
    greater, with the last dimension being n_dims, where the integer ids
    are mapped through the parameter matrix W to their embeddings."""

    def __init__(self, n_tokens, n_dims, init_fn=Normal()):
        self.n_tokens = n_tokens
        self.n_dims = n_dims

        W = init_fn((n_tokens, n_dims))
        params = Vars(W=W)

        dW = np.zeros_like(W)
        grads = Vars(W=dW)

        self.parametrize(params, grads)

    def size(self):
        return self.n_dims

    def forward(self, (x, )):
        """Map input indices to embedding vectors."""
        W = self.params['W']

        assert x.ndim == 1, 'Cannot embed non-vector arrays.'

        y = W[x]

        aux = Vars(x=x)

        return ((y, ), aux, )

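
# Usage sketch (illustrative, not in the original source): mapping a vector of
# token ids to their embedding vectors. Assumes numpy is imported as np and
# the repo's Normal initializer is available.
def _example_embeddings():
    emb = Embeddings(n_tokens=100, n_dims=16)
    ids = np.array([3, 7, 7, 42])
    ((vectors, ), aux) = emb.forward((ids, ))  # vectors has shape (4, 16)
    return vectors
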
class Sigmoid(Block):
    @classmethod
    def forward(self, (x, )):
        # Numerically stable sigmoid: sigma(x) = 0.5 * (1 + tanh(x / 2)).
        y = 0.5 * (1 + np.tanh(0.5 * x))
        aux = Vars(y=y)
        return ((y, ), aux)

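
# The formula above uses the identity
#     sigma(x) = 1 / (1 + exp(-x)) = 0.5 * (1 + tanh(x / 2)),
# which avoids overflow in exp for large negative x. A quick numerical check
# of that identity (illustrative, not original code):
def _check_sigmoid_identity():
    x = np.linspace(-10, 10, 101)
    direct = 1.0 / (1.0 + np.exp(-x))
    ((via_tanh, ), _) = Sigmoid.forward((x, ))
    assert np.allclose(direct, via_tanh)
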
class Switch(Block):
    @classmethod
    def forward(self, (p1, in1, in2)):
        res = p1 * in1 + (1 - p1) * in2

        aux = Vars(p1=p1, in1=in1, in2=in2)

        return ((res, ), aux)

class Dot(ParametrizedBlock):
    """Dot product."""

    @classmethod
    def forward(self, (A, B)):
        if A.ndim == 1:
            A = A[np.newaxis, :]
        if B.ndim == 1:
            B = B[:, np.newaxis]

        aux = Vars(A=A, B=B)

        return ((np.dot(A, B), ), aux)

class Softmax(Block):
    """Compute softmax of the input."""

    @classmethod
    def forward(self, (x, )):
        # Subtract the max along the last axis for numerical stability.
        xmax = x.max(axis=x.ndim - 1, keepdims=True)
        res = np.exp(x - xmax)
        # Normalize along the last axis; ndx re-adds the summed-out axis so
        # the division broadcasts.
        ndx = ((slice(None), ) * (len(x.shape) - 1)) + (None, )
        res = res / np.sum(res, axis=len(x.shape) - 1)[ndx]
        aux = Vars(y=res)
        return ((res, ), aux, )

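
# Usage sketch (illustrative): softmax is taken along the last axis, so every
# row of the result is a probability distribution summing to one.
def _example_softmax():
    x = np.random.randn(3, 5)
    ((p, ), aux) = Softmax.forward((x, ))
    assert np.allclose(p.sum(axis=-1), 1.0)
    return p
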
class LinearLayer(ParametrizedBlock):
    """Affine transformation."""

    def __init__(self, n_in, n_out, init_w=Normal(), init_b=Normal()):
        W = init_w((n_in, n_out))
        b = init_b((n_out, ))
        params = Vars(W=W, b=b)

        dW = np.zeros_like(W)
        db = np.zeros_like(b)
        grads = Vars(W=dW, b=db)

        self.parametrize(params, grads)

    def forward(self, (x, )):
        W = self.params['W']
        b = self.params['b']

        y = np.dot(x, W) + b

        aux = Vars(y=y, x=x)

        return ((y, ), aux)

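
# Usage sketch (illustrative): y = x.W + b maps a (batch, n_in) input to a
# (batch, n_out) output; the parameters and gradients registered through
# parametrize() are then reachable as layer.params and layer.grads.
def _example_linear_layer():
    layer = LinearLayer(n_in=4, n_out=3)
    x = np.random.randn(5, 4)
    ((y, ), aux) = layer.forward((x, ))  # y has shape (5, 3)
    return y
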
class Attention(ParametrizedBlock):
    def __init__(self, n_hidden):
        self.n_hid = n_hidden
        Wy = np.random.randn(n_hidden, n_hidden)
        Wh = np.random.randn(n_hidden, n_hidden)
        w = np.random.randn(n_hidden)

        params = Vars(Wh=Wh, Wy=Wy, w=w)
        grads = Vars(Wh=np.zeros_like(Wh),
                     Wy=np.zeros_like(Wy),
                     w=np.zeros_like(w))

        self.parametrize(params, grads)

    @timeit
    def forward(self, (h_out, g_t, emb_in)):
        Wy = self.params['Wy']
        Wh = self.params['Wh']
        w = self.params['w']

        Y = h_out
        n_inputs = len(h_out)

        # Project the attended states and the current state into a common space.
        ((Wy_apply, ), Wy_aux) = Dot.forward((h_out, Wy, ))
        ((Wh_apply, ), Wh_aux) = Dot.forward((g_t, Wh))

        # Broadcast the projected current state over all inputs and combine.
        Wh_dot_g_t_rep = np.repeat(Wh_apply, n_inputs, axis=0)
        Mx = Wy_apply + Wh_dot_g_t_rep
        ((M, ), M_aux) = Tanh.forward((Mx, ))

        # Score each input and normalize the scores into attention weights.
        ((Mw, ), Mw_aux) = Dot.forward((M, w))
        MwT = Mw.T
        ((alpha, ), alpha_aux) = Softmax.forward((MwT, ))
        alphaT = alpha.T

        # Attention-weighted sum of the input embeddings.
        query = (emb_in * alphaT).sum(axis=0)

        aux = Vars(h_out=h_out, g_t=g_t, emb_in=emb_in,
                   Wy_dot_h_out=Wy_apply, Wh_dot_g_t=Wh_apply,
                   Wh_aux=Wh_aux, Wy_aux=Wy_aux,
                   M=M, M_aux=M_aux, Mw_aux=Mw_aux, Mx=Mx,
                   MwT=MwT, alpha=alpha, alpha_aux=alpha_aux)

        return ((query, ), aux)

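
# Usage sketch (illustrative; the shapes are an assumption read off the
# forward pass above): attend over n_inputs states h_out given the current
# state g_t, and return the attention-weighted sum of emb_in.
def _example_attention():
    n_inputs, n_hidden = 7, 16
    att = Attention(n_hidden)
    h_out = np.random.randn(n_inputs, n_hidden)   # states to attend over
    g_t = np.random.randn(1, n_hidden)            # current (query) state
    emb_in = np.random.randn(n_inputs, n_hidden)  # vectors to be averaged
    ((query, ), aux) = att.forward((h_out, g_t, emb_in))  # query: (n_hidden, )
    return query
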
class LSTM(ParametrizedBlock):
    def __init__(self, n_in, n_out):
        self.n_cells = n_out

        WLSTM = np.random.randn(n_in + n_out + 1,
                                4 * n_out) / np.sqrt(n_in + n_out)
        WLSTM[0, :] = 0  # initialize biases to zero
        WLSTM[0, n_out:2 * n_out] = 3  # positive initial forget-gate bias

        params = Vars(WLSTM=WLSTM)
        grads = Vars(WLSTM=np.zeros_like(WLSTM))

        self.parametrize(params, grads)

    def get_init(self):
        return (np.zeros((self.n_cells, )), np.zeros((self.n_cells, )))

    def get_init_grad(self):
        return (np.zeros((self.n_cells, )), np.zeros((self.n_cells, )))

    @timeit
    def forward(self, (x, h0, c0)):
        """
        x should be of shape (t, b, input_size), where t = length of sequence,
        b = batch size.
        """
        WLSTM = self.params['WLSTM']

        n, b, input_size = x.shape
        d = WLSTM.shape[1] / 4  # hidden size
        #if c0 is None: c0 = np.zeros((b,d))
        #if h0 is None: h0 = np.zeros((b,d))

        # Perform the LSTM forward pass with x as the input.
        xphpb = WLSTM.shape[0]  # x plus h plus bias
        Hin = np.zeros((n, b, xphpb))    # input [1, xt, ht-1] to each tick of the LSTM
        Hout = np.zeros((n, b, d))       # hidden representation of the LSTM (gated cell content)
        IFOG = np.zeros((n, b, d * 4))   # input, forget, output, gate (IFOG)
        IFOGf = np.zeros((n, b, d * 4))  # after nonlinearity
        C = np.zeros((n, b, d))          # cell content
        Ct = np.zeros((n, b, d))         # tanh of cell content
        for t in xrange(n):
            # concat [x, h] as input to the LSTM
            prevh = Hout[t - 1] if t > 0 else h0
            Hin[t, :, 0] = 1  # bias
            Hin[t, :, 1:input_size + 1] = x[t]
            Hin[t, :, input_size + 1:] = prevh
            # compute all gate activations (most work is this line)
            IFOG[t] = Hin[t].dot(WLSTM)
            # non-linearities
            IFOGf[t, :, :3 * d] = 1.0 / (1.0 + np.exp(-IFOG[t, :, :3 * d]))  # sigmoids; these are the gates
            IFOGf[t, :, 3 * d:] = np.tanh(IFOG[t, :, 3 * d:])  # tanh
            # compute the cell activation
            prevc = C[t - 1] if t > 0 else c0
            C[t] = IFOGf[t, :, :d] * IFOGf[t, :, 3 * d:] + IFOGf[t, :, d:2 * d] * prevc
            Ct[t] = np.tanh(C[t])
            Hout[t] = IFOGf[t, :, 2 * d:3 * d] * Ct[t]

        cache = {}
        cache['WLSTM'] = WLSTM
        cache['Hout'] = Hout
        cache['IFOGf'] = IFOGf
        cache['IFOG'] = IFOG
        cache['C'] = C
        cache['Ct'] = Ct
        cache['Hin'] = Hin
        cache['c0'] = c0
        cache['h0'] = h0

        aux = Vars(**cache)

        return ((Hout, C), aux)

    # TODO: Do proper gradient backward for C.

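
# Usage sketch (illustrative): running the LSTM over a short sequence. The
# (n_cells, ) initial states from get_init() broadcast against the batch
# dimension inside forward().
def _example_lstm():
    n_in, n_out = 10, 8
    lstm = LSTM(n_in, n_out)
    x = np.random.randn(5, 1, n_in)  # (t, b, input_size) with t=5, b=1
    h0, c0 = lstm.get_init()         # zero initial hidden and cell states
    ((Hout, C), aux) = lstm.forward((x, h0, c0))  # Hout has shape (5, 1, 8)
    return Hout
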