import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams


# Class name assumed (a fully connected sigmoid layer); the original listing
# shows only the __init__ body. init_weights, init_bias and init_gradws are
# helpers defined elsewhere in the project (a minimal sketch follows below).
class HiddenLayer(object):
    def __init__(self, shape):
        self.in_size, self.out_size = shape
        self.W = init_weights(shape)
        self.b = init_bias(self.out_size)
        # gradient accumulators
        self.gW = init_gradws(shape)
        self.gb = init_bias(self.out_size)

        D, X = T.matrices("D", "X")

        # forward pass: sigmoid(x W + b)
        def _active(X):
            return T.nnet.sigmoid(T.dot(X, self.W) + self.b)
        self.active = theano.function(inputs=[X], outputs=_active(X))

        # backward through the sigmoid: D is the incoming delta,
        # X is this layer's output, sigmoid'(x) = X * (1 - X)
        def _derive(D, X):
            return D * ((1 - X) * X)
        self.derive = theano.function(inputs=[D, X], outputs=_derive(D, X))

        # propagate the delta to the layer below
        def _propagate(D):
            return T.dot(D, self.W.T)
        self.propagate = theano.function(inputs=[D], outputs=_propagate(D))

        # accumulate gradients for one (x, dy) pair
        x, dy = T.rows("x", "dy")
        updates_grad = [(self.gW, self.gW + T.dot(x.T, dy)),
                        (self.gb, self.gb + dy)]
        self.grad = theano.function(inputs=[x, dy], updates=updates_grad)

        # reset the accumulators between batches
        updates_clear = [(self.gW, self.gW * 0),
                         (self.gb, self.gb * 0)]
        self.clear_grad = theano.function(inputs=[], updates=updates_clear)

        # SGD step with the accumulated gradients averaged over t samples
        lr = T.scalar()
        t = T.scalar()
        updates_w = [(self.W, self.W - self.gW * lr / t),
                     (self.b, self.b - self.gb * lr / t)]
        self.update = theano.function(inputs=[lr, t], updates=updates_w)
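
# The initialisation helpers are not shown in this listing. Below is a minimal
# sketch, assuming float32 shared variables, small uniform weight init, and
# zero-initialised biases and gradient accumulators; the original project's
# initialisers may differ. Biases are created as broadcastable rows so that
# they broadcast over a batch in the forward pass and match the T.row deltas
# accumulated into them.
import numpy as np


def init_weights(shape, name=None):
    # uniform init in [-0.1, 0.1]; the scale is an assumption
    w = np.random.uniform(-0.1, 0.1, size=shape)
    return theano.shared(w.astype(theano.config.floatX), name=name)


def init_bias(size, name=None):
    # row-shaped (1, size) with a broadcastable first dimension
    b = np.zeros((1, size), dtype=theano.config.floatX)
    return theano.shared(b, name=name, broadcastable=(True, False))


def init_gradws(shape, name=None):
    # gradient accumulator, starts at zero
    g = np.zeros(shape, dtype=theano.config.floatX)
    return theano.shared(g, name=name)

# Example round trip with the layer above (shapes illustrative):
#   layer = HiddenLayer((3, 4))
#   y = layer.active(np.ones((1, 3), dtype=theano.config.floatX))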
# Class name assumed from the "Softmax_" prefix; the original listing shows
# only the __init__ body. Unlike HiddenLayer, the symbolic input X is passed
# in by the caller.
class SoftmaxLayer(object):
    def __init__(self, shape, X):
        prefix = "Softmax_"
        self.in_size, self.out_size = shape
        self.W = init_weights(shape, prefix + "W")
        self.b = init_bias(self.out_size, prefix + "b")
        # gradient accumulators
        self.gW = init_gradws(shape, prefix + "gW")
        self.gb = init_bias(self.out_size, prefix + "gb")

        D = T.matrix("D")
        self.X = X

        # forward pass: softmax(x W + b)
        def _active(X):
            return T.nnet.softmax(T.dot(X, self.W) + self.b)
        self.active = theano.function(inputs=[self.X], outputs=_active(self.X))

        # propagate the delta to the layer below
        def _propagate(D):
            return T.dot(D, self.W.T)
        self.propagate = theano.function(inputs=[D], outputs=_propagate(D))

        # accumulate gradients for one (x, dy) pair
        x, dy = T.rows("x", "dy")
        updates_grad = [(self.gW, self.gW + T.dot(x.T, dy)),
                        (self.gb, self.gb + dy)]
        self.grad = theano.function(inputs=[x, dy], updates=updates_grad)

        # reset the accumulators between batches
        updates_clear = [(self.gW, self.gW * 0),
                         (self.gb, self.gb * 0)]
        self.clear_grad = theano.function(inputs=[], updates=updates_clear)

        # SGD step with the accumulated gradients averaged over t samples
        lr = T.scalar()
        t = T.scalar()
        updates_w = [(self.W, self.W - self.gW * lr / t),
                     (self.b, self.b - self.gb * lr / t)]
        self.update = theano.function(inputs=[lr, t], updates=updates_w)

        self.params = [self.W, self.b]
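
# A minimal usage sketch for the softmax layer: one manual SGD step using the
# compiled functions above. The SoftmaxLayer name and all shapes are the
# assumptions made in this listing; the delta y - target is the standard
# softmax/cross-entropy gradient with respect to the pre-softmax output.
if __name__ == "__main__":
    X_in = T.matrix("X")
    sm = SoftmaxLayer((4, 3), X_in)                       # 4 inputs, 3 classes
    x0 = np.ones((1, 4), dtype=theano.config.floatX)
    y0 = sm.active(x0)                                    # (1, 3) probabilities
    target = np.array([[0, 1, 0]], dtype=theano.config.floatX)
    sm.grad(x0, y0 - target)                              # accumulate dL/dW, dL/db
    sm.update(0.1, 1.0)                                   # lr = 0.1, averaged over t = 1
    sm.clear_grad()                                       # ready for the next batch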
# Class name assumed from the "GRU_" prefix; the original listing shows only
# the __init__ body. layer is a string id appended to parameter names.
class GRULayer(object):
    def __init__(self, rng, layer, shape, X, is_train=1, batch_size=1, p=0.5):
        prefix = "GRU_"
        self.in_size, self.out_size = shape

        # gate and candidate parameters
        self.W_xr = init_weights((self.in_size, self.out_size), prefix + "W_xr" + "_" + layer)
        self.W_hr = init_weights((self.out_size, self.out_size), prefix + "W_hr" + "_" + layer)
        self.b_r = init_bias(self.out_size, prefix + "b_r" + "_" + layer)
        self.W_xz = init_weights((self.in_size, self.out_size), prefix + "W_xz" + "_" + layer)
        self.W_hz = init_weights((self.out_size, self.out_size), prefix + "W_hz" + "_" + layer)
        self.b_z = init_bias(self.out_size, prefix + "b_z" + "_" + layer)
        self.W_xh = init_weights((self.in_size, self.out_size), prefix + "W_xh" + "_" + layer)
        self.W_hh = init_weights((self.out_size, self.out_size), prefix + "W_hh" + "_" + layer)
        self.b_h = init_bias(self.out_size, prefix + "b_h" + "_" + layer)

        # gradient accumulators
        self.gW_xr = init_gradws((self.in_size, self.out_size), prefix + "gW_xr" + "_" + layer)
        self.gW_hr = init_gradws((self.out_size, self.out_size), prefix + "gW_hr" + "_" + layer)
        self.gb_r = init_bias(self.out_size, prefix + "gb_r" + "_" + layer)
        self.gW_xz = init_gradws((self.in_size, self.out_size), prefix + "gW_xz" + "_" + layer)
        self.gW_hz = init_gradws((self.out_size, self.out_size), prefix + "gW_hz" + "_" + layer)
        self.gb_z = init_bias(self.out_size, prefix + "gb_z" + "_" + layer)
        self.gW_xh = init_gradws((self.in_size, self.out_size), prefix + "gW_xh" + "_" + layer)
        self.gW_hh = init_gradws((self.out_size, self.out_size), prefix + "gW_hh" + "_" + layer)
        self.gb_h = init_bias(self.out_size, prefix + "gb_h" + "_" + layer)

        # one GRU step: reset gate r, update gate z, candidate gh, new state h
        def _active(x, pre_h):
            r = T.nnet.sigmoid(T.dot(x, self.W_xr) + T.dot(pre_h, self.W_hr) + self.b_r)
            z = T.nnet.sigmoid(T.dot(x, self.W_xz) + T.dot(pre_h, self.W_hz) + self.b_z)
            gh = T.tanh(T.dot(x, self.W_xh) + T.dot(r * pre_h, self.W_hh) + self.b_h)
            h = z * pre_h + (1 - z) * gh
            return r, z, gh, h

        self.X = X
        H = T.matrix("H")
        [r, z, gh, h], updates = theano.scan(_active, sequences=[self.X],
                                             outputs_info=[None, None, None, H])
        self.active = theano.function(inputs=[self.X, H], outputs=[r, z, gh, h])

        h = T.reshape(h, (self.X.shape[0], self.out_size))

        # dropout on the hidden states; at test time scale by the keep rate
        if p > 0:
            srng = RandomStreams(rng.randint(999999))
            mask = srng.binomial(n=1, p=1 - p, size=h.shape,
                                 dtype=theano.config.floatX)
            self.activation = T.switch(T.eq(is_train, 1), h * mask, h * (1 - p))
        else:
            self.activation = h

        # backward pass for one timestep (TODO in the original: fold into a scan).
        # post_* are the deltas/gates of the following timestep.
        def _derive(prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz):
            # delta reaching h_t: from the layer above (prop), from the next
            # step's gate pre-activations, and the direct carry post_dh * z
            dh = prop + T.dot(post_dr, self.W_hr.T) + T.dot(post_dz, self.W_hz.T) \
                 + T.dot(post_dgh, self.W_hh.T) * post_r + post_dh * z
            dgh = dh * (1 - z) * (1 - gh ** 2)                      # through tanh
            dr = T.dot(dgh, self.W_hh.T) * pre_h * ((1 - r) * r)    # through sigmoid
            dz = (dh * (pre_h - gh)) * ((1 - z) * z)                # h = z*pre_h + (1-z)*gh
            return dh, dgh, dr, dz
        prop, r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz, post_r = \
            T.matrices("prop", "r", "z", "gh", "pre_h", "post_dh",
                       "post_dgh", "post_dr", "post_dz", "post_r")
        self.derive = theano.function(
            inputs=[prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz],
            outputs=_derive(prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz))

        # accumulate gradients for one timestep
        x, dz, dr, dgh = T.rows("x", "dz", "dr", "dgh")
        updates_grad = [(self.gW_xr, self.gW_xr + T.dot(x.T, dr)),
                        (self.gW_xz, self.gW_xz + T.dot(x.T, dz)),
                        (self.gW_xh, self.gW_xh + T.dot(x.T, dgh)),
                        (self.gW_hr, self.gW_hr + T.dot(pre_h.T, dr)),
                        (self.gW_hz, self.gW_hz + T.dot(pre_h.T, dz)),
                        (self.gW_hh, self.gW_hh + T.dot((r * pre_h).T, dgh)),
                        (self.gb_r, self.gb_r + dr),
                        (self.gb_z, self.gb_z + dz),
                        (self.gb_h, self.gb_h + dgh)]
        self.grad = theano.function(inputs=[x, r, pre_h, dz, dr, dgh],
                                    updates=updates_grad)

        # reset the accumulators between sequences
        updates_clear = [(self.gW_xr, self.gW_xr * 0), (self.gW_xz, self.gW_xz * 0),
                         (self.gW_xh, self.gW_xh * 0), (self.gW_hr, self.gW_hr * 0),
                         (self.gW_hz, self.gW_hz * 0), (self.gW_hh, self.gW_hh * 0),
                         (self.gb_r, self.gb_r * 0), (self.gb_z, self.gb_z * 0),
                         (self.gb_h, self.gb_h * 0)]
        self.clear_grad = theano.function(inputs=[], updates=updates_clear)

        # SGD step: input-side weights average over t steps, recurrent weights
        # over tm1 = t - 1 (no recurrent gradient at the first step)
        lr = T.scalar()
        t = T.scalar()
        tm1 = T.scalar()
        updates_w = [(self.W_xr, self.W_xr - self.gW_xr * lr / t),
                     (self.W_xz, self.W_xz - self.gW_xz * lr / t),
                     (self.W_xh, self.W_xh - self.gW_xh * lr / t),
                     (self.W_hr, self.W_hr - self.gW_hr * lr / tm1),
                     (self.W_hz, self.W_hz - self.gW_hz * lr / tm1),
                     (self.W_hh, self.W_hh - self.gW_hh * lr / tm1),
                     (self.b_r, self.b_r - self.gb_r * lr / t),
                     (self.b_z, self.b_z - self.gb_z * lr / t),
                     (self.b_h, self.b_h - self.gb_h * lr / t)]
        self.update = theano.function(inputs=[lr, t, tm1], updates=updates_w)

        # propagate the gate deltas to the layer below
        DZ, DR, DGH = T.matrices("DZ", "DR", "DGH")
        def _propagate(DR, DZ, DGH):
            return (T.dot(DR, self.W_xr.T) + T.dot(DZ, self.W_xz.T)
                    + T.dot(DGH, self.W_xh.T))
        self.propagate = theano.function(inputs=[DR, DZ, DGH],
                                         outputs=_propagate(DR, DZ, DGH))

        self.params = [self.W_xr, self.W_hr, self.b_r,
                       self.W_xz, self.W_hz, self.b_z,
                       self.W_xh, self.W_hh, self.b_h]
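
# A minimal forward-pass sketch for the GRU layer. The GRULayer name, the "0"
# layer id, and all shapes are the assumptions made in this listing. scan
# iterates over the rows of X, so X is (timesteps, in_size) with batch size 1,
# matching the reshape above; the manual backward pass (derive/grad/update)
# follows the same per-step pattern as the simpler layers and is omitted here.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    X_seq = T.matrix("X")                                 # one row per timestep
    gru = GRULayer(rng, "0", (4, 5), X_seq, is_train=0, p=0.5)
    xs = np.ones((3, 4), dtype=theano.config.floatX)      # 3 timesteps
    h0 = np.zeros((1, 5), dtype=theano.config.floatX)     # initial hidden state
    r, z, gh, h = gru.active(xs, h0)
    print(h.shape)                                        # (3, 1, 5): one state per step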