Example #1
    # Assumes: import theano; import theano.tensor as T
    def __init__(self, shape):
        self.in_size, self.out_size = shape

        self.W = init_weights(shape)
        self.b = init_bias(self.out_size)

        self.gW = init_gradws(shape)
        self.gb = init_bias(self.out_size)

        D, X = T.matrices("D", "X")
        def _active(X):
            return T.nnet.sigmoid(T.dot(X, self.W) + self.b)
        self.active = theano.function(inputs = [X], outputs = _active(X))
        
        def _derive(D, X):
            # X here is the layer's own sigmoid output, so (1 - X) * X is sigmoid'
            return D * ((1 - X) * X)
        self.derive = theano.function(
            inputs = [D, X],
            outputs = _derive(D, X)
        )

        def _propagate(D):
            return T.dot(D, self.W.T)
        self.propagate = theano.function(inputs = [D], outputs = _propagate(D))

        x, dy = T.rows("x","dy")
        updates_grad = [(self.gW, self.gW + T.dot(x.T, dy)),
               (self.gb, self.gb + dy)]
        self.grad = theano.function(
            inputs = [x, dy],
            updates = updates_grad
        )

        updates_clear = [
               (self.gW, self.gW * 0),
               (self.gb, self.gb * 0)]
        self.clear_grad = theano.function(
            inputs = [],
            updates = updates_clear
        )

        lr = T.scalar()  # learning rate
        t = T.scalar()   # number of accumulated gradient steps (for averaging)
        updates_w = [
               (self.W, self.W - self.gW * lr / t),
               (self.b, self.b - self.gb * lr / t)]
        self.update = theano.function(
            inputs = [lr, t],
            updates = updates_w
        )
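
A minimal usage sketch for the layer above. The class name HiddenLayer and the data are illustrative only (the excerpt does not show the class statement), and init_weights / init_bias / init_gradws are assumed to return shared variables in theano.config.floatX:

import numpy as np

layer = HiddenLayer((3, 4))                  # hypothetical class name; in_size=3, out_size=4

x = np.random.randn(1, 3).astype("float32")  # one sample as a row (assuming floatX == float32)
y = layer.active(x)                          # forward pass: sigmoid(x.W + b)

d_out = y - np.ones_like(y)                  # stand-in for an upstream error signal
dy = layer.derive(d_out, y)                  # chain the error through sigmoid'

layer.clear_grad()                           # zero the gradient accumulators
layer.grad(x, dy)                            # accumulate gW and gb for this sample
layer.update(0.1, 1.0)                       # W -= gW * lr / t, b -= gb * lr / t
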
Example #2
    def __init__(self, shape, X):
        prefix = "Softmax_"
        self.in_size, self.out_size = shape
        self.W = init_weights(shape, prefix + "W")
        self.b = init_bias(self.out_size, prefix + "b")

        self.gW = init_gradws(shape, prefix + "gW")
        self.gb = init_bias(self.out_size, prefix + "gb")

        D = T.matrix("D")
        self.X = X
        def _active(X):
            return T.nnet.softmax(T.dot(X, self.W) + self.b)
        self.active = theano.function(inputs = [self.X], outputs = _active(self.X))

        def _propagate(D):
            return T.dot(D, self.W.T)
        self.propagate = theano.function(inputs = [D], outputs = _propagate(D))

        x, dy = T.rows("x","dy")
        updates_grad = [(self.gW, self.gW + T.dot(x.T, dy)),
               (self.gb, self.gb + dy)]
        self.grad = theano.function(
            inputs = [x, dy],
            updates = updates_grad
        )

        updates_clear = [
               (self.gW, self.gW * 0),
               (self.gb, self.gb * 0)]
        self.clear_grad = theano.function(
            inputs = [],
            updates = updates_clear
        )

        lr = T.scalar()
        t = T.scalar()
        updates_w = [
               (self.W, self.W - self.gW * lr / t),
               (self.b, self.b - self.gb * lr / t)]
        self.update = theano.function(
            inputs = [lr, t],
            updates = updates_w
        )

        self.params = [self.W, self.b]
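
This softmax layer has no derive step: with a cross-entropy loss, the error fed to grad is simply the predicted probabilities minus the one-hot target. A small sketch of that convention, with SoftmaxLayer as an assumed class name and floatX assumed to be float32:

import numpy as np
import theano.tensor as T

out = SoftmaxLayer((4, 3), T.matrix("X"))    # hypothetical class name

x = np.random.randn(1, 4).astype("float32")
probs = out.active(x)                        # row of class probabilities

target = np.zeros((1, 3), dtype="float32")
target[0, 1] = 1.0                           # one-hot label

dy = probs - target                          # softmax + cross-entropy error signal
out.clear_grad()
out.grad(x, dy)                              # accumulate gW and gb
dx = out.propagate(dy)                       # error signal for the layer below
out.update(0.1, 1.0)
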
Example #3
    def __init__(self, rng, layer, shape, X, is_train = 1, batch_size = 1, p = 0.5):
        prefix = "GRU_"
        self.in_size, self.out_size = shape

        self.W_xr = init_weights((self.in_size, self.out_size), prefix + "W_xr" + "_" + layer)
        self.W_hr = init_weights((self.out_size, self.out_size), prefix + "W_hr" + "_" + layer)
        self.b_r = init_bias(self.out_size, prefix + "b_r" + "_" + layer)
        
        self.W_xz = init_weights((self.in_size, self.out_size), prefix + "W_xz" + "_" + layer)
        self.W_hz = init_weights((self.out_size, self.out_size), prefix + "W_hz" + "_" + layer)
        self.b_z = init_bias(self.out_size, prefix + "b_z" + "_" + layer)

        self.W_xh = init_weights((self.in_size, self.out_size), prefix + "W_xh" + "_" + layer)
        self.W_hh = init_weights((self.out_size, self.out_size), prefix + "W_hh" + "_" + layer)
        self.b_h = init_bias(self.out_size, prefix + "b_h" + "_" + layer)

        # for gradients
        self.gW_xr = init_gradws((self.in_size, self.out_size), prefix + "gW_xr" + "_" + layer)
        self.gW_hr = init_gradws((self.out_size, self.out_size), prefix + "gW_hr" + "_" + layer)
        self.gb_r = init_bias(self.out_size, prefix + "gb_r" + "_" + layer)
        
        self.gW_xz = init_gradws((self.in_size, self.out_size), prefix + "gW_xz" + "_" + layer)
        self.gW_hz = init_gradws((self.out_size, self.out_size), prefix + "gW_hz" + "_" + layer)
        self.gb_z = init_bias(self.out_size, prefix + "gb_z" + "_" + layer)

        self.gW_xh = init_gradws((self.in_size, self.out_size), prefix + "gW_xh" + "_" + layer)
        self.gW_hh = init_gradws((self.out_size, self.out_size), prefix + "gW_hh" + "_" + layer)
        self.gb_h = init_bias(self.out_size, prefix + "gb_h" + "_" + layer)
    
        def _active(x, pre_h):
            r = T.nnet.sigmoid(T.dot(x, self.W_xr) + T.dot(pre_h, self.W_hr) + self.b_r)
            z = T.nnet.sigmoid(T.dot(x, self.W_xz) + T.dot(pre_h, self.W_hz) + self.b_z)
            gh = T.tanh(T.dot(x, self.W_xh) + T.dot(r * pre_h, self.W_hh) + self.b_h)
            h = z * pre_h + (1 - z) * gh
            return r, z, gh, h
        self.X = X
        H = T.matrix("H")
        [r, z, gh, h], updates = theano.scan(_active, sequences=[self.X], outputs_info=[None, None, None, H])
        self.active = theano.function(
            inputs = [self.X, H],
            outputs = [r, z, gh, h]
        )

        # flatten the scan output from (steps, 1, out_size) to (steps, out_size)
        h = T.reshape(h, (self.X.shape[0], self.out_size))
        # dropout: drop units while training, rescale by the keep probability at test time
        if p > 0:
            srng = T.shared_randomstreams.RandomStreams(rng.randint(999999))
            mask = srng.binomial(n = 1, p = 1 - p, size = h.shape, dtype = theano.config.floatX)
            self.activation = T.switch(T.eq(is_train, 1), h * mask, h * (1 - p))
        else:
            self.activation = h

        # TODO: move this per-step backward pass into a scan
        def _derive(prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz):
            dh = prop + T.dot(post_dr, self.W_hr.T) + T.dot(post_dz, self.W_hz.T) + T.dot(post_dgh * post_r, self.W_hh.T) + post_dh * z
            dgh = dh * (1 - z) * (1 - gh ** 2)
            dr = T.dot(dgh * pre_h, self.W_hh.T) * ((1 - r) * r)
            dz = (dh * (pre_h - gh)) * ((1 - z) * z)
            return dh, dgh, dr, dz
        prop, r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz, post_r = \
                T.matrices("prop", "r", "z", "gh", "pre_h", "post_dh", "post_dgh", "post_dr", "post_dz", "post_r")
        self.derive = theano.function(
            inputs = [prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz],
            outputs = _derive(prop, r, post_r, z, gh, pre_h, post_dh, post_dgh, post_dr, post_dz)
        )

        x, dz, dr, dgh = T.rows("x", "dz", "dr", "dgh")
        updates_grad = [(self.gW_xr, self.gW_xr + T.dot(x.T, dr)),
               (self.gW_xz, self.gW_xz + T.dot(x.T, dz)),
               (self.gW_xh, self.gW_xh + T.dot(x.T, dgh)),
               (self.gW_hr, self.gW_hr + T.dot(pre_h.T, dr)),
               (self.gW_hz, self.gW_hz + T.dot(pre_h.T, dz)),
               (self.gW_hh, self.gW_hh + T.dot((r * pre_h).T, dgh)),
               (self.gb_r, self.gb_r + dr),
               (self.gb_z, self.gb_z + dz),
               (self.gb_h, self.gb_h + dgh)]
        self.grad = theano.function(
            inputs = [x, r, pre_h, dz, dr, dgh],
            updates = updates_grad
        )

        updates_clear = [
               (self.gW_xr, self.gW_xr * 0),
               (self.gW_xz, self.gW_xz * 0),
               (self.gW_xh, self.gW_xh * 0),
               (self.gW_hr, self.gW_hr * 0),
               (self.gW_hz, self.gW_hz * 0),
               (self.gW_hh, self.gW_hh * 0),
               (self.gb_r, self.gb_r * 0),
               (self.gb_z, self.gb_z * 0),
               (self.gb_h, self.gb_h * 0)]
        self.clear_grad = theano.function(
            inputs = [],
            updates = updates_clear
        )

        lr = T.scalar()
        t = T.scalar()
        tm1 = T.scalar()  # "t minus 1": step count used for the recurrent weight updates
        updates_w = [
               (self.W_xr, self.W_xr - self.gW_xr * lr / t),
               (self.W_xz, self.W_xz - self.gW_xz * lr / t),
               (self.W_xh, self.W_xh - self.gW_xh * lr / t),
               (self.W_hr, self.W_hr - self.gW_hr * lr / tm1),
               (self.W_hz, self.W_hz - self.gW_hz * lr / tm1),
               (self.W_hh, self.W_hh - self.gW_hh * lr / tm1),
               (self.b_r, self.b_r - self.gb_r * lr / t),
               (self.b_z, self.b_z - self.gb_z * lr / t),
               (self.b_h, self.b_h - self.gb_h * lr / t)]

        self.update = theano.function(
            inputs = [lr, t, tm1],
            updates = updates_w
        )

        DZ, DR, DGH = T.matrices("DZ", "DR", "DGH")
        def _propagate(DR, DZ, DGH):
            return (T.dot(DR, self.W_xr.T) + T.dot(DZ, self.W_xz.T) + T.dot(DGH, self.W_xh.T))
        self.propagate = theano.function(inputs = [DR, DZ, DGH], outputs = _propagate(DR, DZ, DGH))

        self.params = [self.W_xr, self.W_hr, self.b_r,
                       self.W_xz, self.W_hz, self.b_z,
                       self.W_xh, self.W_hh, self.b_h]
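
This GRU layer does backpropagation through time by hand: active runs the forward scan, and derive / grad are then called once per time step in reverse, threading each step's deltas (and reset gate) into the step before it, with zeros at the final step. A rough sketch of that calling convention only; GRULayer is an assumed class name, the error signals are random stand-ins, and p = 0 disables dropout:

import numpy as np
import theano.tensor as T

rng = np.random.RandomState(42)
gru = GRULayer(rng, "0", (4, 8), T.matrix("X"), p = 0)  # hypothetical class name

steps = 5
xs = np.random.randn(steps, 4).astype("float32")        # one row per time step
h0 = np.zeros((1, 8), dtype = "float32")                # initial hidden state
r, z, gh, h = gru.active(xs, h0)                        # each output: (steps, 1, 8)

prop = np.random.randn(steps, 8).astype("float32")      # stand-in for upstream errors

zeros = np.zeros((1, 8), dtype = "float32")
post_dh = post_dgh = post_dr = post_dz = post_r = zeros # deltas from step t + 1
gru.clear_grad()
for t in reversed(range(steps)):
    pre_h = h0 if t == 0 else h[t - 1].reshape(1, -1)
    dh, dgh, dr, dz = gru.derive(
        prop[t].reshape(1, -1), r[t].reshape(1, -1), post_r,
        z[t].reshape(1, -1), gh[t].reshape(1, -1), pre_h,
        post_dh, post_dgh, post_dr, post_dz)
    gru.grad(xs[t].reshape(1, -1), r[t].reshape(1, -1), pre_h, dz, dr, dgh)
    post_dh, post_dgh, post_dr, post_dz = dh, dgh, dr, dz
    post_r = r[t].reshape(1, -1)
gru.update(0.1, float(steps), float(steps - 1))         # lr, t, t - 1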