Example #1
    def grad_y(self, Y):
        # Assemble the gradient block-wise: each sub-nonlinearity handles
        # its own slice of Y. The output ranges coincide with the input
        # ranges here, so only input_ranges is needed.
        ANS = g.zeros((len(Y), self.input_size))
        for f, (ir0, ir1) in zip(self.fns, self.input_ranges):
            ANS[:, ir0:ir1] = f.grad_y(Y[:, ir0:ir1])
        return ANS
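For reference, the same slicing pattern can be exercised standalone. A minimal sketch, assuming NumPy stands in for the g (gnumpy) module and using a hypothetical Tanh block whose grad_y is expressed in terms of its output:

    import numpy as np

    class Tanh:
        @staticmethod
        def grad_y(y):
            # d tanh(x)/dx written in terms of y = tanh(x)
            return 1 - y * y

    fns = [Tanh(), Tanh()]
    input_ranges = [(0, 3), (3, 5)]
    Y = np.tanh(np.random.randn(4, 5))

    ANS = np.zeros((len(Y), 5))
    for f, (ir0, ir1) in zip(fns, input_ranges):
        ANS[:, ir0:ir1] = f.grad_y(Y[:, ir0:ir1])
    assert ANS.shape == (4, 5)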
Example #2
    def __init__(self, *args):
        # args is a flat sequence of (fn, (in_size, out_size), weight)
        # triples, so its length must be a multiple of three
        assert len(args) % 3 == 0
        fns, sizes, weights = args[::3], args[1::3], args[2::3]

        from ersatz.mrnn.pylab import array
        input_sizes = array([x for (x, y) in sizes])
        output_sizes = array([y for (x, y) in sizes])

        self.fns = fns
        self.input_sizes = input_sizes
        self.output_sizes = output_sizes
        self.weights = array(weights)

        self.input_ranges = []
        ptr = 0
        for s in input_sizes:
            a = ptr
            b = ptr + s
            self.input_ranges.append((a, b))
            ptr = b
        assert ptr == input_sizes.sum()

        self.output_ranges = []
        ptr = 0
        for s in output_sizes:
            a = ptr
            b = ptr + s
            self.output_ranges.append((a, b))
            ptr = b
        assert ptr == output_sizes.sum()

        self.input_size = input_sizes.sum()
        self.output_size = output_sizes.sum()

        self.loss_acc = g.zeros(len(fns))
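The range bookkeeping above is the part that is easiest to get wrong; a standalone sketch with made-up sizes shows what input_ranges ends up holding:

    input_sizes = [3, 2, 4]
    input_ranges = []
    ptr = 0
    for s in input_sizes:
        input_ranges.append((ptr, ptr + s))
        ptr += s
    assert ptr == sum(input_sizes)
    print(input_ranges)  # [(0, 3), (3, 5), (5, 9)]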
Example #3
    def __init__(
            self,
            # give it what it needs: sizes etc
            v,
            h,
            f,
            o,
            hid_nonlin,  # the hidden nonlinearity
            out_nonlin,  # the output nonlinearity
            struct_damp_nonlin=nonlin.Tanh,  # the structural damping nonlinearity
            init=True,
            number_of_timesteps_to_use=99999999):
        """
        v int number of features
        h int number of hidden units
        f int number of factored units
        o int number of output units
        hid_nonlin function hidden layer nonlinearity function
        out_nonlin function output nonlinearity function
        struct_damp_nonlin function
        init bool whether to initialize arrays to random data
        """
        self.v = v
        self.h = h
        self.f = f
        self.o = o

        # fall back to default nonlinearities when None is passed
        self.hid_nonlin = hid_nonlin if hid_nonlin is not None else nonlin.Tanh
        self.out_nonlin = out_nonlin if out_nonlin is not None else nonlin.Lin
        self.struct_damp_nonlin = struct_damp_nonlin

        self.number_of_timesteps_to_use = number_of_timesteps_to_use

        if init:
            self.h_init = g.randn(1, h)
            self.W_hf = g.randn(h, f)
            self.W_fh = g.randn(f, h)
            #self.W_hh = g.randn(h, h)
            self.f_bias = g.zeros((1, f))

            self.W_vh = g.randn(v, h)
            self.W_vf = g.randn(v, f)
            self.W_ho = g.randn(h, o)

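The W_vh/W_vf/W_hf/W_fh matrices implement a factored (multiplicative) hidden update, which the backward pass in Example #8 differentiates. A rough NumPy sketch of one forward step consistent with that backward pass (the variable names and the tanh gate are assumptions):

    import numpy as np

    v, h, f = 5, 8, 6
    W_hf, W_fh = np.random.randn(h, f), np.random.randn(f, h)
    W_vh, W_vf = np.random.randn(v, h), np.random.randn(v, f)
    f_bias = np.zeros((1, f))

    x_t = np.random.randn(2, v)        # batch of 2 input vectors
    h_prev = np.random.randn(2, h)

    A = h_prev @ W_hf                  # hidden -> factors
    B = np.tanh(x_t @ W_vf)            # input -> factor gates
    HX = (A * (B + f_bias)) @ W_fh + x_t @ W_vh
    h_t = np.tanh(HX)                  # hid_nonlin
    assert h_t.shape == (2, h)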
Example #4
    def __call__(self, X):
        _, y = X.shape
        assert y % 2 == 0

        b = X[:, :y // 2]   # natural parameter b = a * mean
        a_ = X[:, y // 2:]  # pre-nonlinearity precision
        a = self.f(a_)

        ans = g.zeros(X.shape)
        ans[:, :y // 2] = b / a  # mean
        ans[:, y // 2:] = a      # precision

        return ans
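So the call converts the packed natural parameters [b | a_] into the packed moment parameters [mean | precision]. A standalone NumPy sketch, assuming f = exp to keep the precision positive:

    import numpy as np

    def call(X, f=np.exp):
        _, y = X.shape
        assert y % 2 == 0
        b, a_ = X[:, :y // 2], X[:, y // 2:]
        a = f(a_)                      # precision, positive by construction
        out = np.zeros_like(X)
        out[:, :y // 2] = b / a        # mean m = b / a
        out[:, y // 2:] = a            # precision
        return out

    P = call(np.random.randn(3, 4))
    assert (P[:, 2:] > 0).all()        # precisions are positive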
Example #5
    def grad(self, XP, O, M):
        _, y = XP.shape
        assert y % 2 == 0

        b = XP[:, :y // 2]
        a_ = XP[:, y // 2:]
        a = self.f(a_)
        a_d = self.f_prime_y(a)

        G = g.zeros(XP.shape)

        m = b / a  # predicted mean
        G[:, :y // 2] = -(O - m)

        ## the gradient wrt the precision half:
        s = 1. / a  # predicted variance

        E_OO = s + m * m  # E[O^2] = variance + mean^2
        G[:, y // 2:] = (O * O - E_OO) * 0.5 * a_d

        return G * M
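This gradient matches the Gaussian negative log-likelihood 0.5*a*(O-m)**2 - 0.5*log(a) with mean m = b/a and variance s = 1/a. A quick finite-difference check of the b-half, again assuming f = exp:

    import numpy as np

    def nll(XP, O, f=np.exp):
        y = XP.shape[1]
        b, a = XP[:, :y // 2], f(XP[:, y // 2:])
        m = b / a
        return (0.5 * a * (O - m) ** 2 - 0.5 * np.log(a)).sum()

    rng = np.random.default_rng(0)
    XP, O = rng.normal(size=(2, 4)), rng.normal(size=(2, 2))

    eps = 1e-6
    XPp = XP.copy()
    XPp[0, 0] += eps
    numeric = (nll(XPp, O) - nll(XP, O)) / eps

    m = XP[:, :2] / np.exp(XP[:, 2:])
    analytic = -(O - m)[0, 0]          # G[:, :y // 2] from the code above
    assert abs(numeric - analytic) < 1e-4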
Example #6
    def __call__(self, x):
        """
        Return batch number x
        x: int
        return: V,O,P arrays
        """

        if x < 0:
            mode = 'test'
        else:
            mode = 'train'
        batch_id = x

        batch = self.create_batch(mode, batch_id)

        T = batch.shape[0]
        Vs = []
        Os = []
        Ms = []
        for t, timestep in enumerate(batch):
            num_features = timestep.shape[1]
            # drop padding rows (rows that are entirely NaN)
            timestep = timestep[~np.isnan(timestep)].reshape(
                (-1, num_features))
            if timestep.size == 0:
                # every remaining row is NaN: no sample in this batch
                # reaches this timestep, so stop processing further
                # timesteps
                break
            batch_size = timestep.shape[0]
            v = timestep[:, :-self.O]
            o = timestep[:, -self.O:]
            if mode == 'train' and t < T - self.true_T:
                m = g.zeros((batch_size, 1))
            else:
                m = g.ones((batch_size, 1))
            Vs.append(g.garray(v))
            Os.append(g.garray(o))
            Ms.append(m)
        return Vs, Os, Ms
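The loader relies on a padding convention: once a sequence has ended, its row in later timesteps is all NaNs. A small sketch of the filtering step on fabricated data:

    import numpy as np

    num_features = 3
    timestep = np.array([[1.0, 2.0, 3.0],                # live sample
                         [np.nan, np.nan, np.nan]])      # padded-out sample
    live = timestep[~np.isnan(timestep)].reshape((-1, num_features))
    assert live.shape == (1, num_features)               # batch shrinks to 1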
Example #7
    def H_prod(self, X, P, M=1, H_damping=1):
        _, y = P.shape
        assert y % 2 == 0

        m = P[:, :y // 2]  # the mean comes first
        a = P[:, y // 2:]
        a_d = self.f_prime_y(a)

        s = 1. / a

        R_b = X[:, :y // 2]
        R_a_ = X[:, y // 2:]

        # entries of the 2x2 curvature block for each output dimension
        A_00 = s
        A_01 = A_10 = (-s * m) * a_d
        A_11 = (s * m**2 + .5 * s**2) * a_d**2

        ANS = g.zeros(P.shape)
        ANS[:, :y // 2] = R_b * A_00 + R_a_ * A_01
        ANS[:, y // 2:] = R_b * A_10 + R_a_ * (
            A_11 + max(H_damping, self.min_H_damping))

        return ANS * M
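A_00, A_01 and A_11 are the entries of a 2x2 curvature block per output dimension, matching the loss in Example #5. A standalone sketch with made-up scalars, checking that the block multiply agrees and that the block is positive definite:

    import numpy as np

    m, a, a_d = 0.7, 2.0, 2.0          # made-up mean, precision, f'(a)
    s = 1. / a

    A = np.array([[s,            -s * m * a_d],
                  [-s * m * a_d, (s * m**2 + .5 * s**2) * a_d**2]])

    R = np.array([0.3, -1.2])          # a (R_b, R_a_) direction
    print(A @ R)                       # the two slices of ANS, per dimension

    # curvature blocks should be positive definite (before damping)
    assert np.all(np.linalg.eigvalsh(A) > 0)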
Example #8
    def backward_pass(self,
                      state,
                      dOX,
                      R_state=None,
                      mu=0.,
                      compute_grad2=False):
        # backprop.

        if R_state is None:
            R_HX, R_OX = None, None
        else:
            R_HX, R_OX = R_state

        V, A, B, H, OX = state
        if V[0] is not None:
            V = [None] + V
#         if A[0] is not None:
#             A = [None] + A
#         if B[0] is not None:
#             B = [None] + B
#         if H[0] is not None:
#             H = [None] + H
        if OX[0] is not None:
            OX = [None] + OX
        if dOX[0] is not None:
            dOX = [None] + dOX

        T = len(V) - 1

        # zero-initialized parameter structs with the same shapes as the model
        grad = self.unpack(self.pack() * 0)
        if compute_grad2:
            grad2 = self.unpack(self.pack() * 0)
        else:
            grad2 = None

        dH_1t = g.zeros(H[T].shape)
        prev_batch_size = H[T].shape[0]
        for t in reversed(range(1, T + 1)):

            dH_t = g.dot(dOX[t], self.W_ho.T)
            dH_t[:prev_batch_size, :] += dH_1t

            grad.W_ho += g.dot(H[t].T, dOX[t])
            if compute_grad2:
                grad2.W_ho += g.dot((H[t] * H[t]).T, dOX[t] * dOX[t])

            ## backpropagate through the nonlinearity: dHX_t is the gradient
            ## wrt the total inputs to H_t.
            dHX_t = dH_t * self.hid_nonlin.grad_y(H[t])

            ## add the structural damping contribution at this point
            if R_HX is not None:
                dHX_t += float(mu) * self.struct_damp_nonlin.H_prod(
                    R_HX[t], H[t], M=1)

            ## the (removed) W_hh gradient was accumulated here: (H[t-1], dHX_t)

            B_t_f = (B[t] + self.f_bias)
            AB = A[t] * B_t_f

            grad.W_fh += g.dot(AB.T, dHX_t)
            grad.W_vh += g.dot(V[t].T, dHX_t)
            if compute_grad2:
                _dHX2 = dHX_t * dHX_t
                grad2.W_fh += g.dot((AB * AB).T, _dHX2)
                grad2.W_vh += g.dot((V[t] * V[t]).T, _dHX2)

            ## do the intermediate backprop:
            dAB = g.dot(dHX_t, self.W_fh.T)

            dB = dAB * A[t]

            grad.f_bias += dB.sum(0)
            dBB = dB * (1 - B[t] * B[t])
            grad.W_vf += g.dot(V[t].T, dBB)

            dA = dAB * B_t_f
            Ht_minus_one = H[t - 1][:dA.shape[0], :]
            grad.W_hf += g.dot(Ht_minus_one.T, dA)

            if compute_grad2:
                grad2.f_bias += (dB * dB).sum(0)
                grad2.W_vf += g.dot((V[t] * V[t]).T, dBB * dBB)
                grad2.W_hf += g.dot((Ht_minus_one * Ht_minus_one).T, dA * dA)

            dH_1t = g.dot(dA, self.W_hf.T)
            prev_batch_size = V[t].shape[0]

        grad.h_init += dH_1t.sum(0)
        if compute_grad2:
            grad2.h_init += (dH_1t * dH_1t).sum(0)

        return grad, grad2
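The recurring pattern grad.W += X.T.dot(dY) is a per-timestep outer-product accumulation. A minimal NumPy sketch of the W_ho part alone, with a squared-error readout standing in for the real output loss:

    import numpy as np

    rng = np.random.default_rng(1)
    h, o = 4, 3
    W_ho = rng.normal(size=(h, o))
    H = [rng.normal(size=(2, h)) for _ in range(2)]      # hidden states
    targets = [rng.normal(size=(2, o)) for _ in range(2)]

    grad_W_ho = np.zeros_like(W_ho)
    for H_t, tgt in zip(H, targets):
        dOX_t = H_t @ W_ho - tgt       # dLoss/dOX for squared error
        grad_W_ho += H_t.T @ dOX_t     # same accumulation as grad.W_ho
    assert grad_W_ho.shape == (h, o)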