Example #1
    def test_multinomial_vector(self):
        random = RandomStreams(utt.fetch_seed())
        n = tensor.lvector()
        pvals = tensor.matrix()
        out = random.multinomial(n=n, pvals=pvals)
        assert out.ndim == 2
        f = function([n, pvals], out)

        n_val = [1, 2, 3]
        pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
        pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(n_val, pvals_val)
        numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(n_val[:-1], pvals_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([n, pvals], random.multinomial(n=n, pvals=pvals, size=(3,)))
        val2 = g(n_val, pvals_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
Example #3
class SampleMultinomial(Layer):
    def __init__(self, from_logits=False, **kwargs):
        super(SampleMultinomial, self).__init__(**kwargs)
        self.from_logits = from_logits
        if K.backend() == 'theano':
            from theano.tensor.shared_randomstreams import RandomStreams
            self.random = RandomStreams()
        elif K.backend() == 'tensorflow':
            import tensorflow as tf
        else:
            raise NotImplementedError

    def call(self, x, mask=None):
        if K.backend() == 'theano':
            if self.from_logits:
                # TODO: there is a more direct way from logits
                return K.argmax(self.random.multinomial(pvals=K.softmax(x)))
            else:
                return K.argmax(self.random.multinomial(pvals=x))
        elif K.backend() == 'tensorflow':
            import tensorflow as tf
            shape = K.shape(x)
            if not self.from_logits:
                x = tf.clip_by_value(x, K.epsilon(), 1 - K.epsilon())
                x = tf.log(x)
            return K.reshape(tf.multinomial(K.reshape(x, [-1, shape[-1]]), 1), shape[:-1])
        else:
            raise NotImplementedError

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]
Example #4
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
            self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        v = T.arange(0, mean_x.shape[0])
        m_x = mean_x[v, mode]
        m_y = mean_y[v, mode]
        s_x = std_x[v, mode]
        s_y = std_y[v, mode]
        r = rho[v, mode]
        # cov = r * (s_x * s_y)

        normal = srng.normal((h.shape[0], 2))
        x = normal[:, 0]
        y = normal[:, 1]

        # x_n = T.shape_padright(s_x * x + cov * y + m_x)
        # y_n = T.shape_padright(s_y * y + cov * x + m_y)

        x_n = T.shape_padright(m_x + s_x * x)
        y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1. - r**2)))

        uniform = srng.uniform((h.shape[0], ))
        pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

        return T.concatenate([x_n, y_n, pin], axis=1)
Example #5
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
            self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        v = T.arange(0, mean_x.shape[0])
        m_x = mean_x[v, mode]
        m_y = mean_y[v, mode]
        s_x = std_x[v, mode]
        s_y = std_y[v, mode]
        r = rho[v, mode]
        # cov = r * (s_x * s_y)

        normal = srng.normal((h.shape[0], 2))
        x = normal[:, 0]
        y = normal[:, 1]

        # x_n = T.shape_padright(s_x * x + cov * y + m_x)
        # y_n = T.shape_padright(s_y * y + cov * x + m_y)

        x_n = T.shape_padright(m_x + s_x * x)
        y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2)))

        uniform = srng.uniform((h.shape[0],))
        pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

        return T.concatenate([x_n, y_n, pin], axis=1)
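For reference, the per-step sampling that the two prediction methods above build symbolically can be sketched in plain NumPy: draw a mixture component with a multinomial, reparameterize a correlated 2-D Gaussian, and threshold a uniform draw against the Bernoulli parameter. The parameter values below are made-up toy numbers, not taken from either example.

import numpy as np

rng = np.random.default_rng(42)

# toy mixture parameters for a single prediction step (assumed values)
prop = np.array([0.2, 0.5, 0.3])                    # mixture weights
mean_x, mean_y = np.array([0., 1., 2.]), np.array([0., -1., 1.])
std_x, std_y = np.array([.5, .3, .2]), np.array([.4, .6, .1])
rho = np.array([0.1, -0.3, 0.5])                    # per-component correlation
bernoulli = 0.15                                    # pen-lift probability

mode = rng.multinomial(1, prop).argmax()            # choose a component
z1, z2 = rng.standard_normal(2)

# correlated 2-D Gaussian via the same reparameterization as above
x_n = mean_x[mode] + std_x[mode] * z1
y_n = mean_y[mode] + std_y[mode] * (z1 * rho[mode] + z2 * np.sqrt(1. - rho[mode] ** 2))
pin = float(bernoulli > rng.uniform())              # end-of-stroke indicator

print(x_n, y_n, pin)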
Example #6
def sparse_sample_multinomial(x, from_logits=False):
    if K.backend() == 'theano':
        from theano.tensor.shared_randomstreams import RandomStreams
        random = RandomStreams()
        if from_logits:
            # TODO: there is a more direct way from logits
            return K.argmax(random.multinomial(pvals=K.softmax(x)))
        else:
            return K.argmax(random.multinomial(pvals=x))
    elif K.backend() == 'tensorflow':
        import tensorflow as tf
        shape = K.shape(x)
        if not from_logits:
            x = tf.clip_by_value(x, K.epsilon(), 1 - K.epsilon())
            x = tf.log(x)
        return K.reshape(tf.multinomial(K.reshape(x, (-1, shape[-1])), 1), shape[:-1])
    else:
        raise NotImplementedError
Example #7
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
        hidden_out_size = hidden_size * len(h)
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred),
                       input_dim=hidden_out_size,
                       output_dim=out_size * components_size[network_mode])
    sigma_linear = Linear(name='sigma_linear' + str(pred),
                          input_dim=hidden_out_size,
                          output_dim=components_size[network_mode])
    mixing_linear = Linear(name='mixing_linear' + str(pred),
                           input_dim=hidden_out_size,
                           output_dim=components_size[network_mode])
    initialize([mu_linear, sigma_linear, mixing_linear])

    mu = mu_linear.apply(hiddens)
    mu = mu.reshape(
        (mu.shape[0], mu.shape[1], out_size, components_size[network_mode]))

    sigma_orig = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma_orig)

    mixing_orig = mixing_linear.apply(hiddens)
    e_x = T.exp(mixing_orig - mixing_orig.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)

    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 2, 'x') - mu)**2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)

    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent
    cost = -T.mean(log_gauss)

    srng = RandomStreams(seed=seed)
    mixing = mixing_orig * (1 + sampling_bias)
    sigma = T.nnet.softplus(sigma_orig - sampling_bias)
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'

    return linear_output, cost
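The sampling block at the end of MDN_output_layer (a multinomial draw over the mixing weights followed by a normal draw from the selected component) can be sketched in plain NumPy; the mixture parameters below are hypothetical toy values for one output position.

import numpy as np

rng = np.random.default_rng(0)

mixing = np.array([0.1, 0.7, 0.2])   # softmax-normalized mixture weights
mu = np.array([[0.0, 1.0, -1.0],     # per-output-dim means, one column per component
               [2.0, 0.5,  0.0]])
sigma = np.array([0.3, 0.1, 0.5])    # per-component std, shared across output dims

component = rng.multinomial(1, mixing)            # one-hot component choice
component_mean = (mu * component).sum(axis=1)     # means of the chosen component
component_std = (sigma * component).sum()         # std of the chosen component

sample = rng.normal(loc=component_mean, scale=component_std)
print(sample)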
Example #8
    def score(self, Y, Y_hat):
        # TODO fix me later when using IndexSpace

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        state_below, = owner.inputs
        assert state_below.ndim == 2

        # TODO make this more generic like above
        state_below = state_below.owner.inputs[0].owner.inputs[0]

        Y = T.argmax(Y, axis = 1)
        k = self.num_noise_samples

        if self.noise_prob is None:
            theano_rng = RandomStreams(seed = self.mlp.rng.randint(2 ** 15))
            noise = theano_rng.random_integers(size = (state_below.shape[0], self.num_noise_samples,), low=0, high = self.n_classes - 1)
            p_n = 1. / self.n_classes
            p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) + self.b[Y])
            p_x = T.nnet.sigmoid((T.concatenate([state_below] * k) * self.W[:, noise.flatten()].T).sum(axis=1) + self.b[noise.flatten()])
            # TODO is this reshape necessary?
            p_x = p_x.reshape((state_below.shape[0], k))

            #pos = k * p_n / (p_w + k * p_n) * T.log(p_w)
            #neg = (p_x / (p_x + k * p_n) * T.log(p_x)).sum(axis=1)
        else:
            #import ipdb
            #ipdb.set_trace()
            theano_rng = MRG_RandomStreams(max(self.mlp.rng.randint(2 ** 15), 1))
            assert self.mlp.batch_size is not None
            noise = theano_rng.multinomial(pvals = np.tile(self.noise_prob.get_value(), (k * self.mlp.batch_size, 1)))
            noise = T.argmax(noise, axis = 1)
            p_n = self.noise_prob
            p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) + self.b[Y])
            p_x = T.nnet.sigmoid((T.concatenate([state_below] * k) * self.W[:, noise.flatten()].T).sum(axis=1) + self.b[noise.flatten()])
            p_x = p_x.reshape((state_below.shape[0], k))

            pos = k * p_n[Y] / (p_w + k * p_n[Y]) * T.log(p_w)
            neg = (p_x / (p_x + k * p_n[noise].reshape(p_x.shape)) * T.log(p_x)).sum(axis=1)


        #return -(pos - neg).mean()
        return p_w, p_x
Example #9
    def test_multinomial(self):
        """Test that RandomStreams.multinomial generates the same results as numpy"""
        # Check over two calls to see if the random state is correctly updated.
        random = RandomStreams(utt.fetch_seed())
        fn = function([], random.multinomial((4,4), 1, [0.1]*10), updates=random.updates())

        fn_val0 = fn()
        fn_val1 = fn()

        rng_seed = numpy.random.RandomState(utt.fetch_seed()).randint(2**30)
        rng = numpy.random.RandomState(int(rng_seed)) #int() is for 32bit
        numpy_val0 = rng.multinomial(1, [0.1]*10, size=(4,4))
        numpy_val1 = rng.multinomial(1, [0.1]*10, size=(4,4))

        assert numpy.all(fn_val0 == numpy_val0)
        assert numpy.all(fn_val1 == numpy_val1)
Example #10
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean, std = self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        bs = mean.shape[0]
        v = T.arange(0, bs)
        m = mean[v, mode]  # (bs, d)
        s = std[v, mode]  # (bs, d)

        normal = srng.normal((bs, self.n_dim))  # (bs, d)
        normal_n = m + s * normal

        return normal_n
Example #12
def direct_method(var, parm, p, S, seed=233):
    S = np.asarray(S)

    reps = var[0].shape[0]

    rng = RandomStreams(seed=seed)  # TODO: make sure it runs on GPU
    r_u = rng.uniform((1, reps))

    p_n = sum(p)
    prob = [p_i / p_n for p_i in p]

    v = tt.stack(prob).reshape(
        (len(p),
         reps)).T  #This happens because of variables as dvectors (rxns, reps)

    #TODO: check prb_f for nans
    prb_f = theano.function(var + parm, v)

    tau_f = theano.function(var + parm, (1 / p_n) * tt.log(1 / r_u))

    ran_f = theano.function(var + parm, rng.multinomial(n=1, pvals=v))

    def compute(ics, parm, interval):
        x = ics
        reps = ics[0].shape[0]
        time = np.zeros(shape=(1, reps))
        out = np.zeros(shape=(reps, interval,
                              len(var) + 1))  #(rep, time, vars+time)

        for idx in range(interval):

            args = [x[0], x[1]] + parm

            time += tau_f(*args)

            ran_i = ran_f(*args)
            incr = np.dot(ran_i, S).T

            x = np.asarray(x) + incr
            out[:, idx, :] = np.concatenate((time, x)).T

        return out

    return compute
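The per-iteration step inside compute (an exponential waiting time obtained from a uniform draw, then a multinomial choice of which reaction fires) corresponds to the following plain-NumPy sketch; the propensity values are hypothetical and stand in for the symbolic p passed to direct_method.

import numpy as np

rng = np.random.default_rng(233)

p = np.array([1.5, 0.5, 2.0])   # hypothetical reaction propensities for one replicate
p_n = p.sum()

tau = (1.0 / p_n) * np.log(1.0 / rng.uniform())   # waiting time until the next event
which = rng.multinomial(1, p / p_n)               # which reaction fires (one-hot)

print(tau, which.argmax())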
Example #14
    def sample(self, param_dict):
        p_vals = param_dict['p_vals']
        if K.backend() == 'tensorflow':
            import tensorflow as tf

            shape = K.shape(p_vals)
            reshaped_params = K.reshape(p_vals, (-1, self.n_classes))
            samples = tf.multinomial(logits=tf.log(reshaped_params),
                                     num_samples=1)[:, 0]
            # a hack to turn it into one-hot
            onehot = tf.constant(np.eye(self.n_classes, dtype=np.float32))
            result = tf.nn.embedding_lookup(onehot, samples)
            result = K.reshape(result, shape)
            return result
        else:
            from theano.tensor.shared_randomstreams import RandomStreams
            random = RandomStreams()
            return random.multinomial(size=K.shape(p_vals)[:-1],
                                      n=1,
                                      pvals=p_vals,
                                      dtype='float32')
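The one-hot "hack" in the TensorFlow branch above (an embedding lookup into an identity matrix) amounts to indexing an identity matrix with the sampled class ids; a NumPy sketch with made-up sample indices:

import numpy as np

n_classes = 4
samples = np.array([2, 0, 3])          # hypothetical sampled class indices

onehot = np.eye(n_classes, dtype=np.float32)
result = onehot[samples]               # same effect as tf.nn.embedding_lookup(onehot, samples)
print(result)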
Example #15
    def test_default_shape(self):
        random = RandomStreams(utt.fetch_seed())
        f = function([], random.uniform())
        g = function([], random.multinomial())

        #seed_rng is generator for generating *seeds* for RandomStates
        seed_rng = numpy.random.RandomState(utt.fetch_seed())
        uniform_rng = numpy.random.RandomState(int(seed_rng.randint(2**30)))
        multinomial_rng = numpy.random.RandomState(int(seed_rng.randint(2**30)))

        val0 = f()
        val1 = f()
        numpy_val0 = uniform_rng.uniform()
        numpy_val1 = uniform_rng.uniform()
        assert numpy.allclose(val0, numpy_val0)
        assert numpy.allclose(val1, numpy_val1)

        for i in range(10): # every test has 50% chance of passing even with non-matching random states
            val2 = g()
            numpy_val2 = multinomial_rng.multinomial(n=1, pvals=[.5, .5])
            assert numpy.all(val2 == numpy_val2)
Example #16
class SRNN(Model):
    def __init__(
        self,
        name,  # a string for identifying model.
        numvis,
        numhid,
        numframes,
        output_type="real",
        cheating_level=0.0,  # cheating by looking at x_t (instead of x_tm1)
        numpy_rng=None,
        theano_rng=None,
    ):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numhid = numhid
        self.numlayers = 2
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(np.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name="inputs")

        # set up params
        self.whh = [
            theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name="whh0"),
            theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name="whh1"),
        ]
        self.whx = [
            theano.shared(
                value=self.numpy_rng.uniform(low=-0.01, high=0.01, size=(self.numhid, self.numvis)).astype(
                    theano.config.floatX
                ),
                name="whx0",
            ),
            theano.shared(
                value=self.numpy_rng.uniform(low=-0.01, high=0.01, size=(self.numhid, self.numvis)).astype(
                    theano.config.floatX
                ),
                name="whx1",
            ),
        ]
        self.wxh = [
            theano.shared(
                value=self.numpy_rng.uniform(low=-0.01, high=0.01, size=(self.numvis, self.numhid)).astype(
                    theano.config.floatX
                ),
                name="wxh0",
            ),
            theano.shared(
                value=self.numpy_rng.uniform(low=-0.01, high=0.01, size=(self.numhid, self.numhid)).astype(
                    theano.config.floatX
                ),
                name="wxh1",
            ),
        ]
        self.bhid = [
            theano.shared(value=np.zeros(self.numhid, dtype=theano.config.floatX), name="bhid0"),
            theano.shared(value=np.zeros(self.numhid, dtype=theano.config.floatX), name="bhid1"),
        ]
        self.bx = theano.shared(value=np.zeros(self.numvis, dtype=theano.config.floatX), name="bx")
        self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx]

        self._batchsize = self.inputs.shape[0]

        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis)
        ).transpose(1, 0, 2)

        # one-step prediction, used by sampling function
        self.hids_t0 = [T.zeros((self._batchsize, self.numhid)), T.zeros((self._batchsize, self.numhid))]
        self.hids_t1 = [
            ReLU(T.dot(self.hids_t0[0], self.whh[0]) + T.dot(self._input_frames[0], self.wxh[0]) + self.bhid[0])
        ]
        self.hids_t1.append(
            ReLU(T.dot(self.hids_t0[1], self.whh[1]) + T.dot(self.hids_t1[-1], self.wxh[1]) + self.bhid[1])
        )

        self.x_pred_1 = self.bx
        for k in range(2):
            self.x_pred_1 += T.dot(self.hids_t1[k], self.whx[k])
        # end of one-step prediction

        def step(x_tm1, hids_tm1):
            hids_tm1 = [hids_tm1[:, k * self.numhid : (k + 1) * self.numhid] for k in range(2)]
            hids_t = [ReLU(T.dot(hids_tm1[0], self.whh[0]) + T.dot(x_tm1, self.wxh[0]) + self.bhid[0])]
            hids_t.append(ReLU(T.dot(hids_tm1[1], self.whh[1]) + T.dot(hids_t[-1], self.wxh[1]) + self.bhid[1]))

            x_pred_t = self.bx
            for k in range(2):
                x_pred_t += T.dot(hids_t[k], self.whx[k])
            return x_pred_t, T.concatenate(hids_t, 1)

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step, sequences=self._input_frames, outputs_info=[None, T.concatenate(self.hids_t0, 1)]
        )

        # set up output prediction
        if self.output_type == "real":
            self._prediction = self._predictions[:, :, : self.numvis]
        elif self.output_type == "binary":
            self._prediction = sigmoid(self._predictions[:, :, : self.numvis])
        elif self.output_type == "softmax":
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, : self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1], self.numvis)
                )
            ).reshape((self._predictions.shape[0], self._predictions.shape[1], self.numvis))
        else:
            raise ValueError("unsupported output_type")

        # set cost
        self._prediction_for_training = self._prediction[: self.numframes - 1]
        if self.output_type == "real":
            self._cost = T.mean((self._prediction_for_training - self._input_frames[1 : self.numframes]) ** 2)
            self._cost_varlen = T.mean((self._prediction - self._input_frames[1:]) ** 2)
        elif self.output_type == "binary":
            self._cost = -T.mean(
                self._input_frames[1 : self.numframes] * T.log(self._prediction_for_training)
                + (1.0 - self._input_frames[1 : self.numframes]) * T.log(1.0 - self._prediction_for_training)
            )
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction_for_training)
                + (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction)
            )
        elif self.output_type == "softmax":
            self._cost = -T.mean(T.log(self._prediction_for_training) * self._input_frames[1 : self.numframes])
            self._cost_varlen = -T.mean(T.log(self._prediction) * self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano function for computing cost and grad
        self.cost = theano.function([self.inputs], self._cost, updates=self.updates)
        self.grads = theano.function([self.inputs], self._grads, updates=self.updates)

        # another set of variables
        # give a few time steps of characters and let the model predict all the rest.
        self.inputs_var = T.fmatrix("inputs_var")
        self.nsteps = T.lscalar("nsteps")
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, : self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))), axis=1
        )

        self.predict = theano.function(
            [self.inputs_var, theano.Param(self.nsteps, default=self.numframes - 4)],
            self._prediction.transpose(1, 0, 2).reshape((self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens,
        )

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()

        return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == "softmax"
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0],
            [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1 / temperature)), self.hids_1],
        )
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(numcases, pvals=np.ones(self.numvis) / np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
class MDN(object):
    """Mixture Density Network
    """

    def __init__(self, input, rng, n_in, n_hiddens, hid_activations, n_out, out_activation, n_components):
        """Initialize the parameters for the multilayer perceptron

        :type rng: np.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hiddens: list of int
        :param n_hiddens: a list of the number of units in each hidden layer

        :type hid_activations: list of lambdas
        :param hid_activations: a list of activations used in each hidden layer
        
        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        from theano.tensor.shared_randomstreams import RandomStreams

        self.srng = RandomStreams(seed=1234)

        self.input = input

        # We are dealing with multiple hidden layers MLP
        layer0 = NetworkLayer(rng=rng, input=input, n_in=n_in, n_out=n_hiddens[0], activation=hid_activations[0])

        h_layers = [("hiddenLayer0", layer0)]

        for i in range(1, len(n_hiddens)):
            h_layers.append(
                (
                    "hiddenLayer%d" % i,
                    NetworkLayer(
                        rng=rng,
                        input=h_layers[i - 1][1].output,
                        n_in=n_hiddens[i - 1],
                        n_out=n_hiddens[i],
                        activation=hid_activations[i],
                    ),
                )
            )

        self.__dict__.update(dict(h_layers))

        # The output layer gets as input the hidden units
        # of the hidden layer
        self.outputLayer = MDNoutputLayer(
            rng=rng, input=h_layers[-1][1].output, n_in=n_hiddens[-1], n_out=n_out, n_components=n_components
        )

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.outputLayer.W_sigma ** 2).sum() + (self.outputLayer.W_mixing ** 2).sum()

        for i in range(len(n_hiddens)):
            self.L2_sqr += (self.__dict__["hiddenLayer%d" % i].W ** 2).sum()

        # the parameters of the model are the parameters of the all layers it
        # is made out of
        params = self.outputLayer.params
        for layer in h_layers:
            params.extend(layer[1].params)
        self.params = params

    def set_symbolic_input(self, input):
        """We use this function to bind a symbolic variable with the input
        of the network layer. Added to specify that in training time."""
        self.input = input

    #    def train(self, x, y, training_loss, learning_rate,
    def train(self, y, training_loss, learning_rate, n_epochs, train_x, train_y, valid_x, valid_y, batch_size):
        """Train the MLP using SGD"""

        index = T.iscalar()  # index to a [mini]batch
        lr = T.scalar()  # learning rate symbolic

        # index.tag.test_value = 1
        gparams = []
        for param in self.params:
            gparam = T.grad(training_loss, param)
            gparams.append(gparam)

        updates = []

        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * T.cast(lr, dtype=theano.config.floatX)))

        try:
            train_model = theano.function(
                inputs=[index, lr],
                outputs=[training_loss],
                updates=updates,
                givens={
                    self.input: train_x[index * batch_size : (index + 1) * batch_size],
                    y: train_y[index * batch_size : (index + 1) * batch_size],
                },
            )
        except:
            import pdb

            pdb.set_trace()

        validate_model = theano.function(
            inputs=[index],
            outputs=NLL(sigma=self.outputLayer.sigma, mixing=self.outputLayer.mixing, y=y),
            givens={
                self.input: valid_x[index * batch_size : (index + 1) * batch_size],
                y: valid_y[index * batch_size : (index + 1) * batch_size],
            },
        )

        # compute number of minibatches for training and validation
        n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
        n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size

        validate_MSE = theano.function(
            inputs=[index],
            outputs=MSE(self.samples(), y=y),
            givens={
                self.input: valid_x[index * batch_size : (index + 1) * batch_size],
                y: valid_y[index * batch_size : (index + 1) * batch_size],
            },
        )

        print "training..."

        start_time = time.clock()
        epoch = 0

        total_training_costs = []
        total_validation_costs = []
        total_validation_MSE = []

        lr_time = 0
        lr_step = learning_rate / ((train_x.get_value().shape[0] * 1.0 / batch_size) * (n_epochs - 30))
        lr_val = learning_rate

        while epoch < n_epochs:
            epoch = epoch + 1
            epoch_training_costs = []
            # import pdb; pdb.set_trace()
            for minibatch_index in xrange(n_train_batches):

                # linear annealing after 40 epochs...
                if epoch > 40:
                    # lr_val = learning_rate / (1.0+lr_time)
                    # lr_time = lr_time + 1
                    lr_val = lr_val - lr_step
                else:
                    lr_val = learning_rate

                loss_value = train_model(minibatch_index, lr_val)
                epoch_training_costs.append(loss_value)

                if np.isnan(loss_value):
                    import pdb

                    pdb.set_trace()
                    print "got NaN in NLL"
                    sys.exit(1)

            this_training_cost = np.mean(epoch_training_costs)
            this_validation_cost = np.mean([validate_model(i) for i in xrange(n_valid_batches)])
            this_validation_MSE = np.mean([validate_MSE(i) for i in xrange(n_valid_batches)])

            total_training_costs.append(this_training_cost)
            total_validation_costs.append(this_validation_cost)
            total_validation_MSE.append(this_validation_MSE)

            print "epoch %i, training NLL %f, validation NLL %f, MSE %f" % (
                epoch,
                this_training_cost,
                this_validation_cost,
                this_validation_MSE,
            )

        end_time = time.clock()

        print "Training took %.2f minutes..." % ((end_time - start_time) / 60.0)

        # return losses and parameters..
        return total_training_costs, total_validation_costs, total_validation_MSE

    def samples(self):
        component = self.srng.multinomial(pvals=self.outputLayer.mixing)
        component_std = T.sum(self.outputLayer.sigma * component, axis=1, keepdims=True)

        samples = self.srng.normal(std=component_std)
        return samples

    def save_model(self, filename="MLP.save", output_folder="output_folder"):
        """
        This function pickles the parameters in a file for later usage
        """
        storage_file = open(os.path.join(output_folder, filename), "wb")
        cPickle.dump(self, storage_file, protocol=cPickle.HIGHEST_PROTOCOL)
        storage_file.close()

    @staticmethod
    def load_model(filename="MLP.save", output_folder="output_folder"):
        """
        This function loads pickled parameters from a file
        """
        storage_file = open(os.path.join(output_folder, filename), "rb")
        model = cPickle.load(storage_file)
        storage_file.close()
        return model
Example #18
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred, task):
    if connect_h_to_o:
        if separate_last_hidden:
            dedicated_last_h = h[-1][:, :,
                                     task * dedicated_last_h_size:(task + 1) *
                                     dedicated_last_h_size]
            shared_last_h = h[-1][:, :,
                                  len(game_tasks) * dedicated_last_h_size:]
            shared_last_h_size = hidden_size - len(
                game_tasks) * dedicated_last_h_size
            hiddens = T.concatenate([
                hidden
                for hidden in h[0:-1] + [dedicated_last_h] + [shared_last_h]
            ],
                                    axis=2)
            hidden_out_size = hidden_size * (
                len(h) - specialized_layer_num - 1
            ) + specialized_hidden_size * specialized_layer_num + dedicated_last_h_size + shared_last_h_size
        else:
            hiddens = T.concatenate([hidden for hidden in h], axis=2)
            hidden_out_size = hidden_size * (
                len(h) - specialized_layer_num
            ) + specialized_hidden_size * specialized_layer_num
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred),
                       input_dim=hidden_out_size,
                       output_dim=out_size * components_size)
    sigma_linear = Linear(name='sigma_linear' + str(pred),
                          input_dim=hidden_out_size,
                          output_dim=components_size)
    mixing_linear = Linear(name='mixing_linear' + str(pred),
                           input_dim=hidden_out_size,
                           output_dim=components_size)
    initialize([mu_linear, sigma_linear, mixing_linear])
    mu = mu_linear.apply(hiddens)
    mu = mu.reshape((mu.shape[0], mu.shape[1], out_size, components_size))
    sigma = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma)
    mixing = mixing_linear.apply(hiddens)
    # apply softmax to mixing
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)
    # calculate cost
    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 2, 'x') - mu)**2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)
    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent
    # multiply by the task ( 0 if the cost is not related to this task, 1 otherwise)
    if task_specialized:
        task_index = in_size - len(game_tasks) + task
        log_gauss = T.mul(log_gauss,
                          T.sub(x[:, :, task_index:task_index + 1], 1))
    # mean over the batch, mean over sequence
    cost = -T.mean(log_gauss, axis=1).mean()

    # sampling
    srng = RandomStreams(seed=seed)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'
    return linear_output, cost
Example #19
class SetRBM(object):
    """
    The Restricted Boltzmann Machine learning algorithm.
    """
    def __init__(self, n_visibles, n_hiddens, n_classes,
            W=None, U=None, b=None, c=None, d=None, learning_rate=0.1, K=1):
        self.n_visibles = n_visibles
        self.n_hiddens = n_hiddens
        self.n_classes = n_classes
        
        self.x = T.matrix('x')
        self.y = T.vector('y')
        
        if W is None:
            W_value = numpy.asarray( numpy.random.normal(
                      loc=0,
                      scale=0.01,
                      size = (n_visibles, n_hiddens)),
                      dtype = theano.config.floatX)
            W = theano.shared(value=W_value, name='W')

        if U is None:
            U_value = numpy.asarray( numpy.random.normal(
                      loc=0,
                      scale=0.01,
                      size = (n_classes, n_hiddens)),
                      dtype = theano.config.floatX)
            U = theano.shared(value=U_value, name='U')

        if b is None :
            b = theano.shared(value=numpy.zeros(n_hiddens,
                                dtype=theano.config.floatX), name='b')

        if c is None :
            c = theano.shared(value=numpy.zeros(n_visibles,
                                dtype=theano.config.floatX),name='c')
        
        if d is None :
            d = theano.shared(value=numpy.zeros(n_classes,
                                dtype=theano.config.floatX),name='d')
        
        self.W = W
        self.U = U
        self.b = b
        self.c = c
        self.d = d
        self.params = [self.W, self.U, self.b, self.c, self.d]
        self.theano_rng = RandomStreams(numpy.random.randint(2**30))
        
        self.learning_rate = theano.shared(numpy.asarray(learning_rate,
            dtype=theano.config.floatX))
        self.K = K
        
        cost, updates = self.__train()
        self.train = theano.function([self.x, self.y], cost,
            updates=updates)
        self.trainables = map(lambda x: x, updates)
        
        # TODO need way to compute to marginalize g from y
        #self.transform = theano.function([self.x], self._mean_g(self.x).sum(0))
        self.output = theano.function([self.x], self._output(self.x))
    
    def _free_energy(self, x, y):
        bias_term = T.dot(y, self.d) + T.dot(x, self.c)
        softmax_x = T.log(T.exp(self._softminus(
            T.dot(x, self.W) + self.b)).sum(0))
        hidden_term = T.nnet.softplus(T.dot(y, self.U) + softmax_x).sum()
        
        return -bias_term - hidden_term
    
    def _output(self, x):
        softmax_x = T.log(T.exp(self._softminus(
            T.dot(x, self.W) + self.b)).sum(0))
        output = -T.nnet.softplus(self.U + softmax_x).sum(1)
        
        return T.argmax(T.nnet.softmax(output))
    
    def _softminus(self, x):
        return x - T.nnet.softplus(x)
    
    def _act(self, x, y):
        return self._softminus(self.b + T.dot(x, self.W)) + T.dot(y, self.U)
    
    def _mean_g(self, x, y):
        act = self._act(x, y)
        
        return T.exp(act) / (1. + T.exp(act).sum(0)), 1. / (1. + T.exp(act).sum(0))
    
    def _mean_h(self, g, x):
        return T.maximum(g, T.nnet.sigmoid(T.dot(x, self.W) + self.b))
    
    def _mean_x(self, h):
        return T.dot(h, self.W.T) + self.c
    
    def _mean_y(self, g):
        return T.nnet.softmax(T.dot(g, self.U.T).sum(0) + self.d)
    
    def _sample_g(self, x, y):
        g_mean, g_zeros = self._mean_g(x, y)
        
        g_mean = T.concatenate((g_zeros.dimshuffle('x', 0), g_mean))
        
        g_sample = self.theano_rng.multinomial(n=1, pvals=g_mean.T,
                dtype=theano.config.floatX).T[1:]
        
        return g_sample
    
    def _sample_h(self, g, x):
        h_mean = self._mean_h(g, x)
        
        h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean,
                dtype=theano.config.floatX)
        
        return h_sample
    
    def _sample_x(self, h):
        x_mean = self._mean_x(h)
        
        x_sample = self.theano_rng.binomial(size=x_mean.shape, n=1, p=x_mean,
                dtype = theano.config.floatX)
        
        return x_sample
    
    def _sample_y(self, g):
        y_mean = self._mean_y(g)
        
        y_sample = self.theano_rng.multinomial(n=1, pvals=y_mean,
                dtype = theano.config.floatX)
        
        return y_sample
    
    def __train(self):
        nx_samples = self.x
        ng_samples = self._sample_g(self.x, self.y)
        for _ in range(self.K):
            nh_samples = self._sample_h(ng_samples, nx_samples)
        
            nx_samples = self._mean_x(nh_samples)
            
            ny_samples = self._sample_y(ng_samples)
            
            ng_samples = self._sample_g(nx_samples, ny_samples)
        
        cost = T.mean(self._free_energy(self.x, self.y)) \
            - T.mean(self._free_energy(nx_samples, ny_samples))
        
        gparams = T.grad(cost, self.params,
            consider_constant=[nx_samples, ny_samples])
        
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param - gparam * T.cast(self.learning_rate,
                dtype=theano.config.floatX)
        
        monitoring_cost = T.nnet.binary_crossentropy(self.y, ny_samples).mean()

        return monitoring_cost, updates
    
    def save(self, tag=None):
        if tag == None:
            tag = ""
        else:
            tag = "_%s" % tag
        
        numpy.save("rbm_W%s.npy" % tag, self.W.get_value(borrow=True))
        numpy.save("rbm_U%s.npy" % tag, self.U.get_value(borrow=True))
        numpy.save("rbm_b%s.npy" % tag, self.b.get_value(borrow=True))
        numpy.save("rbm_c%s.npy" % tag, self.c.get_value(borrow=True))
        numpy.save("rbm_d%s.npy" % tag, self.d.get_value(borrow=True))
class RBMReplSoftmax(RBM):
    def __init__(self, num_vis, num_hid, train_params, from_cache=True):
        self.input = T.matrix('input')

        self.numpy_rng = np.random.RandomState(1)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
        self.num_vis = num_vis
        self.num_hid = num_hid

        self.init_params()
        # initialize input layer for standalone RBM or layer0 of DBN

        self.epoch_ratio = theano.shared(np.zeros((1),
                                                  dtype=theano.config.floatX),
                                         borrow=True)
        self.need_train = True
        self.D = T.sum(self.input, axis=1)  #.dimshuffle(0,'x')
        self.params = [self.W, self.hbias, self.vbias]
        _, self.output = self.prop_up(self.input)

        self.hid_means = theano.shared(np.tile(
            np.asarray(train_params['sparse_target'],
                       dtype=theano.config.floatX), self.num_hid),
                                       borrow=True)

        if from_cache:
            self.restore_from_cache(train_params)
        self.watches = []
        self.watches_label = []

    def save_model(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis, self.num_hid, train_params['max_epoch'],
            train_params['sparse_target'])
        fileName = os.path.join(path, fileName)
        save_file = open(fileName,
                         'wb')  # this will overwrite current contents
        cPickle.dump(self.W.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.vbias.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.hbias.get_value(borrow=True), save_file, -1)
        save_file.close()

    def restore_from_cache(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis, self.num_hid, train_params['max_epoch'],
            train_params['sparse_target'])
        fileName = os.path.join(path, fileName)
        if os.path.isfile(fileName):
            fileName_p = open(fileName, 'r')
            self.W.set_value(cPickle.load(fileName_p), borrow=True)
            self.vbias.set_value(cPickle.load(fileName_p), borrow=True)
            self.hbias.set_value(cPickle.load(fileName_p), borrow=True)
            fileName_p.close()
            self.need_train = False
            print "Model file %s was found. rbm.need_train flag turned to False" % fileName
        else:
            print "Model file was not found. Need to call RBM.save_model()"

    def init_W(self):
        initial_W = np.asarray(
            0.001 * self.numpy_rng.randn(self.num_vis, self.num_hid),
            dtype=theano.config.floatX)
        self.W = theano.shared(value=initial_W, name='W', borrow=True)
        self.W_inc = theano.shared(value=np.zeros((self.num_vis, self.num_hid),
                                                  dtype=theano.config.floatX),
                                   name='W_inc',
                                   borrow=True)

    def init_hbias(self):
        self.hbias = theano.shared(value=np.zeros(self.num_hid,
                                                  dtype=theano.config.floatX),
                                   name='hbias',
                                   borrow=True)
        self.hbias_inc = theano.shared(value=np.zeros(
            self.num_hid, dtype=theano.config.floatX),
                                       name='hbias_inc',
                                       borrow=True)

    def init_vbias(self):
        self.vbias = theano.shared(value=np.zeros(self.num_vis,
                                                  dtype=theano.config.floatX),
                                   name='vbias',
                                   borrow=True)
        self.vbias_inc = theano.shared(value=np.zeros(
            self.num_vis, dtype=theano.config.floatX),
                                       name='vbias_inc',
                                       borrow=True)

    def init_params(self):
        self.init_W()
        self.init_vbias()
        self.init_hbias()

    def prop_up(self, vis, D=None):
        if D == None:
            D = self.D
        pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias)
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    def prop_down(self, hid):
        pre_softmax_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_softmax_activation, T.nnet.softmax(pre_softmax_activation)]

    def free_energy(self, v_sample):
        D = T.sum(v_sample, axis=1)
        wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias)
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    def sample_v_given_h(self, h_sample, D=None):
        if D == None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)

        v_mean = v_mean / T.sum(v_mean, axis=1).dimshuffle(0, 'x')
        v_samples, updates = theano.scan(fn=self.multinom_sampler,
                                         non_sequences=[v_mean, D],
                                         n_steps=1)
        self.updates = updates
        #v_sample = T.mean(v_samples, axis=0)
        v_sample = v_samples[-1]
        return [pre_softmax_v, v_mean, v_sample]

    def multinom_sampler(self, probs, D):
        v_sample = self.theano_rng.multinomial(n=D,
                                               pvals=probs,
                                               dtype=theano.config.floatX)
        return v_sample

    def sample_v_given_h_mf(self, h_sample, D=None):
        if D == None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)
        v_sample = D.dimshuffle(0, 'x') * v_mean
        return [pre_softmax_v, v_mean, v_sample]

    def sample_h_given_v(self, v_sample, D=None):
        if D == None:
            D = self.D
        pre_sigmoid_h, h_mean = self.prop_up(v_sample, D)
        h_sample = self.theano_rng.binomial(size=h_mean.shape,
                                            n=1,
                                            p=h_mean,
                                            dtype=theano.config.floatX)
        return [pre_sigmoid_h, h_mean, h_sample]

    def gibbs_hvh(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [
            pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean,
            h1_sample
        ]

    def gibbs_hvh_mf(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(
            h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [
            pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean,
            h1_sample
        ]

    def gibbs_vhv(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(
            v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(
            h1_sample, D)
        return [
            pre_sigmoid_h1, h1_mean, h1_sample, pre_softmax_v1, v1_mean,
            v1_sample
        ]

    def gibbs_vhv_mf(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(
            v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(
            h1_sample, D)
        return [
            pre_sigmoid_h1, h1_mean, h1_sample, pre_softmax_v1, v1_mean,
            v1_sample
        ]

    def add_watch(self, w, name):
        self.watches.append(w)
        self.watches_label.append(name)

    def clean_wacthes(self):
        self.watches = []
        self.watches_label = []

    def get_cost_updates(self, train_params):
        l_rate = T.cast(train_params['learning_rate'],
                        dtype=theano.config.floatX)
        weight_decay = T.cast(train_params['weight_decay'],
                              dtype=theano.config.floatX)
        momentum = T.cast(train_params['momentum'], dtype=theano.config.floatX)
        init_momentum = T.cast(train_params['init_momentum'],
                               dtype=theano.config.floatX)
        moment_start = train_params['moment_start']

        batch_size = T.cast(train_params['batch_size'],
                            dtype=theano.config.floatX)
        cd_steps = train_params['cd_steps']
        persistent = train_params['persistent']
        persistent_on = train_params['persistent_on']
        batch_size = T.cast(train_params['batch_size'],
                            dtype=theano.config.floatX)
        sparse_damping = T.cast(train_params['sparse_damping'],
                                dtype=theano.config.floatX)
        sparse_cost = T.cast(train_params['sparse_cost'],
                             dtype=theano.config.floatX)
        sparse_target = T.cast(train_params['sparse_target'],
                               dtype=theano.config.floatX)

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)
        self.add_watch(self.input, "vis_s")
        self.add_watch(ph_mean, "hid_m")

        if persistent_on:
            if T.eq(T.sum(T.sum(persistent, axis=1)), 0):
                chain_start = ph_sample
            else:
                chain_start = persistent
        else:
            chain_start = ph_sample

        if train_params['mean_field']:
            gibbs_hvh_fun = self.gibbs_hvh_mf
        else:
            gibbs_hvh_fun = self.gibbs_hvh

        [pre_softmax_nvs, nv_means, nv_samples,
         pre_sigmoid_nhs, nh_means, nh_samples], updates = \
            theano.scan(gibbs_hvh_fun,
                    outputs_info=[None,  None,  None, None, None, chain_start],
                    n_steps=cd_steps)

        vis_samp_fant = nv_samples[-1]
        hid_probs_fant = nh_means[-1]

        self.add_watch(vis_samp_fant, "neg_vis_s")
        self.add_watch(hid_probs_fant, "neg_hid_m")

        cur_momentum = T.switch(T.lt(self.epoch_ratio[0], moment_start),
                                init_momentum, momentum)
        # sparsity stuff
        hid_means = sparse_damping * self.hid_means + (
            1 - sparse_damping) * T.sum(ph_mean, axis=0) / batch_size
        sparse_grads = sparse_cost * (T.tile(hid_means.dimshuffle('x', 0),
                                             (train_params['batch_size'], 1)) -
                                      sparse_target)

        self.add_watch(hid_means, "hid_means")
        self.add_watch(sparse_grads, "sparse_grads")
        # updates
        W_inc = (T.dot(self.input.T, ph_mean) -
                 T.dot(vis_samp_fant.T, hid_probs_fant) -
                 T.dot(self.input.T,
                       sparse_grads)) / batch_size - self.W * weight_decay
        hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant, axis=0) -
                     T.sum(sparse_grads, axis=0)) / batch_size
        #        W_inc = ( T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant) )/batch_size - self.W * weight_decay
        #        hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant,axis=0) )/batch_size

        vbias_inc = (T.sum(self.input, axis=0) -
                     T.sum(vis_samp_fant, axis=0)) / batch_size

        W_inc_rate = (self.W_inc * cur_momentum + W_inc) * l_rate
        hbias_inc_rate = (self.hbias_inc * cur_momentum + hbias_inc) * l_rate
        vbias_inc_rate = (self.vbias_inc * cur_momentum + vbias_inc) * l_rate

        updates[self.W] = self.W + W_inc_rate
        updates[self.hbias] = self.hbias + hbias_inc_rate
        updates[self.vbias] = self.vbias + vbias_inc_rate
        updates[self.W_inc] = W_inc
        updates[self.hbias_inc] = hbias_inc
        updates[self.vbias_inc] = vbias_inc
        updates[self.hid_means] = hid_means

        self.add_watch(T.as_tensor_variable(self.W), "W")
        #        self.add_watch(T.as_tensor_variable(self.hbias), "hbias")
        #        self.add_watch(T.as_tensor_variable(self.vbias), "vbias")
        self.add_watch(W_inc_rate, "W_inc")
        #        self.add_watch(hbias_inc_rate, "hbias_inc")
        #        self.add_watch(vbias_inc_rate, "vbias_inc")

        current_free_energy = T.mean(self.free_energy(self.input))
        self.add_watch(T.mean(self.free_energy(self.input)), 'free_en')

        if persistent_on:
            updates[persistent] = nh_samples[-1]
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)

        self.add_watch(monitoring_cost, "cost")
        return monitoring_cost, current_free_energy, T.mean(
            W_inc_rate), updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        xi = T.round(self.input)

        fe_xi = self.free_energy(xi)
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        fe_xi_flip = self.free_energy(xi_flip)

        cost = T.mean(self.num_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        updates[bit_i_idx] = (bit_i_idx + 1) % self.num_vis

        return cost

    def get_reconstruction_cost(self, vis_sample, vis_source=None, D=None):
        if vis_source is None:
            return T.sum(
                (T.sum(T.sqr(self.input - vis_sample), axis=1)) / self.D)
        return T.sum((T.sum(T.sqr(vis_source - vis_sample), axis=1)) / D)
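A minimal numpy sketch of the bit-flip pseudo-likelihood proxy computed by get_pseudo_likelihood_cost above, for readers skimming the Theano graph; free_energy below is an invented stand-in callable, not the model's actual method.

import numpy as np

def pseudo_likelihood_proxy(x, free_energy, bit_i_idx, num_vis):
    # log P(x_i | x_-i) = log sigmoid(FE(x with bit i flipped) - FE(x)),
    # estimated on a single bit and scaled by num_vis to cover all visibles.
    xi = np.round(x)
    xi_flip = xi.copy()
    xi_flip[:, bit_i_idx] = 1 - xi_flip[:, bit_i_idx]
    log_p = -np.log1p(np.exp(-(free_energy(xi_flip) - free_energy(xi))))
    return num_vis * log_p.mean()

rng = np.random.RandomState(0)
toy_fe = lambda v: -v.sum(axis=1)          # toy free energy, illustration only
print(pseudo_likelihood_proxy(rng.rand(5, 8), toy_fe, bit_i_idx=3, num_vis=8))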
										outputs_info=[None],
										non_sequences=[init_multi_samp,Tweights,nsteps],
										n_steps=ntest)
		
		sample_metropolis=theano.function([Tweights, nsteps],Tm_samps,
											allow_input_downcast=True)
		
		
		##setting up Theano sampling =======================================
		
		nummat=np.repeat(np.reshape(np.arange(npcl),(npcl,1)),npcl,axis=1)
		idx_mat=theano.shared(nummat.T)
		
		Tprobs=T.fvector()
		
		t_samp=rng.multinomial(size=Tprobs.shape,pvals=Tprobs)
		idxs=T.cast(T.sum(t_samp*idx_mat,axis=1),'int64')
		
		sample_theano=theano.function([Tprobs],idxs,allow_input_downcast=True)
		
		
		
		## Speed test
		
		weights=np.random.rand(npcl)
		probs=weights/np.sum(weights)
		
		
		
		m_samps=np.zeros((ntest,npcl))
		t_samps=np.zeros((ntest,npcl))
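The one-hot-to-index trick in the fragment above (multiplying the multinomial draws by an index matrix and summing the rows) can be checked in plain numpy; npcl, probs and the index matrix below are invented to mirror the fragment's variables.

import numpy as np

npcl = 5
rng = np.random.RandomState(0)
weights = rng.rand(npcl)
probs = weights / weights.sum()

one_hot = rng.multinomial(1, probs, size=npcl)           # one one-hot draw per row
idx_mat = np.tile(np.arange(npcl), (npcl, 1))            # every row is 0..npcl-1
idxs = (one_hot * idx_mat).sum(axis=1).astype('int64')   # recovered particle indices
print(idxs)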
Exemple #22
0
class SRNN(Model):
    def __init__(
            self,
            name,  # a string for identifying model.
            numvis,
            numhid,
            numframes,
            output_type='real',
            cheating_level=.0,  # cheating by looking at x_t (instead of x_tm1)
            numpy_rng=None,
            theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numhid = numhid
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(np.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name='inputs')

        # set up params
        self.whh = theano.shared(value=np.eye(self.numhid).astype(
            theano.config.floatX),
                                 name='whh')
        self.whx = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01,
            size=(self.numhid, self.numvis)).astype(theano.config.floatX),
                                 name='whx')
        self.wxh = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01,
            size=(self.numvis, self.numhid)).astype(theano.config.floatX),
                                 name='wxh')
        self.bx = theano.shared(
            value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.params = [self.whh, self.whx, self.wxh, self.bx]

        self._batchsize = self.inputs.shape[0]

        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis,
             self.numvis)).transpose(1, 0, 2)

        # one-step prediction, used by sampling function
        self.hids_0 = T.zeros((self._batchsize, self.numhid))
        self.hids_1 = T.dot(self.hids_0, self.whh) + T.dot(
            self._input_frames[0], self.wxh)
        self.hids_1 = self.hids_1 * (self.hids_1 > self.selectionthreshold)
        self.x_pred_1 = T.dot(self.hids_1, self.whx) + self.bx

        def step(
                x_gt_t,  # cheating by looking at the current time step input.
                x_tm1,
                hids_tm1):
            pre_hids_t = T.dot(hids_tm1, self.whh) + T.dot(
                self.cheating_level * x_gt_t +
                (1. - self.cheating_level) * x_tm1, self.wxh)
            hids_t = pre_hids_t * (pre_hids_t > self.selectionthreshold)
            x_pred_t = T.dot(hids_t, self.whx) + self.bx
            return x_pred_t, hids_t

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames,
            outputs_info=[self._input_frames[0], self.hids_0])

        # set up output prediction
        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1],
                     self.numvis))).reshape(
                         (self._predictions.shape[0],
                          self._predictions.shape[1], self.numvis))
        else:
            raise ValueError('unsupported output_type')

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes - 1]
        if self.output_type == 'real':
            self._cost = T.mean((self._prediction_for_training -
                                 self._input_frames[1:self.numframes])**2)
            self._cost_varlen = T.mean(
                (self._prediction - self._input_frames[1:])**2)
        elif self.output_type == 'binary':
            self._cost = -T.mean(self._input_frames[1:self.numframes] *
                                 T.log(self._prediction_for_training) +
                                 (1.0 - self._input_frames[1:self.numframes]) *
                                 T.log(1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction) +
                (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(
                T.log(self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(
                T.log(self._prediction) * self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano function for computing cost and grad
        self.cost = theano.function([self.inputs],
                                    self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs],
                                     self._grads,
                                     updates=self.updates)

        # another set of variables
        # give some time steps of characters and free the model to predict for all the rest.
        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))),
            axis=1)

        self.predict = theano.function(
            [
                self.inputs_var,
                theano.Param(self.nsteps, default=self.numframes - 4)
            ],
            self._prediction.transpose(1, 0, 2).reshape(
                (self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()

        return np.concatenate(
            [get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0], [
                self.theano_rng.multinomial(
                    pvals=T.nnet.softmax(self.x_pred_1 / temperature)),
                self.hids_1
            ])
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=np.ones(self.numvis) / np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(
                preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
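A small numpy sketch of the reshape trick the softmax branch above relies on (softmax over a 3d tensor by merging the time and batch axes, then restoring the layout); the shapes are invented.

import numpy as np

T_, B, N = 7, 4, 27                     # time steps, batch size, numvis
preds = np.random.RandomState(0).randn(T_, B, N)

flat = preds.reshape(T_ * B, N)                         # merge time and batch
flat = np.exp(flat - flat.max(axis=1, keepdims=True))   # numerically stable softmax
flat /= flat.sum(axis=1, keepdims=True)
probs = flat.reshape(T_, B, N)                          # back to (time, batch, numvis)
assert np.allclose(probs.sum(axis=2), 1.0)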
Exemple #23
0
class CharacterRNN(ParameterModel):

    def __init__(self, name, n_input, n_output, n_hidden=10, n_layers=2):
        super(CharacterRNN, self).__init__(name)
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.n_input = n_input
        self.n_output = n_output

        self.rng = RandomStreams(seed=1337)
        self.lstm = LSTM('%s-charrnn' % name, self.n_input,
                         n_hidden=self.n_hidden,
                         n_layers=self.n_layers,
                         rng=self.rng)
        self.output = Softmax('%s-softmax' % name, n_hidden, self.n_output)

    def save_parameters(self, location):
        state = {
            'n_hidden': self.n_hidden,
            'n_layers': self.n_layers,
            'lstm': self.lstm.state(),
            'output': self.output.state()
        }
        with open(location, 'wb') as fp:
            pickle.dump(state, fp)

    def load_parameters(self, location):
        with open(location, 'rb') as fp:
            state = pickle.load(fp)

        self.n_hidden = state['n_hidden']
        self.n_layers = state['n_layers']
        self.lstm.load(state['lstm'])
        self.output.load(state['output'])

    @theanify(T.tensor3('X'), T.tensor3('state'), T.tensor3('y'))
    def cost(self, X, state, y):
        (_, state, ypred), updates = self.forward(X, state)
        S, N, V = y.shape
        y = y.reshape((S * N, V))
        ypred = ypred.reshape((S * N, V))
        return (T.nnet.categorical_crossentropy(ypred, y).mean(), state), updates

    def forward(self, X, state):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.output.n_output

        def step(input, previous_hidden, previous_state, previous_output):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return lstm_hidden, state, final_output

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (encoder_output, encoder_state, softmax_output), updates = theano.scan(step,
                              sequences=[X],
                              outputs_info=[
                                            hidden,
                                            state,
                                            T.alloc(np.asarray(0).astype(theano.config.floatX),
                                                    N,
                                                    O),
                                           ],
                              n_steps=S)
        return (encoder_output, encoder_state, softmax_output), updates

    @theanify(T.fvector('start_token'), T.iscalar('length'), T.fscalar('temperature'), returns_updates=True)
    def generate(self, start_token, length, temperature):
        start_token = start_token[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(step,
                              outputs_info=[
                                            start_token,
                                            hidden,
                                            state,
                                           ],
                              non_sequences=[temperature],
                              n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'), T.fscalar('temperature'), returns_updates=True)
    def generate_with_concat(self, start_token, concat, length, temperature):
        start_token = start_token[:, np.newaxis].T
        concat = concat[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature, concat):
            lstm_hidden, state = self.lstm.forward(T.concatenate([input, concat], axis=1),
                                                   previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(step,
                              outputs_info=[
                                            start_token,
                                            hidden,
                                            state,
                                           ],
                              non_sequences=[temperature, concat],
                              n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'), T.iscalar('num_examples'), T.fscalar('temperature'), returns_updates=True)
    def generate_examples(self, start_token, concat, length, num_examples, temperature):
        start_token = T.tile(start_token[:, np.newaxis].T, (num_examples, 1))
        concat = T.tile(concat[:, np.newaxis].T, (num_examples, 1))
        N = num_examples
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature, concat):
            lstm_hidden, state = self.lstm.forward(T.concatenate([input, concat], axis=1),
                                                   previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(num_examples,), pvals=final_output, dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(step,
                              outputs_info=[
                                            start_token,
                                            hidden,
                                            state,
                                           ],
                              non_sequences=[temperature, concat],
                              n_steps=length)
        return softmax_output[:, :, :], updates


    @theanify(T.tensor3('X'), returns_updates=True)
    def log_probability(self, X):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.n_output

        def step(input, log_prob, previous_hidden, previous_state):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return final_output, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        start_log = T.alloc(np.array(0).astype(theano.config.floatX), N, O)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H))

        (log_prob, _, _), updates = theano.scan(step,
                              sequences=[X],
                              outputs_info=[
                                            start_log,
                                            hidden,
                                            state,
                                           ],
                              n_steps=S)
        return log_prob, updates


    def get_parameters(self):
        return self.lstm.get_parameters() + self.output.get_parameters()
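The generate methods above unroll this kind of feedback loop with theano.scan; a rough numpy sketch, with a random weight matrix standing in for the LSTM plus softmax stack, shows how a temperature-scaled multinomial draw becomes the next step's input.

import numpy as np

def generate_numpy(start_token, length, temperature, rng):
    vocab = len(start_token)
    W = rng.randn(vocab, vocab) * 0.1      # stand-in for the recurrent model
    x, out = start_token, []
    for _ in range(length):
        logits = x.dot(W) / temperature    # higher temperature -> flatter distribution
        p = np.exp(logits - logits.max())
        p /= p.sum()
        x = rng.multinomial(1, p).astype('float32')   # one-hot sample, fed back in
        out.append(x)
    return np.stack(out)

rng = np.random.RandomState(1337)
seq = generate_numpy(np.eye(27, dtype='float32')[0], length=10, temperature=0.8, rng=rng)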
Exemple #24
0
class SetRBM(object):
    """
    The Restricted Boltzmann Machine learning algorithm.
    """
    def __init__(self,
                 n_visibles,
                 n_hiddens,
                 n_classes,
                 W=None,
                 U=None,
                 b=None,
                 c=None,
                 d=None,
                 learning_rate=0.1,
                 K=1):
        self.n_visibles = n_visibles
        self.n_hiddens = n_hiddens
        self.n_classes = n_classes

        self.x = T.matrix('x')
        self.y = T.vector('y')

        if W is None:
            W_value = numpy.asarray(numpy.random.normal(loc=0,
                                                        scale=0.01,
                                                        size=(n_visibles,
                                                              n_hiddens)),
                                    dtype=theano.config.floatX)
            W = theano.shared(value=W_value, name='W')

        if U is None:
            U_value = numpy.asarray(numpy.random.normal(loc=0,
                                                        scale=0.01,
                                                        size=(n_classes,
                                                              n_hiddens)),
                                    dtype=theano.config.floatX)
            U = theano.shared(value=U_value, name='U')

        if b is None:
            b = theano.shared(value=numpy.zeros(n_hiddens,
                                                dtype=theano.config.floatX),
                              name='b')

        if c is None:
            c = theano.shared(value=numpy.zeros(n_visibles,
                                                dtype=theano.config.floatX),
                              name='c')

        if d is None:
            d = theano.shared(value=numpy.zeros(n_classes,
                                                dtype=theano.config.floatX),
                              name='d')

        self.W = W
        self.U = U
        self.b = b
        self.c = c
        self.d = d
        self.params = [self.W, self.U, self.b, self.c, self.d]
        self.theano_rng = RandomStreams(numpy.random.randint(2**30))

        self.learning_rate = theano.shared(
            numpy.asarray(learning_rate, dtype=theano.config.floatX))
        self.K = K

        cost, updates = self.__train()
        self.train = theano.function([self.x, self.y], cost, updates=updates)
        self.trainables = list(updates)  # shared variables that receive updates

        # TODO need way to compute to marginalize g from y
        #self.transform = theano.function([self.x], self._mean_g(self.x).sum(0))
        self.output = theano.function([self.x], self._output(self.x))

    def _free_energy(self, x, y):
        bias_term = T.dot(y, self.d) + T.dot(x, self.c)
        softmax_x = T.log(
            T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0))
        hidden_term = T.nnet.softplus(T.dot(y, self.U) + softmax_x).sum()

        return -bias_term - hidden_term

    def _output(self, x):
        softmax_x = T.log(
            T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0))
        output = -T.nnet.softplus(self.U + softmax_x).sum(1)

        return T.argmax(T.nnet.softmax(output))

    def _softminus(self, x):
        return x - T.nnet.softplus(x)

    def _act(self, x, y):
        return self._softminus(self.b + T.dot(x, self.W)) + T.dot(y, self.U)

    def _mean_g(self, x, y):
        act = self._act(x, y)

        return T.exp(act) / (1. + T.exp(act).sum(0)), 1. / (1. +
                                                            T.exp(act).sum(0))

    def _mean_h(self, g, x):
        return T.maximum(g, T.nnet.sigmoid(T.dot(x, self.W) + self.b))

    def _mean_x(self, h):
        return T.dot(h, self.W.T) + self.c

    def _mean_y(self, g):
        return T.nnet.softmax(T.dot(g, self.U.T).sum(0) + self.d)

    def _sample_g(self, x, y):
        g_mean, g_zeros = self._mean_g(x, y)

        g_mean = T.concatenate((g_zeros.dimshuffle('x', 0), g_mean))

        g_sample = self.theano_rng.multinomial(
            n=1, pvals=g_mean.T, dtype=theano.config.floatX).T[1:]

        return g_sample

    def _sample_h(self, g, x):
        h_mean = self._mean_h(g, x)

        h_sample = self.theano_rng.binomial(size=h_mean.shape,
                                            n=1,
                                            p=h_mean,
                                            dtype=theano.config.floatX)

        return h_sample

    def _sample_x(self, h):
        x_mean = self._mean_x(h)

        x_sample = self.theano_rng.binomial(size=x_mean.shape,
                                            n=1,
                                            p=x_mean,
                                            dtype=theano.config.floatX)

        return x_sample

    def _sample_y(self, g):
        y_mean = self._mean_y(g)

        y_sample = self.theano_rng.multinomial(n=1,
                                               pvals=y_mean,
                                               dtype=theano.config.floatX)

        return y_sample

    def __train(self):
        nx_samples = self.x
        ng_samples = self._sample_g(self.x, self.y)
        for _ in range(self.K):
            nh_samples = self._sample_h(ng_samples, nx_samples)

            nx_samples = self._mean_x(nh_samples)

            ny_samples = self._sample_y(ng_samples)

            ng_samples = self._sample_g(nx_samples, ny_samples)

        cost = T.mean(self._free_energy(self.x, self.y)) \
            - T.mean(self._free_energy(nx_samples, ny_samples))

        gparams = T.grad(cost,
                         self.params,
                         consider_constant=[nx_samples, ny_samples])

        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param - gparam * T.cast(
                self.learning_rate, dtype=theano.config.floatX)

        monitoring_cost = T.nnet.binary_crossentropy(self.y, ny_samples).mean()

        return monitoring_cost, updates

    def save(self, tag=None):
        if tag is None:
            tag = ""
        else:
            tag = "_%s" % tag

        numpy.save("rbm_W%s.npy" % tag, self.W.get_value(borrow=True))
        numpy.save("rbm_U%s.npy" % tag, self.U.get_value(borrow=True))
        numpy.save("rbm_b%s.npy" % tag, self.b.get_value(borrow=True))
        numpy.save("rbm_c%s.npy" % tag, self.c.get_value(borrow=True))
        numpy.save("rbm_d%s.npy" % tag, self.d.get_value(borrow=True))
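A 1-D numpy sketch of the idea behind _mean_g and _sample_g above: the hidden group is treated as a categorical with one extra "no unit active" outcome, sampled with a single multinomial draw and then stripped of that extra slot; the batched Theano code does the same thing per example.

import numpy as np

rng = np.random.RandomState(0)
act = rng.randn(8)                                  # stand-in for _act(x, y), one example
p_on = np.exp(act) / (1.0 + np.exp(act).sum())      # P(unit k is the active one)
p_none = 1.0 / (1.0 + np.exp(act).sum())            # P(no unit active)
g_sample = rng.multinomial(1, np.concatenate(([p_none], p_on)))[1:]
assert g_sample.sum() in (0, 1)                     # one-hot or all zeros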
Exemple #25
0
class RBMReplSoftmax(RBM):
    def __init__(self, num_vis, num_hid, train_params, from_cache=True):
        self.input = T.matrix("input")

        self.numpy_rng = np.random.RandomState(1)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30))
        self.num_vis = num_vis
        self.num_hid = num_hid

        self.init_params()
        # initialize input layer for standalone RBM or layer0 of DBN

        self.epoch_ratio = theano.shared(np.zeros((1), dtype=theano.config.floatX), borrow=True)
        self.need_train = True
        self.D = T.sum(self.input, axis=1)  # .dimshuffle(0,'x')
        self.params = [self.W, self.hbias, self.vbias]
        _, self.output = self.prop_up(self.input)

        self.hid_means = theano.shared(
            np.tile(np.asarray(train_params["sparse_target"], dtype=theano.config.floatX), self.num_hid), borrow=True
        )

        if from_cache:
            self.restore_from_cache(train_params)
        self.watches = []
        self.watches_label = []

    def save_model(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis,
            self.num_hid,
            train_params["max_epoch"],
            train_params["sparse_target"],
        )
        fileName = os.path.join(path, fileName)
        save_file = open(fileName, "wb")  # this will overwrite current contents
        cPickle.dump(self.W.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.vbias.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.hbias.get_value(borrow=True), save_file, -1)
        save_file.close()

    def restore_from_cache(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis,
            self.num_hid,
            train_params["max_epoch"],
            train_params["sparse_target"],
        )
        fileName = os.path.join(path, fileName)
        if os.path.isfile(fileName):
            fileName_p = open(fileName, "r")
            self.W.set_value(cPickle.load(fileName_p), borrow=True)
            self.vbias.set_value(cPickle.load(fileName_p), borrow=True)
            self.hbias.set_value(cPickle.load(fileName_p), borrow=True)
            fileName_p.close()
            self.need_train = False
            print "Model file %s was found. rbm.need_train flag turned to False" % fileName
        else:
            print "Model file was not found. Need to call RBM.save_model()"

    def init_W(self):
        initial_W = np.asarray(0.001 * self.numpy_rng.randn(self.num_vis, self.num_hid), dtype=theano.config.floatX)
        self.W = theano.shared(value=initial_W, name="W", borrow=True)
        self.W_inc = theano.shared(
            value=np.zeros((self.num_vis, self.num_hid), dtype=theano.config.floatX), name="W_inc", borrow=True
        )

    def init_hbias(self):
        self.hbias = theano.shared(value=np.zeros(self.num_hid, dtype=theano.config.floatX), name="hbias", borrow=True)
        self.hbias_inc = theano.shared(
            value=np.zeros(self.num_hid, dtype=theano.config.floatX), name="hbias_inc", borrow=True
        )

    def init_vbias(self):
        self.vbias = theano.shared(value=np.zeros(self.num_vis, dtype=theano.config.floatX), name="vbias", borrow=True)
        self.vbias_inc = theano.shared(
            value=np.zeros(self.num_vis, dtype=theano.config.floatX), name="vbias_inc", borrow=True
        )

    def init_params(self):
        self.init_W()
        self.init_vbias()
        self.init_hbias()

    def prop_up(self, vis, D=None):
        if D is None:
            D = self.D
        pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias)
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    def prop_down(self, hid):
        pre_softmax_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_softmax_activation, T.nnet.softmax(pre_softmax_activation)]

    def free_energy(self, v_sample):
        D = T.sum(v_sample, axis=1)
        wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias)
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    def sample_v_given_h(self, h_sample, D=None):
        if D is None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)

        v_mean = v_mean / T.sum(v_mean, axis=1).dimshuffle(0, "x")
        v_samples, updates = theano.scan(fn=self.multinom_sampler, non_sequences=[v_mean, D], n_steps=1)
        self.updates = updates
        # v_sample = T.mean(v_samples, axis=0)
        v_sample = v_samples[-1]
        return [pre_softmax_v, v_mean, v_sample]

    def multinom_sampler(self, probs, D):
        v_sample = self.theano_rng.multinomial(n=D, pvals=probs, dtype=theano.config.floatX)
        return v_sample

    def sample_v_given_h_mf(self, h_sample, D=None):
        if D is None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)
        v_sample = D.dimshuffle(0, "x") * v_mean
        return [pre_softmax_v, v_mean, v_sample]

    def sample_h_given_v(self, v_sample, D=None):
        if D is None:
            D = self.D
        pre_sigmoid_h, h_mean = self.prop_up(v_sample, D)
        h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean, dtype=theano.config.floatX)
        return [pre_sigmoid_h, h_mean, h_sample]

    def gibbs_hvh(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample]

    def gibbs_hvh_mf(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample, D)
        return [pre_sigmoid_h1, h1_mean, h1_sample, pre_softmax_v1, v1_mean, v1_sample]

    def gibbs_vhv_mf(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h1_sample, D)
        return [pre_sigmoid_h1, h1_mean, h1_sample, pre_softmax_v1, v1_mean, v1_sample]

    def add_watch(self, w, name):
        self.watches.append(w)
        self.watches_label.append(name)

    def clean_wacthes(self):
        self.watches = []
        self.watches_label = []

    def get_cost_updates(self, train_params):
        l_rate = T.cast(train_params["learning_rate"], dtype=theano.config.floatX)
        weight_decay = T.cast(train_params["weight_decay"], dtype=theano.config.floatX)
        momentum = T.cast(train_params["momentum"], dtype=theano.config.floatX)
        init_momentum = T.cast(train_params["init_momentum"], dtype=theano.config.floatX)
        moment_start = train_params["moment_start"]

        batch_size = T.cast(train_params["batch_size"], dtype=theano.config.floatX)
        cd_steps = train_params["cd_steps"]
        persistent = train_params["persistent"]
        persistent_on = train_params["persistent_on"]
        batch_size = T.cast(train_params["batch_size"], dtype=theano.config.floatX)
        sparse_damping = T.cast(train_params["sparse_damping"], dtype=theano.config.floatX)
        sparse_cost = T.cast(train_params["sparse_cost"], dtype=theano.config.floatX)
        sparse_target = T.cast(train_params["sparse_target"], dtype=theano.config.floatX)

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)
        self.add_watch(self.input, "vis_s")
        self.add_watch(ph_mean, "hid_m")

        if persistent_on:
            if T.eq(T.sum(T.sum(persistent, axis=1)), 0):
                chain_start = ph_sample
            else:
                chain_start = persistent
        else:
            chain_start = ph_sample

        if train_params["mean_field"]:
            gibbs_hvh_fun = self.gibbs_hvh_mf
        else:
            gibbs_hvh_fun = self.gibbs_hvh

        [pre_softmax_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means, nh_samples], updates = theano.scan(
            gibbs_hvh_fun, outputs_info=[None, None, None, None, None, chain_start], n_steps=cd_steps
        )

        vis_samp_fant = nv_samples[-1]
        hid_probs_fant = nh_means[-1]

        self.add_watch(vis_samp_fant, "neg_vis_s")
        self.add_watch(hid_probs_fant, "neg_hid_m")

        cur_momentum = T.switch(T.lt(self.epoch_ratio[0], moment_start), init_momentum, momentum)
        # sparsity stuff
        hid_means = sparse_damping * self.hid_means + (1 - sparse_damping) * T.sum(ph_mean, axis=0) / batch_size
        sparse_grads = sparse_cost * (
            T.tile(hid_means.dimshuffle("x", 0), (train_params["batch_size"], 1)) - sparse_target
        )

        self.add_watch(hid_means, "hid_means")
        self.add_watch(sparse_grads, "sparse_grads")
        # updates
        W_inc = (
            T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant) - T.dot(self.input.T, sparse_grads)
        ) / batch_size - self.W * weight_decay
        hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant, axis=0) - T.sum(sparse_grads, axis=0)) / batch_size
        #        W_inc = ( T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant) )/batch_size - self.W * weight_decay
        #        hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant,axis=0) )/batch_size

        vbias_inc = (T.sum(self.input, axis=0) - T.sum(vis_samp_fant, axis=0)) / batch_size

        W_inc_rate = (self.W_inc * cur_momentum + W_inc) * l_rate
        hbias_inc_rate = (self.hbias_inc * cur_momentum + hbias_inc) * l_rate
        vbias_inc_rate = (self.vbias_inc * cur_momentum + vbias_inc) * l_rate

        updates[self.W] = self.W + W_inc_rate
        updates[self.hbias] = self.hbias + hbias_inc_rate
        updates[self.vbias] = self.vbias + vbias_inc_rate
        updates[self.W_inc] = W_inc
        updates[self.hbias_inc] = hbias_inc
        updates[self.vbias_inc] = vbias_inc
        updates[self.hid_means] = hid_means

        self.add_watch(T.as_tensor_variable(self.W), "W")
        #        self.add_watch(T.as_tensor_variable(self.hbias), "hbias")
        #        self.add_watch(T.as_tensor_variable(self.vbias), "vbias")
        self.add_watch(W_inc_rate, "W_inc")
        #        self.add_watch(hbias_inc_rate, "hbias_inc")
        #        self.add_watch(vbias_inc_rate, "vbias_inc")

        current_free_energy = T.mean(self.free_energy(self.input))
        self.add_watch(T.mean(self.free_energy(self.input)), "free_en")

        if persistent_on:
            updates[persistent] = nh_samples[-1]
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)

        self.add_watch(monitoring_cost, "cost")
        return monitoring_cost, current_free_energy, T.mean(W_inc_rate), updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        bit_i_idx = theano.shared(value=0, name="bit_i_idx")

        xi = T.round(self.input)

        fe_xi = self.free_energy(xi)
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        fe_xi_flip = self.free_energy(xi_flip)

        cost = T.mean(self.num_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        updates[bit_i_idx] = (bit_i_idx + 1) % self.num_vis

        return cost

    def get_reconstruction_cost(self, vis_sample, vis_source=None, D=None):
        if vis_source is None:
            return T.sum((T.sum(T.sqr(self.input - vis_sample), axis=1)) / self.D)
        return T.sum((T.sum(T.sqr(vis_source - vis_sample), axis=1)) / D)
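For the replicated-softmax visibles above, multinom_sampler redraws each document's word counts from the softmax over the vocabulary with n equal to that document's length D; a numpy sketch of the same per-row multinomial, with invented shapes:

import numpy as np

rng = np.random.RandomState(0)
probs = rng.dirichlet(np.ones(10), size=3)     # (batch, vocab), each row sums to 1
D = np.array([5, 12, 8])                       # words per document

v_sample = np.stack([rng.multinomial(int(d), p) for d, p in zip(D, probs)])
assert (v_sample.sum(axis=1) == D).all()       # each row keeps its document length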
Exemple #26
0
class SRNN(Model):

    def __init__(self, name, numvis, numhid, numlayers, numframes, output_type='real', dropout=0.0, numpy_rng=None, theano_rng=None):
        super(SRNN, self).__init__(name=name)

        self.numvis = numvis            # frame length * alphabet size (1 * 27)
        self.numhid = numhid            # 512
        self.numlayers = numlayers      # 3
        self.numframes = numframes      # maxnumframes (100)
        self.output_type = output_type  # softmax
        self.dropout = dropout          # 0.5

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        self.inputs = T.matrix(name='inputs')

        self.whh = [theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name='whh'+str(k)) for k in range(self.numlayers)]
        self.whx = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numvis)).astype(theano.config.floatX), name='whx'+str(k)) for k in range(self.numlayers)]
        self.wxh = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numhid)).astype(theano.config.floatX), name='wxh'+str(0))]
        self.wxh = self.wxh + [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numhid)).astype(theano.config.floatX), name='wxh'+str(k)) for k in range(self.numlayers-1)]
        self.bx = theano.shared(value=0.0 * np.ones( self.numvis, dtype=theano.config.floatX), name='bx')
        self.bhid = [theano.shared(value=0.0 * np.ones( self.numhid, dtype=theano.config.floatX), name='bhid'+str(k)) for k in range(self.numlayers)]
        self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx]

        self._batchsize = self.inputs.shape[0]
        self._input_frames = self.inputs.reshape(( self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis)).transpose(1, 0, 2)

        #1-step prediction --- 
        self.hids_0 = T.zeros((self._batchsize, self.numhid*self.numlayers)) 
        self.hids_1 = [T.dot(self.hids_0[:,:self.numhid], self.whh[0]) + self.bhid[0] + T.dot(self._input_frames[0], self.wxh[0])]
        self.hids_1[0] *= (self.hids_1[0] > 0)
        for k in range(1, self.numlayers):
            self.hids_1.append(T.dot(self.hids_0[:,k*self.numhid:(k+1)*self.numhid], self.whh[k]) + self.bhid[k] + T.dot(self.hids_1[k-1], self.wxh[k]))
            self.hids_1[-1] *= (self.hids_1[-1] > 0)

        self.x_pred_1 = self.bx 
        for k in range(self.numlayers):
            self.x_pred_1 += T.dot(self.hids_1[k], self.whx[k]) 
        self.hids_1 = T.concatenate(self.hids_1, 1)
        #--- 1-step prediction 

        def step_dropout(x_gt_t, dropoutmask, x_tm1, hids_tm1):
            hids_tm1 = [hids_tm1[:,k*self.numhid:(k+1)*self.numhid] for k in range(self.numlayers)]
            pre_hids_t = [T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0])]
            hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)]
            for k in range(1, self.numlayers):
                pre_hids_t.append(T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(dropoutmask*hids_t[k-1], (1.0/self.dropout)*self.wxh[k]))
                hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0))
            x_pred_t = self.bx
            for k in range(self.numlayers):
                x_pred_t += T.dot(hids_t[k], self.whx[k]) 
            return x_pred_t, T.concatenate(hids_t, 1)

        def step_nodropout(x_gt_t, x_tm1, hids_tm1):
            hids_tm1 = [hids_tm1[:,k*self.numhid:(k+1)*self.numhid] for k in range(self.numlayers)]
            pre_hids_t = [T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0])]
            hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)]
            for k in range(1, self.numlayers):
                pre_hids_t.append(T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(hids_t[k-1], self.wxh[k]))
                hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0))
            x_pred_t = self.bx
            for k in range(self.numlayers):
                x_pred_t += T.dot(hids_t[k], self.whx[k]) 
            return x_pred_t, T.concatenate(hids_t, 1)

        if self.dropout == 0.0:
            (self._predictions, self.hids), self.updates = theano.scan(
                                                        fn=step_nodropout,
                                                        sequences=self._input_frames,
                                                        outputs_info=[self._input_frames[0], self.hids_0])
        else:
            self._dropoutmask = theano_rng.binomial(
                size=(self.inputs.shape[1] // self.numvis,
                      self._batchsize, self.numhid),
                n=1, p=self.dropout, dtype=theano.config.floatX
            )
            (self._predictions, self.hids), self.updates = theano.scan(
                                                        fn=step_dropout,
                                                        sequences=[self._input_frames, self._dropoutmask],
                                                        outputs_info=[self._input_frames[0], self.hids_0])

        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]  # dims: [time step, batch idx, numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape((
                    self._predictions.shape[0] * self._predictions.shape[1],
                    self.numvis
                ))
            ).reshape((
                self._predictions.shape[0],
                self._predictions.shape[1],
                self.numvis
            ))
        else:
            raise ValueError('unsupported output_type')

        self._prediction_for_training = self._prediction[:self.numframes-1]

        if self.output_type == 'real':
            self._cost = T.mean(( self._prediction_for_training - self._input_frames[1:self.numframes])**2)
            self._cost_varlen = T.mean(( self._prediction - self._input_frames[1:])**2)  # for various lengths
        elif self.output_type == 'binary':
            self._cost = -T.mean( self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log( 1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log( 1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(T.log( self._prediction_for_training) * self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(T.log( self._prediction) * self._input_frames[1:])

        self._grads = T.grad(self._cost, self.params)

        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            ( self.inputs_var[:, :self.numvis],
              T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis))
            ),
            axis=1)
        
        # predict given the first letters. 
        self.predict = theano.function(
            [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)],
            self._prediction.transpose(1, 0, 2).reshape((self.inputs_var.shape[0], self.nsteps*self.numvis)),
            updates=self.updates, givens=givens)
        self.cost = theano.function( [self.inputs], self._cost, updates=self.updates)
        self.grads = theano.function( [self.inputs], self._grads, updates=self.updates)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()
        return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function([self._input_frames, self.hids_0], [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)), self.hids_1])
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:,0,:] = self.numpy_rng.multinomial(numcases, pvals=np.ones(self.numvis)/np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid*self.numlayers), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(preds[:,[t-1],:], hids)
            hids = nextpredandstate[1]
            preds[:,t,:] = nextpredandstate[0]
        return preds
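step_dropout above draws one binary mask per time step and rescales by 1/self.dropout, so self.dropout plays the role of a keep probability; a numpy sketch of that masking, with invented shapes:

import numpy as np

rng = np.random.RandomState(1)
p_keep = 0.5                                   # the snippet's self.dropout
h = rng.randn(4, 512)                          # (batch, numhid) hidden activations

mask = rng.binomial(n=1, p=p_keep, size=h.shape)
h_dropped = mask * h * (1.0 / p_keep)          # inverted scaling keeps the expectation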
Exemple #27
0
class SRNN(Model):
    def __init__(self, name,  # a string for identifying model.
                 numvis, numsz, numrz, numsl, numrl, numframes, output_type='real',
                 cheating_level=.0,  # cheating by looking at x_t (instead of x_tm1)
                 numpy_rng=None, theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numsz = numsz  # stacked zae layer
        self.numrz = numrz  # recurrent zae layer
        self.numsl = numsl  # stacked linear layer
        self.numrl = numrl  # recurrent linear layer
        self.numlayers = 3  # number of total stacked layers
        self.numrecur = 2   # number of recurrent connections
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(numpy.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = numpy.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name='inputs')
        # self.inputs.tag.test_value = numpy.random.rand(20, 27*50).astype(theano.config.floatX)

        # set up params
        # recurrent connections:
        self.whh = [theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsz, self.numsz)
                        ).astype(theano.config.floatX), name='whl0'),
                    theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsz, self.numsz)
                        ).astype(theano.config.floatX), name='whl1')]

        # vertical connections:
        self.whx = [theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsz, self.numvis)
                        ).astype(theano.config.floatX), name='whx0'),
                    theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsz, self.numvis)
                        ).astype(theano.config.floatX), name='whx1')]
        self.wxh = [theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numvis, self.numsz)
                        ).astype(theano.config.floatX), name='wxh0'),
                    theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsz, self.numsl)
                        ).astype(theano.config.floatX), name='wxh1'),
                    theano.shared(value=self.numpy_rng.uniform(
                            low=-0.01, high=0.01, size=(self.numsl, self.numsz)
                        ).astype(theano.config.floatX), name='wxh2')]
        self.bx = theano.shared(
            value=0.0 * numpy.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.params = self.whh + self.whx + self.wxh + [self.bx]

        self._batchsize = self.inputs.shape[0]
        
        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape((
            self._batchsize,
            self.inputs.shape[1] // self.numvis,
            self.numvis
        )).transpose(1, 0, 2)

        # one-step prediction, used by sampling function
        self.hids_t0 = [T.zeros((self._batchsize, self.numsz)),
                        T.zeros((self._batchsize, self.numsl)),
                        T.zeros((self._batchsize, self.numsz))]
        self.hids_t1 = [ReLU(
            T.dot(self.hids_t0[0], self.whh[0]
            ) + T.dot(self._input_frames[0], self.wxh[0])
        )]
        self.hids_t1.append(T.dot(self.hids_t1[-1], self.wxh[1]))
        self.hids_t1.append(ReLU(
            T.dot(self.hids_t0[2], self.whh[1]
            ) + T.dot(self.hids_t1[-1], self.wxh[2])
        ))

        self.x_pred_1 = self.bx + T.dot(self.hids_t1[0], self.whx[0]) + T.dot(self.hids_t1[2], self.whx[1])
        # end of one-step prediction
        # pdb.set_trace()

        def step(x_tm1, hids_tm1):
            hids_tm1 = [hids_tm1[:,                        : self.numsz              ], 
                        hids_tm1[:,  self.numsz            :(self.numsz  +self.numsl)], 
                        hids_tm1[:, (self.numsz+self.numsl):(self.numsz*2+self.numsl)]]
            hids_t = [ReLU(
                T.dot(hids_tm1[0], self.whh[0]
                ) + T.dot(x_tm1, self.wxh[0])
            )]
            hids_t.append(T.dot(hids_t[-1], self.wxh[1]))
            hids_t.append(ReLU(
                T.dot(hids_tm1[2], self.whh[1]
                ) + T.dot(hids_t[-1], self.wxh[2])
            ))
            x_pred_t = self.bx + T.dot(hids_t[0], self.whx[0]) + T.dot(hids_t[2], self.whx[1])
            return x_pred_t, T.concatenate(hids_t, 1)

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames[:-1],
            outputs_info=[None, T.concatenate(self.hids_t0, 1)])

        # set up output prediction
        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape((
                    self._predictions.shape[0] * self._predictions.shape[1],
                    self.numvis
                ))
            ).reshape((
                self._predictions.shape[0],
                self._predictions.shape[1],
                self.numvis
            ))
        else:
            raise ValueError('unsupported output_type')

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes-1]
        if self.output_type == 'real':
            self._cost = T.mean((
                self._prediction_for_training -
                self._input_frames[1:self.numframes]
            )**2)
            self._cost_varlen = T.mean((
                self._prediction -
                self._input_frames[1:]
            )**2)
        elif self.output_type == 'binary':
            self._cost = -T.mean(
                self._input_frames[1:self.numframes] *
                T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:self.numframes]) * T.log(
                    1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] *
                T.log(self._prediction) +
                (1.0 - self._input_frames[1:]) * T.log(
                    1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(T.log(
                self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(T.log(
                self._prediction) *
                self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano function for computing cost and grad
        self.cost = theano.function([self.inputs], self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs], self._grads,
                                     updates=self.updates)
        
        # another set of variables
        # give some time steps of characters and free the model to predict for all the rest.
        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis))
            ),
            axis=1)

        self.predict = theano.function(
            [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)],
            self._prediction.transpose(1, 0, 2).reshape((
                self.inputs_var.shape[0], self.nsteps*self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return numpy.array(x.__array__()).flatten()
            else:
                return x.flatten()
        return numpy.concatenate([get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames, T.concatenate(self.hids_t0)],
            [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)),
             T.concatenate(self.hids_t1)]
        )
        preds = numpy.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=numpy.ones(self.numvis)/numpy.float(self.numvis))
        hids = numpy.zeros((numcases, 2 * self.numsz + self.numsl), dtype="float32")  # packed [h0, h1, h2] state
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(preds[:,[t-1],:], hids)
            hids = nextpredandstate[1]
            preds[:,t,:] = nextpredandstate[0]
        return preds
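The step function above carries all three per-layer hidden states through scan as a single concatenated matrix and slices them apart on entry; the packing is easy to verify in numpy (sizes invented):

import numpy as np

numsz, numsl, batch = 6, 4, 2
h0, h1, h2 = (np.ones((batch, numsz)), 2 * np.ones((batch, numsl)),
              3 * np.ones((batch, numsz)))

packed = np.concatenate([h0, h1, h2], axis=1)              # (batch, 2*numsz + numsl)
u0 = packed[:, :numsz]
u1 = packed[:, numsz:numsz + numsl]
u2 = packed[:, numsz + numsl:2 * numsz + numsl]
assert (u0 == h0).all() and (u1 == h1).all() and (u2 == h2).all()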
Exemple #28
0
class MDN(object):
    """Mixture Density Network
    """
    def __init__(self, input, rng, n_in, n_hiddens, hid_activations, n_out,
                 out_activation, n_components):
        """Initialize the parameters for the multilayer perceptron

        :type rng: np.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hiddens: list of int
        :param n_hiddens: a list with the number of units in each hidden layer

        :type hid_activations: list of lambdas
        :param hid_activations: a list of activations used in each hidden layer
        
        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        from theano.tensor.shared_randomstreams import RandomStreams
        self.srng = RandomStreams(seed=1234)

        self.input = input

        # We are dealing with multiple hidden layers MLP
        layer0 = NetworkLayer(rng=rng,
                              input=input,
                              n_in=n_in,
                              n_out=n_hiddens[0],
                              activation=hid_activations[0])

        h_layers = [('hiddenLayer0', layer0)]

        for i in range(1, len(n_hiddens)):
            h_layers.append(('hiddenLayer%d' % i,
                             NetworkLayer(rng=rng,
                                          input=h_layers[i - 1][1].output,
                                          n_in=n_hiddens[i - 1],
                                          n_out=n_hiddens[i],
                                          activation=hid_activations[i])))

        self.__dict__.update(dict(h_layers))

        # The output layer gets as input the hidden units
        # of the hidden layer
        self.outputLayer = MDNoutputLayer(rng=rng,
                                          input=h_layers[-1][1].output,
                                          n_in=n_hiddens[-1],
                                          n_out=n_out,
                                          mu_activation=out_activation,
                                          n_components=n_components)

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.outputLayer.W_mu ** 2).sum() + \
                      (self.outputLayer.W_sigma ** 2).sum() +\
                      (self.outputLayer.W_mixing ** 2).sum()

        for i in range(len(n_hiddens)):
            self.L2_sqr += (self.__dict__['hiddenLayer%d' % i].W**2).sum()
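        # Hedged illustration (not in the original source): L2_sqr is only
        # assembled here; a typical use would be to add it, with a small
        # weight-decay coefficient, to the NLL when building the training
        # loss, e.g.  training_loss = NLL(...) + 1e-4 * self.L2_sqr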

        # the parameters of the model are the parameters of the all layers it
        # is made out of
        params = self.outputLayer.params
        for layer in h_layers:
            params.extend(layer[1].params)
        self.params = params

    def set_symbolic_input(self, input):
        """We use this function to bind a symbolic variable with the input
        of the network layer. Added to specify that in training time."""
        self.input = input


#    def train(self, x, y, training_loss, learning_rate,

    def train(self, y, training_loss, learning_rate, n_epochs, train_x,
              train_y, valid_x, valid_y, batch_size):
        """Train the MLP using SGD"""

        index = T.iscalar()  # index to a [mini]batch
        lr = T.scalar()  # learning rate symbolic

        #index.tag.test_value = 1
        gparams = []
        for param in self.params:
            gparam = T.grad(training_loss, param)
            gparams.append(gparam)

        updates = []

        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * \
                            T.cast(lr,dtype=theano.config.floatX)))

        train_model = theano.function(
            inputs=[index, lr],
            outputs=[training_loss],
            updates=updates,
            givens={
                self.input:
                train_x[index * batch_size:(index + 1) * batch_size],
                y: train_y[index * batch_size:(index + 1) * batch_size]
            })

        validate_model = theano.function(
            inputs=[index],
            outputs=NLL(mu=self.outputLayer.mu,
                        sigma=self.outputLayer.sigma,
                        mixing=self.outputLayer.mixing,
                        y=y),
            givens={
                self.input:
                valid_x[index * batch_size:(index + 1) * batch_size],
                y: valid_y[index * batch_size:(index + 1) * batch_size]
            })

        # compute number of minibatches for training and validation
        n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
        n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size

        validate_MSE = theano.function(
            inputs=[index],
            outputs=MSE(self.samples(), y=y),
            givens={
                self.input:
                valid_x[index * batch_size:(index + 1) * batch_size],
                y: valid_y[index * batch_size:(index + 1) * batch_size]
            })

        print 'training...'

        start_time = time.clock()
        epoch = 0

        total_training_costs = []
        total_validation_costs = []
        total_validation_MSE = []

        lr_time = 0
        lr_step = learning_rate / (
            (train_x.get_value().shape[0] * 1.0 / batch_size) *
            (n_epochs - 30))
        lr_val = learning_rate

        while (epoch < n_epochs):
            epoch = epoch + 1
            epoch_training_costs = []
            #import pdb; pdb.set_trace()
            for minibatch_index in xrange(n_train_batches):

                # linear annealing after 40 epochs...
                if epoch > 40:
                    # lr_val = learning_rate / (1.0+lr_time)
                    # lr_time = lr_time + 1
                    lr_val = lr_val - lr_step
                else:
                    lr_val = learning_rate


                loss_value = train_model(minibatch_index, lr_val)
                epoch_training_costs.append(loss_value)

                if np.isnan(loss_value):
                    print 'got NaN in NLL'
                    sys.exit(1)

            this_training_cost = np.mean(epoch_training_costs)
            this_validation_cost = np.mean(
                [validate_model(i) for i in xrange(n_valid_batches)])
            this_validation_MSE = np.mean(
                [validate_MSE(i) for i in xrange(n_valid_batches)])

            total_training_costs.append(this_training_cost)
            total_validation_costs.append(this_validation_cost)
            total_validation_MSE.append(this_validation_MSE)

            print 'epoch %i, training NLL %f, validation NLL %f, MSE %f' % \
                (epoch, this_training_cost, this_validation_cost,
                 this_validation_MSE)

        end_time = time.clock()

        print "Training took %.2f minutes..." % ((end_time - start_time) / 60.)

        # return the training and validation losses
        return total_training_costs, total_validation_costs, total_validation_MSE

    def samples(self):
        component = self.srng.multinomial(pvals=self.outputLayer.mixing)
        component_mean =  T.sum(self.outputLayer.mu * \
                                component.dimshuffle(0,'x',1),
                                axis=2)
        component_std = T.sum(self.outputLayer.sigma * \
                              component, axis=1, keepdims=True)

        samples = self.srng.normal(avg=component_mean, std=component_std)
        return samples

    def save_model(self, filename='MLP.save', output_folder='output_folder'):
        """
        This function pickles the paramaters in a file for later usage
        """
        storage_file = open(os.path.join(output_folder, filename), 'wb')
        cPickle.dump(self, storage_file, protocol=cPickle.HIGHEST_PROTOCOL)
        storage_file.close()

    @staticmethod
    def load_model(filename='MLP.save', output_folder='output_folder'):
        """
        This function loads pickled paramaters from a file
        """
        storage_file = open(os.path.join(output_folder, filename), 'rb')
        model = cPickle.load(storage_file)
        storage_file.close()
        return model
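The samples() method above picks a mixture component with a multinomial draw on the mixing coefficients and then samples from the chosen Gaussian. Below is a minimal numpy sketch of the same procedure; the (N, D, K) / (N, K) shapes assumed for mu, sigma and mixing are inferred from how samples() indexes them and are an assumption, not something the original code states.

import numpy as np

def sample_mdn(mu, sigma, mixing, rng=None):
    """One sample per row: mu (N, D, K), sigma (N, K), mixing (N, K)."""
    if rng is None:
        rng = np.random.RandomState(0)
    N, D, K = mu.shape
    out = np.empty((N, D))
    for n in range(N):
        k = rng.choice(K, p=mixing[n])                  # pick a component, like srng.multinomial
        out[n] = rng.normal(mu[n, :, k], sigma[n, k])   # draw from that Gaussian
    return out

draws = sample_mdn(np.zeros((4, 2, 3)), np.ones((4, 3)), np.full((4, 3), 1. / 3))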
Exemple #29
class SRNN(Model):
    def __init__(self, name,  # a string for identifying model.
                 numvis, numhid, numframes, output_type='real',
                 cheating_level=.0,  # cheating by looking at x_t (instead of x_tm1)
                 numpy_rng=None, theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numhid = numhid
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(np.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name='inputs')

        # set up params
        self.whh = theano.shared(
            value=np.eye(self.numhid).astype(theano.config.floatX),
            name='whh')
        self.whx = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01,
            size=(self.numhid, self.numvis)
        ).astype(theano.config.floatX), name='whx')
        self.wxh = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01,
            size=(self.numvis, self.numhid)
        ).astype(theano.config.floatX), name='wxh')
        self.bx = theano.shared(
            value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.params = [self.whh, self.whx, self.wxh, self.bx]

        self._batchsize = self.inputs.shape[0]
        
        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape((
            self._batchsize,
            self.inputs.shape[1] // self.numvis,
            self.numvis
        )).transpose(1, 0, 2)

        # one-step prediction, used by sampling function
        self.hids_0 = T.zeros((self._batchsize, self.numhid))
        self.hids_1 = T.dot(self.hids_0, self.whh) + T.dot(self._input_frames[0], self.wxh)
        self.hids_1 = self.hids_1 * (self.hids_1 > self.selectionthreshold)
        self.x_pred_1 = T.dot(self.hids_1, self.whx) + self.bx

        def step(x_gt_t,  # cheating by looking at the current time step input.
                 x_tm1, hids_tm1):
            pre_hids_t = T.dot(hids_tm1, self.whh) + T.dot(
                self.cheating_level * x_gt_t + (1.-self.cheating_level) * x_tm1,
                self.wxh)
            hids_t = pre_hids_t * (pre_hids_t > self.selectionthreshold)
            x_pred_t = T.dot(hids_t, self.whx) + self.bx
            return x_pred_t, hids_t

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames,
            outputs_info=[self._input_frames[0], self.hids_0])

        # set up output prediction
        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape((
                    self._predictions.shape[0] * self._predictions.shape[1],
                    self.numvis
                ))
            ).reshape((
                self._predictions.shape[0],
                self._predictions.shape[1],
                self.numvis
            ))
        else:
            raise ValueError('unsupported output_type')

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes-1]
        if self.output_type == 'real':
            self._cost = T.mean((
                self._prediction_for_training -
                self._input_frames[1:self.numframes]
            )**2)
            self._cost_varlen = T.mean((
                self._prediction -
                self._input_frames[1:]
            )**2)
        elif self.output_type == 'binary':
            self._cost = -T.mean(
                self._input_frames[1:self.numframes] *
                T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:self.numframes]) * T.log(
                    1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] *
                T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:]) * T.log(
                    1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(T.log(
                self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(T.log(
                self._prediction) *
                self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano function for computing cost and grad
        self.cost = theano.function([self.inputs], self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs], self._grads,
                                     updates=self.updates)
        
        # another set of variables
        # give some time steps of characters and free the model to predict for all the rest.
        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis))
            ),
            axis=1)

        self.predict = theano.function(
            [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)],
            self._prediction.transpose(1, 0, 2).reshape((
                self.inputs_var.shape[0], self.nsteps*self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()
        return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0],
            [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)),
             self.hids_1]
        )
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(numcases, pvals=np.ones(self.numvis)/np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(preds[:,[t-1],:], hids)
            hids = nextpredandstate[1]
            preds[:,t,:] = nextpredandstate[0]
        return preds
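The softmax branch above works around the fact that T.nnet.softmax only accepts a matrix: it folds the time and batch axes together, applies softmax row-wise, and unfolds back to (time, batch, numvis). A standalone numpy sketch of the same reshape trick (all names here are illustrative):

import numpy as np

def softmax_3d(x):
    """Row-wise softmax on a (T, B, V) tensor via the flatten/unflatten trick."""
    T_, B, V = x.shape
    flat = x.reshape(T_ * B, V)
    flat = flat - flat.max(axis=1, keepdims=True)       # numerical stability
    p = np.exp(flat) / np.exp(flat).sum(axis=1, keepdims=True)
    return p.reshape(T_, B, V)

probs = softmax_3d(np.random.randn(5, 2, 27))
assert np.allclose(probs.sum(axis=2), 1.0)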
Exemple #30
class SRNN(Model):
    def __init__(
            self,
            name,  # a string for identifying model.
            numvis,
            numsz,
            numrz,
            numsl,
            numrl,
            numframes,
            output_type='real',
            cheating_level=.0,  # cheating by looking at x_t (instead of x_tm1)
            numpy_rng=None,
            theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numsz = numsz  # stacked zae layer
        self.numrz = numrz  # recurrent zae layer
        self.numsl = numsl  # stacked linear layer
        self.numrl = numrl  # recurrent linear layer
        self.numlayers = 3  # number of total stacked layers
        self.numrecur = 2  # number of recurrent connections
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(numpy.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = numpy.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name='inputs')

        # set up params

        # recurrent connections:
        # train a pair of orthogonal matrices for each h->l->h connection.
        print "... generating random orthogonal matrices"
        # def rand_ortho_np(shape, irange):
        #     A = - irange + 2 * irange * np.random.rand(*shape)
        #     U, s, V = np.linalg.svd(A, full_matrices=True)
        #     return np.dot(U, np.dot( np.eye(U.shape[1], V.shape[0]), V ))
        # np.dot(aaa.T, aaa) = I
        assert self.numsz <= self.numrl
        eye = T.eye(self.numsz)
        var = theano.shared(
            self.numpy_rng.uniform(low=-numpy.sqrt(3. / self.numrl),
                                   high=numpy.sqrt(3. / self.numrl),
                                   size=(self.numsz, self.numrl)).astype(
                                       theano.config.floatX))
        c = T.sum((T.dot(var, var.T) - eye)**2)
        grad = T.grad(c, wrt=var)
        train = theano.function([], c, updates=[(var, var - 0.1 * grad)])

        i = numpy.inf
        while i > 1e-10:
            i = train()
        var0 = var.get_value()

        var.set_value(
            self.numpy_rng.uniform(low=-numpy.sqrt(3. / self.numrl),
                                   high=numpy.sqrt(3. / self.numrl),
                                   size=(self.numsz, self.numrl)).astype(
                                       theano.config.floatX))
        i = numpy.inf
        while i > 1e-10:
            i = train()
        var1 = var.get_value()

        self.whl = [
            theano.shared(value=var0, name='whl0'),
            theano.shared(value=var1, name='whl1')
        ]
        self.wlh = [
            theano.shared(value=var0.T, name='wlh0'),
            theano.shared(value=var1.T, name='wlh1')
        ]
        del var
        print "Done."

        # vertical connections:
        self.whx = [
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numsz, self.numvis)).astype(theano.config.floatX),
                          name='whx0'),
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numsz, self.numvis)).astype(theano.config.floatX),
                          name='whx1')
        ]
        self.wxh = [
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numvis, self.numsz)).astype(theano.config.floatX),
                          name='wxh0'),
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numsz, self.numsl)).astype(theano.config.floatX),
                          name='wxh1'),
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numsl, self.numsz)).astype(theano.config.floatX),
                          name='wxh2')
        ]
        self.bx = theano.shared(
            value=0.0 * numpy.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.params = self.whl + self.wlh + self.whx + self.wxh + [self.bx]

        self._batchsize = self.inputs.shape[0]

        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis,
             self.numvis)).transpose(1, 0, 2)

        # one-step prediction, used by sampling function
        self.hids_t0 = [
            T.zeros((self._batchsize, self.numsz)),
            T.zeros((self._batchsize, self.numsl)),
            T.zeros((self._batchsize, self.numsz))
        ]
        self.hids_t1 = [
            ReLU(
                T.dot(T.dot(self.hids_t0[0], self.whl[0]), self.wlh[0]) +
                T.dot(self._input_frames[0], self.wxh[0]))
        ]
        self.hids_t1.append(T.dot(self.hids_t1[-1], self.wxh[1]))
        self.hids_t1.append(
            ReLU(
                T.dot(T.dot(self.hids_t0[2], self.whl[1]), self.wlh[1]) +
                T.dot(self.hids_t1[-1], self.wxh[2])))

        self.x_pred_1 = self.bx + T.dot(self.hids_t1[0], self.whx[0]) + T.dot(
            self.hids_t1[2], self.whx[1])

        # end of one-step prediction

        def step(x_tm1, hids_tm1):
            hids_tm1 = [
                hids_tm1[:, :self.numsz],
                hids_tm1[:, self.numsz:(self.numsz + self.numsl)],
                hids_tm1[:, (self.numsz + self.numsl):(self.numsz * 2 +
                                                       self.numsl)]
            ]
            hids_t = [
                ReLU(
                    T.dot(T.dot(hids_tm1[0], self.whl[0]), self.wlh[0]) +
                    T.dot(x_tm1, self.wxh[0]))
            ]
            hids_t.append(T.dot(hids_t[-1], self.wxh[1]))
            hids_t.append(
                ReLU(
                    T.dot(T.dot(hids_tm1[2], self.whl[1]), self.wlh[1]) +
                    T.dot(hids_t[-1], self.wxh[2])))
            x_pred_t = self.bx + T.dot(hids_t[0], self.whx[0]) + T.dot(
                hids_t[2], self.whx[1])
            return x_pred_t, T.concatenate(hids_t, 1)

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames[:49],
            outputs_info=[None, T.concatenate(self.hids_t0, 1)])

        # set up output prediction
        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1],
                     self.numvis))).reshape(
                         (self._predictions.shape[0],
                          self._predictions.shape[1], self.numvis))
        else:
            raise ValueError('unsupported output_type')

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes - 1]
        if self.output_type == 'real':
            self._cost = T.mean((self._prediction_for_training -
                                 self._input_frames[1:self.numframes])**2)
            self._cost_varlen = T.mean(
                (self._prediction - self._input_frames[1:])**2)
        elif self.output_type == 'binary':
            self._cost = -T.mean(self._input_frames[1:self.numframes] *
                                 T.log(self._prediction_for_training) +
                                 (1.0 - self._input_frames[1:self.numframes]) *
                                 T.log(1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(
                T.log(self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(
                T.log(self._prediction) * self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano function for computing cost and grad
        self.cost = theano.function([self.inputs],
                                    self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs],
                                     self._grads,
                                     updates=self.updates)

        # another set of variables
        # give some time steps of characters and free the model to predict for all the rest.
        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))),
            axis=1)

        self.predict = theano.function(
            [
                self.inputs_var,
                theano.Param(self.nsteps, default=self.numframes - 4)
            ],
            self._prediction.transpose(1, 0, 2).reshape(
                (self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return numpy.array(x.__array__()).flatten()
            else:
                return x.flatten()

        return numpy.concatenate(
            [get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames,
             T.concatenate(self.hids_t0, 1)], [
                 self.theano_rng.multinomial(
                     pvals=T.nnet.softmax(self.x_pred_1 / temperature)),
                 T.concatenate(self.hids_t1, 1)
             ])
        preds = numpy.zeros((numcases, numframes, self.numvis),
                            dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=numpy.ones(self.numvis) / numpy.float(self.numvis))
        hids = numpy.zeros((numcases, self.numsz * 2 + self.numsl),
                           dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(
                preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
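The gradient loop in the constructor above trains each recurrent matrix so that var.dot(var.T) approaches the identity, i.e. its rows become orthonormal, and the commented-out helper hints at a closed-form alternative via an SVD. A hedged numpy sketch of that SVD variant (not part of the original code, names are illustrative):

import numpy as np

def rand_semi_orthogonal(shape, irange, rng=None):
    """Random rows-by-cols matrix (rows <= cols) with orthonormal rows."""
    if rng is None:
        rng = np.random.RandomState(0)
    A = rng.uniform(-irange, irange, size=shape)
    U, _, Vt = np.linalg.svd(A, full_matrices=False)
    return np.dot(U, Vt)

V = rand_semi_orthogonal((5, 8), 0.1)
assert np.allclose(np.dot(V, V.T), np.eye(5))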
Exemple #31
class CharacterRNN(ParameterModel):

    def __init__(self, name, n_input, n_output, n_hidden=10, n_layers=2,
                 seed=None):
        super(CharacterRNN, self).__init__(name)
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.n_input = n_input
        self.n_output = n_output

        self.lstm = MultilayerLSTM('%s-charrnn' % name, self.n_input,
                         n_hidden=self.n_hidden,
                         n_layers=self.n_layers,
                         )
        self.rng = RandomStreams(seed)

        self.output = Softmax('%s-softmax' % name, n_hidden, self.n_output)

    def save_parameters(self, location):
        state = {
            'n_hidden': self.n_hidden,
            'n_layers': self.n_layers,
            'lstm': self.lstm.state(),
            'output': self.output.state()
        }
        with open(location, 'wb') as fp:
            pickle.dump(state, fp)

    def load_parameters(self, location):
        with open(location, 'rb') as fp:
            state = pickle.load(fp)

        self.n_hidden = state['n_hidden']
        self.n_layers = state['n_layers']
        self.lstm.load(state['lstm'])
        self.output.load(state['output'])

    @theanify(T.tensor3('X'), T.tensor3('state'), T.tensor3('y'), returns_updates=True)
    def cost(self, X, state, y):
        (_, state, ypred), updates = self.forward(X, state)
        S, N, V = y.shape
        y = y.reshape((S * N, V))
        ypred = ypred.reshape((S * N, V))
        return (T.nnet.categorical_crossentropy(ypred, y).mean(), state), updates

    def forward(self, X, state):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.output.n_output

        def step(input, previous_hidden, previous_state, previous_output):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return lstm_hidden, state, final_output

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (encoder_output, encoder_state, softmax_output), updates = theano.scan(step,
                              sequences=[X],
                              outputs_info=[
                                            hidden,
                                            state,
                                            T.alloc(np.asarray(0).astype(theano.config.floatX),
                                                    N,
                                                    O),
                                           ],
                              n_steps=S)
        return (encoder_output, encoder_state, softmax_output), updates

    @theanify(T.vector('start_token'), T.iscalar('length'), T.scalar('temperature'), returns_updates=True)
    def generate(self, start_token, length, temperature):
        start_token = start_token[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(step,
                              outputs_info=[
                                            start_token,
                                            hidden,
                                            state,
                                           ],
                              non_sequences=[temperature],
                              n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'), T.fscalar('temperature'), returns_updates=True)
    def generate_with_concat(self, start_token, concat, length, temperature):
        start_token = start_token[:, np.newaxis].T
        concat = concat[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature, concat):
            lstm_hidden, state = self.lstm.forward(T.concatenate([input, concat], axis=1),
                                                   previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(step,
                              outputs_info=[
                                            start_token,
                                            hidden,
                                            state,
                                           ],
                              non_sequences=[temperature, concat],
                              n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.tensor3('X'), returns_updates=True)
    def log_probability(self, X):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.n_output

        def step(input, log_prob, previous_hidden, previous_state):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return final_output, lstm_hidden, state

        hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        start_log = T.alloc(np.array(0).astype(theano.config.floatX), N, O)
        state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H))

        (log_prob, _, _), updates = theano.scan(step,
                              sequences=[X],
                              outputs_info=[
                                            start_log,
                                            hidden,
                                            state,
                                           ],
                              n_steps=S)
        return log_prob, updates

    def get_parameters(self):
        return self.lstm.get_parameters() + self.output.get_parameters()
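In generate() above, each step scales the softmax input by a temperature and then takes a single multinomial draw; this assumes the Softmax layer's forward applies a temperature-scaled softmax, which is not shown in this snippet. A minimal numpy sketch of that per-step sampling, with illustrative names:

import numpy as np

def sample_with_temperature(logits, temperature, rng=None):
    """Temperature-scaled softmax followed by one multinomial draw (one-hot)."""
    if rng is None:
        rng = np.random.RandomState(0)
    z = logits / temperature
    z = z - z.max()                       # numerical stability
    p = np.exp(z) / np.exp(z).sum()
    return rng.multinomial(1, p)

onehot = sample_with_temperature(np.array([2.0, 1.0, 0.1]), temperature=0.5)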
Exemple #32
class SRNN(Model):
    def __init__(self,
                 name,
                 numvis,
                 numhid,
                 numlayers,
                 numframes,
                 output_type='real',
                 dropout=0.0,
                 numpy_rng=None,
                 theano_rng=None):
        super(SRNN, self).__init__(name=name)

        self.numvis = numvis  # frame length * alphabet size (1 * 27)
        self.numhid = numhid  # 512
        self.numlayers = numlayers  # 3
        self.numframes = numframes  # maxnumframes (100)
        self.output_type = output_type  # softmax
        self.dropout = dropout  # 0.5

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        self.inputs = T.matrix(name='inputs')

        self.whh = [
            theano.shared(value=np.eye(self.numhid).astype(
                theano.config.floatX),
                          name='whh' + str(k)) for k in range(self.numlayers)
        ]
        self.whx = [
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numhid, self.numvis)).astype(theano.config.floatX),
                          name='whx' + str(k)) for k in range(self.numlayers)
        ]
        self.wxh = [
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numvis, self.numhid)).astype(theano.config.floatX),
                          name='wxh' + str(0))
        ]
        self.wxh = self.wxh + [
            theano.shared(value=self.numpy_rng.uniform(
                low=-0.01, high=0.01, size=(self.numhid, self.numhid)).astype(
                    theano.config.floatX),
                          name='wxh' + str(k))
            for k in range(self.numlayers - 1)
        ]
        self.bx = theano.shared(
            value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.bhid = [
            theano.shared(value=0.0 *
                          np.ones(self.numhid, dtype=theano.config.floatX),
                          name='bhid' + str(k)) for k in range(self.numlayers)
        ]
        self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx]

        self._batchsize = self.inputs.shape[0]
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis,
             self.numvis)).transpose(1, 0, 2)

        #1-step prediction ---
        self.hids_0 = T.zeros((self._batchsize, self.numhid * self.numlayers))
        self.hids_1 = [
            T.dot(self.hids_0[:, :self.numhid], self.whh[0]) + self.bhid[0] +
            T.dot(self._input_frames[0], self.wxh[0])
        ]
        self.hids_1[0] *= (self.hids_1[0] > 0)
        for k in range(1, self.numlayers):
            self.hids_1.append(
                T.dot(self.hids_0[:, k * self.numhid:(k + 1) *
                                  self.numhid], self.whh[k]) + self.bhid[k] +
                T.dot(self.hids_1[k - 1], self.wxh[k]))
            self.hids_1[-1] *= (self.hids_1[-1] > 0)

        self.x_pred_1 = self.bx
        for k in range(self.numlayers):
            self.x_pred_1 += T.dot(self.hids_1[k], self.whx[k])
        self.hids_1 = T.concatenate(self.hids_1, 1)

        #--- 1-step prediction

        def step_dropout(x_gt_t, dropoutmask, x_tm1, hids_tm1):
            hids_tm1 = [
                hids_tm1[:, k * self.numhid:(k + 1) * self.numhid]
                for k in range(self.numlayers)
            ]
            pre_hids_t = [
                T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] +
                T.dot(x_gt_t, self.wxh[0])
            ]
            hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)]
            for k in range(1, self.numlayers):
                pre_hids_t.append(
                    T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] +
                    T.dot(dropoutmask * hids_t[k - 1], (1.0 / self.dropout) *
                          self.wxh[k]))
                hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0))
            x_pred_t = self.bx
            for k in range(self.numlayers):
                x_pred_t += T.dot(hids_t[k], self.whx[k])
            return x_pred_t, T.concatenate(hids_t, 1)

        def step_nodropout(x_gt_t, x_tm1, hids_tm1):
            hids_tm1 = [
                hids_tm1[:, k * self.numhid:(k + 1) * self.numhid]
                for k in range(self.numlayers)
            ]
            pre_hids_t = [
                T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] +
                T.dot(x_gt_t, self.wxh[0])
            ]
            hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)]
            for k in range(1, self.numlayers):
                pre_hids_t.append(
                    T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] +
                    T.dot(hids_t[k - 1], self.wxh[k]))
                hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0))
            x_pred_t = self.bx
            for k in range(self.numlayers):
                x_pred_t += T.dot(hids_t[k], self.whx[k])
            return x_pred_t, T.concatenate(hids_t, 1)

        if self.dropout == 0.0:
            (self._predictions, self.hids), self.updates = theano.scan(
                fn=step_nodropout,
                sequences=self._input_frames,
                outputs_info=[self._input_frames[0], self.hids_0])
        else:
            self._dropoutmask = theano_rng.binomial(
                size=(self.inputs.shape[1] // self.numvis, self._batchsize,
                      self.numhid),
                n=1,
                p=self.dropout,
                dtype=theano.config.floatX)
            (self._predictions, self.hids), self.updates = theano.scan(
                fn=step_dropout,
                sequences=[self._input_frames, self._dropoutmask],
                outputs_info=[self._input_frames[0], self.hids_0])

        if self.output_type == 'real':
            # dims: [time step, batch idx, numvis]
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors, reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1],
                     self.numvis))).reshape(
                         (self._predictions.shape[0],
                          self._predictions.shape[1], self.numvis))
        else:
            raise ValueError('unsupported output_type')

        self._prediction_for_training = self._prediction[:self.numframes - 1]

        if self.output_type == 'real':
            self._cost = T.mean((self._prediction_for_training -
                                 self._input_frames[1:self.numframes])**2)
            self._cost_varlen = T.mean(
                (self._prediction -
                 self._input_frames[1:])**2)  # for various lengths
        elif self.output_type == 'binary':
            self._cost = -T.mean(self._input_frames[1:self.numframes] *
                                 T.log(self._prediction_for_training) +
                                 (1.0 - self._input_frames[1:self.numframes]) *
                                 T.log(1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(
                T.log(self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(
                T.log(self._prediction) * self._input_frames[1:])

        self._grads = T.grad(self._cost, self.params)

        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))),
            axis=1)

        # predict given the first letters.
        self.predict = theano.function(
            [
                self.inputs_var,
                theano.Param(self.nsteps, default=self.numframes - 4)
            ],
            self._prediction.transpose(1, 0, 2).reshape(
                (self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens)
        self.cost = theano.function([self.inputs],
                                    self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs],
                                     self._grads,
                                     updates=self.updates)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()

        return np.concatenate(
            [get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0], [
                self.theano_rng.multinomial(
                    pvals=T.nnet.softmax(self.x_pred_1 / temperature)),
                self.hids_1
            ])
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=np.ones(self.numvis) / np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid * self.numlayers),
                        dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(
                preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
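step_dropout above multiplies the hidden units of each layer by a precomputed binomial mask and rescales the next layer's weights by 1.0/self.dropout, so in this code self.dropout acts as the keep probability ("inverted dropout"). A standalone numpy sketch of that masking-and-rescaling step (illustrative names, not part of the original code):

import numpy as np

def dropout_hidden(h, keep_prob, rng=None):
    """Bernoulli(keep_prob) mask on the hidden units, rescaled by 1/keep_prob
    so its expectation matches the unmasked activation."""
    if rng is None:
        rng = np.random.RandomState(0)
    mask = rng.binomial(n=1, p=keep_prob, size=h.shape).astype(h.dtype)
    return mask * h / keep_prob

h_dropped = dropout_hidden(np.ones((4, 8), dtype="float32"), keep_prob=0.5)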