Example #1
    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions for performing one step of
        gradient descent at a given layer. Each function takes a
        minibatch index as input; to train an RBM, iterate over all
        minibatch indexes, calling the corresponding function.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type k: int
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
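
A minimal driver sketch for the functions returned above; `dbn`, `train_set_x`, `n_train_batches`, `pretraining_epochs`, and the numpy import are assumptions, not part of the example. It follows the docstring: iterate over all minibatch indexes for each layer.

# hypothetical usage of the returned pretraining functions
pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                            batch_size=20, k=1)
for i in range(len(dbn.rbm_layers)):
    for epoch in range(pretraining_epochs):
        costs = [pretraining_fns[i](index=batch_index, lr=0.1)
                 for batch_index in range(n_train_batches)]
        print('Pre-training layer %i, epoch %d, mean cost %f'
              % (i, epoch, numpy.mean(costs)))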
Example #2
def create_decoder_func(layers):
    Z = T.fmatrix('Z')
    Z_batch = T.fmatrix('Z_batch')

    X = get_output(layers['l_decoder_out'],
                   inputs={layers['l_encoder_out']: Z},
                   deterministic=True)

    decoder_func = theano.function(
        inputs=[theano.In(Z_batch)],
        outputs=X,
        givens={
            Z: Z_batch,
        },
    )

    return decoder_func
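
A hedged usage sketch for the compiled decoder; `numpy` and `latent_dim` (the width of `l_encoder_out`) are assumptions.

# hypothetical call; the latent batch must be float32 to match the fmatrix input
decode = create_decoder_func(layers)
z_batch = numpy.zeros((32, latent_dim), dtype='float32')
x_reconstructed = decode(z_batch)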
Example #3
 def pretraining_functions(self, train_set_x, batch_size, k):
     index = T.lscalar('index')
     learning_rate = T.scalar('lr')
     batch_begin = index * batch_size
     batch_end = batch_begin + batch_size
     pretrain_fns = []
     for rbm in self.rbm_layers:
         cost, updates = rbm.get_cost_updates(learning_rate,
                                              persistent=None,
                                              k=k)
         fn = theano.function(
             inputs=[index, theano.In(learning_rate, value=0.1)],
             outputs=cost,
             updates=updates,
             givens={self.x: train_set_x[batch_begin:batch_end]})
         pretrain_fns.append(fn)
     return pretrain_fns
Example #4
def test_vm_gc():
    # This already caused a bug in the trunk of Theano.
    #
    # The bug was introduced in the trunk on July 5th, 2012 and fixed on
    # July 30th.

    x = theano.tensor.vector()
    p = RunOnce()(x)
    mode = theano.Mode(linker=theano.gof.vm.VM_Linker(lazy=True))
    f = theano.function([theano.In(x, mutable=True)], [p + 1, p + 2],
                        mode=mode)
    f([1, 2, 3])

    p = RunOnce()(x)
    pp = p + p
    f = theano.function([x], [pp + pp], mode=mode)
    f([1, 2, 3])
Example #5
def test_remove0():
    print()
    print('test_remove0()')
    configs = [
        # structure type, numpy matching class
        ('csc', scipy.sparse.csc_matrix),
        ('csr', scipy.sparse.csr_matrix),
    ]
    for format, matrix_class in configs:
        print('config: format=\'%(format)s\', matrix_class=%(matrix_class)s'
              % locals())
        # real
        origin = (numpy.arange(9) + 1).reshape(
            (3, 3)).astype(theano.config.floatX)
        mat = matrix_class(origin).astype(theano.config.floatX)

        mat[0, 1] = mat[1, 0] = mat[2, 2] = 0

        assert mat.size == 9

        # symbolic
        x = theano.sparse.SparseType(format=format,
                                     dtype=theano.config.floatX)()
        # theano.In is needed here because Theano, as a rule, does not
        # optimize function inputs in place
        f = theano.function([theano.In(x, borrow=True, mutable=True)],
                            sp.Remove0()(x))

        # assert optimization is applied in modes with optimization
        if theano.config.mode not in ['FAST_COMPILE']:
            # list of apply nodes in the optimized graph.
            nodes = f.maker.env.toposort()
            v = [
                True for node in nodes
                if isinstance(node.op, sp.Remove0) and node.op.inplace
            ]
            assert len(v), 'Inplacing optimization should have been applied.'

        # checking
        # keep the original matrix under a clearer name
        target = mat
        result = f(mat)
        mat.eliminate_zeros()
        assert result.size == target.size, 'Matrix sizes differ. Have zeros been removed?'
Example #6
    def test_partial_input_aliasing_affecting_inplace_operations(self):

        # Note: to trigger this bug with theano rev 4586:2bc6fc7f218b,
        #       you need to make the inputs mutable (so that inplace
        #       operations are used) and to break the elemwise composition
        #       with some non-elemwise op (here, dot)
        x = theano.tensor.dvector()
        y = theano.tensor.dvector()
        z = theano.tensor.dvector()
        m1 = theano.tensor.dmatrix()
        m2 = theano.tensor.dmatrix()
        m3 = theano.tensor.dmatrix()

        # Test 2. Variables only partially overlap:
        #   more exactly, we care about the case where we have a, b, c
        #   and a shares memory with b, b shares memory with c, but
        #   c does not share memory with a

        f = theano.function(
            [
                theano.In(x, mutable=True),
                theano.In(y, mutable=True),
                theano.In(z, mutable=True),
                theano.In(m1, mutable=True),
                theano.In(m2, mutable=True),
                theano.In(m3, mutable=True),
            ],
            (
                theano.tensor.dot((x * 2), m1)
                + theano.tensor.dot((y * 3), m2)
                + theano.tensor.dot((z * 4), m3)
            ),
        )

        # Compute bogus values
        v = np.asarray([1, 2, 3, 4, 5], dtype="float64")
        m = np.asarray([[1, 0], [0, 1]], dtype="float64")
        bogus_vals = f(v[:2], v[1:3], v[2:4], m, m, m)
        # Since we used inplace operation v and m may be corrupted
        # so we need to recreate them

        v = np.asarray([1, 2, 3, 4, 5], dtype="float64")
        m = np.asarray([[1, 0], [0, 1]], dtype="float64")
        m_copy1 = m.copy()
        v_copy1 = v.copy()
        m_copy2 = m.copy()
        v_copy2 = v.copy()
        vals = f(v[:2], v_copy1[1:3], v_copy2[2:4], m, m_copy1, m_copy2)

        assert np.allclose(vals, bogus_vals)
Example #7
    def pretraining_functions(self, train_set_x, batch_size):
        batch_size = 1
        n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

        index = T.lscalar('index')
        learning_rate = T.scalar('lr')

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        z_outs = []

        for ae in self.AE_layers:
            cost, updates, z = ae.get_cost_updates(learning_rate)
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            pretrain_fns.append(fn)

            z_out = ae.get_reconstructed_input(self.sigmoid_layers[-1].output)
            fn2 = theano.function(
                inputs=[self.sigmoid_layers[-1].output],
                outputs=z_out,
                on_unused_input='ignore',
                givens={self.x: train_set_x[batch_begin:batch_end]})
            z_outs.append(fn2)

        return pretrain_fns, z_outs
Example #8
 def __make_train_function(self):
     if not hasattr(self, 'train_function'):
         raise Exception('Model should be compiled before training')
     if self.train_function is None:
         print('Compile training function', file=sys.stderr)
         input_vars = list()
         input_vars.extend(self.inputs)
         input_vars.append(theano.In(self.is_training, value=1))
         if isinstance(self.outputs, dict):
             output_vars = dict()
             output_vars['loss'] = self.cost
             output_vars.update(self.outputs)
         else:
             output_vars = list()
             output_vars.append(self.cost)
             output_vars.extend(self.outputs)
         self.train_function = theano.function(input_vars,
                                               output_vars,
                                               updates=self.updates,
                                               on_unused_input='ignore')
     return self.train_function
Example #9
    def __init__(self, tt_input, tt_output, updates=None, name='Unnamed Function',
                 borrow_inp=False, borrow_out=False, profile_execution=False):
        self.name = name
        self.func = None
        self.profile = profile_execution
        self.last_exec_time = None
        self.updates = updates
        if borrow_inp:
            tt_input = [theano.In(x, borrow=True) for x in tt_input]

        self.tt_input = tt_input

        self.single_return = False
        if not isinstance(tt_output, (list, tuple)):
            tt_output = [tt_output,]
            self.single_return = True

        if borrow_out:
            tt_output = [theano.Out(x, borrow=True) for x in tt_output]

        self.tt_output = tt_output
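
The snippet ends before the wrapped function is compiled; below is a minimal sketch of how it might continue. The `_compile` and `__call__` methods and their defaults are assumptions, not part of the original class.

    # hypothetical continuation of the wrapper class
    def _compile(self):
        self.func = theano.function(self.tt_input, self.tt_output,
                                    updates=self.updates, name=self.name,
                                    on_unused_input='warn')

    def __call__(self, *args):
        if self.func is None:
            self._compile()
        outputs = self.func(*args)
        return outputs[0] if self.single_return else outputs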
Example #10
File: aae.py Project: davidath/aae
def reconstruction_loss(layer_dict):
    # Symbolic var for learning rate
    lr = T.scalar('lr')
    # Symbolic input variable
    input_var = T.fmatrix('input_var')
    # Symbolic mini batch variable
    batch = T.fmatrix('batch')
    # Get reconstructed input from AE
    reconstruction = ll.get_output(
        layer_dict['AAE_Output'], input_var, deterministic=False)
    # MSE between real input and reconstructed input
    recon_loss = T.mean(T.mean(T.sqr(input_var - reconstruction), axis=1))
    # Update trainable parameters of AE
    recon_params = ll.get_all_params(layer_dict['AAE_Output'], trainable=True)
    recon_updates = lasagne.updates.nesterov_momentum(
        recon_loss, recon_params, learning_rate=lr, momentum=0.9)
    # Reconstruction loss a.k.a Lrecon
    recon_func = theano.function(inputs=[theano.In(batch), lr],
                                 outputs=recon_loss, updates=recon_updates,
                                 givens={input_var: batch}
                                 )
    return recon_func
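
A hedged usage sketch; `X_batch` (a float32 NumPy array of inputs) and the learning-rate value are assumptions.

# hypothetical training step; arguments are positional: (batch, lr)
recon_fn = reconstruction_loss(layer_dict)
loss = recon_fn(X_batch, 0.001)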
Example #11
    def pretraining_functions(self, train_set_x, train_set_y, batch_size, k):
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        pt_learning_rate = theano.shared(value=np.asarray(
            0.1, dtype=theano.config.floatX),
                                         borrow=True)

        update_ptlr = theano.function(inputs=[],
                                      outputs=pt_learning_rate,
                                      updates={
                                          pt_learning_rate:
                                          T.clip(pt_learning_rate * 0.999,
                                                 0.1 / batch_size * 0.01, 1)
                                      })

        cost, updates = self.get_cost_updates(learning_rate,
                                              persistent=None,
                                              k=k)

        fn = theano.function(
            inputs=[
                index,
                theano.In(learning_rate, value=pt_learning_rate.get_value())
            ],
            outputs=cost,
            updates=updates,
            givens={
                self.input:
                train_set_y[(index * batch_size):(index * batch_size +
                                                  batch_size)],
                self.input_context:
                train_set_x[(index * batch_size):(index * batch_size +
                                                  batch_size)]
            })

        return fn, update_ptlr
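
Note that `theano.In(learning_rate, value=pt_learning_rate.get_value())` freezes the default at compile time, so the decayed rate only takes effect if the caller passes it explicitly. A hedged driver sketch follows; `model`, `pretraining_epochs`, and `n_train_batches` are assumptions.

# hypothetical pretraining loop
fn, update_ptlr = model.pretraining_functions(train_set_x, train_set_y,
                                              batch_size=20, k=1)
for epoch in range(pretraining_epochs):
    for batch_index in range(n_train_batches):
        current_lr = update_ptlr()   # returns the rate, then applies the decay update
        cost = fn(batch_index, current_lr)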
Example #12
    def pretrain_setup(self, train_set_x, batch_size, k):

        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
        # beginning of a batch, given `index`
        batch_begin = index * int(batch_size / 4)
        # ending of a batch, given `index`
        batch_end = batch_begin + int(batch_size / 4)


        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            persistent_chain = theano.shared(numpy.zeros((batch_size,
                                                          rbm.n_hidden),
                                                         dtype=theano.config.floatX))
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)
            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
Example #13
 def _buildOptimizationFunction(self, X, n_steps, plr):
     mu_0,logcov_0 = self._inference(X)
     optdict = {}
     _, logcov_f, elbo_final = self._optimizeVariationalParams(X, mu_0, logcov_0, n_steps, plr,
                                                                           savedict = optdict)
     diff_elbo, _ = self._estimateELBOEntropy(optdict['elbo_its'][0],optdict['elbo_its'][-1], logcov_0, logcov_f)
     self.optimize_mu_logcov = theano.function([X, theano.In(n_steps, value=self.params['n_steps'], name='n_steps'),
                                              theano.In(plr, value=self.params['param_lr'], name='plr')],
                                                [optdict['elbo_its'], optdict['gradnorm_mu_its'],
                                                 optdict['gradnorm_logcov_its'],optdict['elbo_its'].shape[0], diff_elbo], 
                                              name = 'Optimize ELBO wrt mu/cov')
     diff_elbo, _ = self._estimateELBOEntropy(optdict['elbo_its'][0], optdict['elbo_its'][-1], logcov_0, logcov_f)
     self.final_elbo     = theano.function([X, theano.In(n_steps, value=self.params['n_steps'], name='n_steps'),
                                              theano.In(plr, value=self.params['param_lr'], name='plr')],
                                            [optdict['elbo_its'][0],optdict['elbo_its'][-1], optdict['elbo_its'].shape[0],
                                            optdict['gradnorm_mu_its'][-1],optdict['gradnorm_logcov_its'][-1], 
                                            diff_elbo], name = 'Optimize ELBO wrt mu/cov')
     self.init_final_params = theano.function([X, theano.In(n_steps, value=self.params['n_steps'], name='n_steps'),
                                              theano.In(plr, value=self.params['param_lr'], name='plr')],
                                            [optdict['mu_its'][0],optdict['logcov_its'][0], optdict['mu_its'][-1],
                                                optdict['logcov_its'][-1]], name = 'init/final params')
Example #14
    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_prev1, s_prev2, s_prev3):
            # Embedding layer
            x_e = E[:, x_t]

            def GRU(i, U, W, b, x_0, s_previous):
                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) +
                                        W[i * 3 + 0].dot(s_previous) +
                                        b[i * 3])
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) +
                                        W[i * 3 + 1].dot(s_previous) +
                                        b[i * 3 + 1])
                s_candidate = T.tanh(U[i * 3 + 2].dot(x_0) +
                                     W[i * 3 + 2].dot(s_previous * r) +
                                     b[i * 3 + 2])

                return (T.ones_like(z) - z) * s_candidate + z * s_previous

            # GRU Layer 1
            s1 = GRU(0, U, W, b, x_e, s_prev1)

            # GRU Layer 2
            s2 = GRU(1, U, W, b, s1, s_prev2)

            # GRU Layer 3
            s3 = GRU(2, U, W, b, s2, s_prev3)

            # Final output calculation
            o_t = T.nnet.softmax(V.dot(s3) + c)[0]

            return [o_t, s1, s2, s3]

        [o, s1, s2,
         s3], updates = theano.scan(forward_prop_step,
                                    sequences=x,
                                    truncate_gradient=self.bptt_truncate,
                                    outputs_info=[
                                        None,
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim))
                                    ])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        # Total cost
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], [o], allow_input_downcast=True)
        self.predict_class = theano.function([x],
                                             prediction,
                                             allow_input_downcast=True)
        self.ce_error = theano.function([x, y],
                                        cost,
                                        allow_input_downcast=True)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc],
                                    allow_input_downcast=True)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        self.sgd_step = theano.function(
            [x, y, learning_rate,
             theano.In(decay, value=0.9)], [],
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE), (self.mU, mU), (self.mW, mW),
                     (self.mV, mV), (self.mb, mb), (self.mc, mc)],
            allow_input_downcast=True)
Example #15
cache_mF = decay * mF + (1 - decay) * dF**2
cache_md = decay * md + (1 - decay) * dd**2

# RNN rmsprop cache updates.
cache_mU_1 = decay * mU_1 + (1 - decay) * dU_1**2
cache_mU_2 = decay * mU_2 + (1 - decay) * dU_2**2
cache_mW = decay * mW + (1 - decay) * dW**2
cache_mV = decay * mV + (1 - decay) * dV**2
cache_mb = decay * mb + (1 - decay) * db**2
cache_mc = decay * mc + (1 - decay) * dc**2
cache_mh0_l1 = decay * mh0_l1 + (1 - decay) * dh0_l1**2
cache_mh0_l2 = decay * mh0_l2 + (1 - decay) * dh0_l2**2

sgd_step = theano.function(
    [x, sentences, y, learning_rate,
     theano.In(decay, value=0.9)], [],
    updates=[(U_1, U_1 - learning_rate * dU_1 / T.sqrt(mU_1 + 1e-6)),
             (U_2, U_2 - learning_rate * dU_2 / T.sqrt(mU_2 + 1e-6)),
             (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
             (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
             (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
             (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
             (F, F - learning_rate * dF / T.sqrt(mF + 1e-6)),
             (d, d - learning_rate * dd / T.sqrt(md + 1e-6)),
             (h0_l1, h0_l1 - learning_rate * dh0_l1 / T.sqrt(mh0_l1 + 1e-6)),
             (h0_l2, h0_l2 - learning_rate * dh0_l2 / T.sqrt(mh0_l2 + 1e-6)),
             (mU_1, cache_mU_1), (mU_2, cache_mU_2), (mW, cache_mW),
             (mV, cache_mV), (mb, cache_mb), (mc, cache_mc), (mF, cache_mF),
             (md, cache_md), (mh0_l1, cache_mh0_l1), (mh0_l2, cache_mh0_l2)])

sgd_step(X[0], test_text, Y[0], LEARNING_RATE)
Example #16
    def build_train_func(self,
                         solver_mode="sgd",
                         cost_factors=[],
                         use_acc_mode=False,
                         skip_build=False):

        #arguments to function
        logging.info(
            "Building training functions - solver: %s, use_acc_mode: %s" %
            (solver_mode, use_acc_mode))
        iteration = tensor.fscalar()
        learn_rate = tensor.fscalar()
        momentum = tensor.fvector()
        decay = tensor.fscalar()

        #find costs
        self.yt = []
        self.cost_list = []
        self.cost_layers = []
        self.cost_layer_names = []
        for layer in self.layers:
            yt_index = tensor.lvector("target index %i" %
                                      len(self.cost_layers))
            yt_value = tensor.fvector("target value %i" %
                                      len(self.cost_layers))
            cost = layer.cost(yt_index, yt_value)
            if not cost is None:
                self.yt += [yt_index, yt_value]
                self.cost_list.append(cost)
                self.cost_layers.append(layer)
                self.cost_layer_names.append(layer.type_name)

        self.cost_factors = [1.0] * len(self.cost_list) if len(
            cost_factors) == 0 else cost_factors
        assert len(self.cost_factors) == len(
            self.cost_list
        ), "Different number of cost factors (%i) and cost layers (%i)" % (len(
            self.cost_factors), len(self.cost_layers))
        logging.info("Found %i costs in model:" % len(self.cost_layers),
                     list(zip(self.cost_layer_names, self.cost_factors)))

        self.train_cost = tensor.as_tensor_variable(0)
        for i, cost in enumerate(self.cost_list):
            self.train_cost += self.cost_factors[i] * cost

        if self.gradient_clip > 0.0:
            logging.info("Clipping gradient to [%f,%f]" %
                         (-self.gradient_clip, self.gradient_clip))
            self.train_cost = theano.gradient.grad_clip(
                self.train_cost, -self.gradient_clip, self.gradient_clip)

        #find split points
        split_points = [0]
        self.use_split_mode = False
        for index, layer in enumerate(self.layers):
            if layer.has_split:
                self.use_split_mode = True
                split_points.append(index)
        split_points.append(len(self.layers))

        if self.use_split_mode:
            logging.verbose("Using split mode with split points:",
                            split_points)
            self.func["train_fwd"] = []
            self.func["train_bwd"] = []

        self.updates = []
        for sp in range(len(split_points) - 1):

            logging.info("Building training functions for layers %i-%i" %
                         (split_points[sp], split_points[sp + 1]))

            split_start = self.layers[split_points[sp]] if sp > 0 else None
            split_end = self.layers[split_points[sp + 1]] if (
                sp + 2) < len(split_points) else None
            split_cost = self.train_cost if split_end is None else None
            split_layers = []
            for i, layer in enumerate(self.layers):
                if (i > split_points[sp]) and (i < split_points[sp + 1]):
                    split_layers.append(layer)

            #determine known_grads provided by previous backward passes
            from collections import OrderedDict
            split_known_grads = OrderedDict()
            for i in range(sp + 1, len(split_points) - 1):
                split_known_grads.update(
                    self.layers[split_points[i]].split_known_grads())

            if len(split_known_grads) == 0:
                split_known_grads = None

            # print(split_known_grads)
            # print(split_known_grads)
            # print(sp+1, len(split_points)-1)

            #
            def get_sgd_updates(p, g):
                m = theano.shared(numpy.zeros(p.shape.eval(),
                                              dtype=theano.config.floatX),
                                  broadcastable=p.broadcastable,
                                  borrow=True)
                rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0)
                m_update = rho * m + (1.0 - rho) * g
                p_update = p - learn_rate * m_update
                return [(p, p_update), (m, m_update)]

            def get_torch_updates(p, g):
                m = theano.shared(numpy.zeros(p.shape.eval(),
                                              dtype=theano.config.floatX),
                                  broadcastable=p.broadcastable,
                                  borrow=True)
                rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0)
                m_update = rho * m + g
                p_update = p - learn_rate * (g + momentum[0] * m_update)
                return [(p, p_update), (m, m_update)]

            def get_adam_updates(p, g):
                eps = 1e-8
                m = theano.shared(numpy.zeros(p.shape.eval(),
                                              dtype=theano.config.floatX),
                                  broadcastable=p.broadcastable,
                                  borrow=True)
                v = theano.shared(numpy.zeros(p.shape.eval(),
                                              dtype=theano.config.floatX),
                                  broadcastable=p.broadcastable,
                                  borrow=True)
                m_update = momentum[0] * m + (1.0 - momentum[0]) * g
                v_update = momentum[1] * v + (1.0 - momentum[1]) * (g * g)
                m_hat = m_update / (1.0 -
                                    tensor.pow(momentum[0], iteration + 1))
                v_hat = v_update / (1.0 -
                                    tensor.pow(momentum[1], iteration + 1))
                p_update = p - learn_rate * m_hat / (tensor.sqrt(v_hat) + eps)
                return [(p, p_update), (m, m_update), (v, v_update)]

            #append parameter updates
            params = []
            params_decay = []
            for layer in split_layers:
                params += layer.weights()
                params_decay += [True] * len(layer.weights())
                params += layer.biases()
                params_decay += [False] * len(layer.biases())

            #build updates
            print("known grads:", split_known_grads)
            grads = tensor.grad(split_cost,
                                params,
                                known_grads=split_known_grads)
            solver_updates = []
            for p, g, p_decay in zip(params, grads, params_decay):

                #add L2 weight decay if needed
                if p_decay or self.bias_decay:
                    g += decay * p

                if solver_mode == "adam":
                    solver_updates += get_adam_updates(p, g)
                elif solver_mode == "torch" or solver_mode == "nesterov":
                    solver_updates += get_torch_updates(p, g)
                else:
                    solver_updates += get_sgd_updates(p, g)

            #append per layer updates
            local_updates = solver_updates + sum(
                [layer.updates(self.train_cost) for layer in split_layers], [])

            #all updates
            self.updates += local_updates

            #skipping actual theano function building (if you just want updates, etc)
            if skip_build:
                continue

            global debug_train
            if debug_train:
                logging.warning("WARNING: Debug mode is active!")
                from theano.compile.nanguardmode import NanGuardMode
                debug_mode = theano.compile.MonitorMode(
                    post_func=debug_detect_errors)
            else:
                debug_mode = None

            if self.use_split_mode:

                if not split_end is None:
                    updates = sum(
                        [layer.split_forward() for layer in split_layers], [])
                    updates += split_end.split_forward()

                    print("fwd updates:", updates)
                    f = theano.function([self.input], [],
                                        updates=updates,
                                        givens=[(denet.layer.get_train(),
                                                 tensor.cast(1, 'int8'))],
                                        on_unused_input='ignore',
                                        mode=debug_mode)
                    self.func["train_fwd"].append(f)

                outputs = ([self.train_cost] +
                           self.cost_list) if split_end is None else []
                updates = sum([
                    layer.split_backward(split_cost, split_known_grads)
                    for layer in split_layers
                ], [])
                if not split_start is None:
                    updates += split_start.split_backward(
                        split_cost, split_known_grads)

                print("bwd updates:", updates)
                updates += local_updates
                f = theano.function([
                    denet.layer.get_epoch(), iteration, learn_rate, momentum,
                    decay, self.input
                ] + self.yt,
                                    outputs,
                                    updates=updates,
                                    givens=[(denet.layer.get_train(),
                                             tensor.cast(1, 'int8'))],
                                    on_unused_input='ignore',
                                    mode=debug_mode)
                self.func["train_bwd"].insert(0, f)

            elif use_acc_mode:
                acc_counter = theano.shared(
                    numpy.array(0, dtype=theano.config.floatX))
                begin_updates = [(acc_counter, tensor.zeros_like(acc_counter))]
                step_updates = [(acc_counter, acc_counter + 1)]
                end_updates = []
                self.acc_params = []
                for p_dest, p_src in self.updates:
                    p_acc = theano.shared(numpy.zeros(
                        p_dest.shape.eval(), dtype=theano.config.floatX),
                                          broadcastable=p_dest.broadcastable,
                                          borrow=True)
                    begin_updates.append((p_acc, tensor.zeros_like(p_acc)))
                    step_updates.append((p_acc, p_acc + p_src))
                    end_updates.append((p_dest, p_acc / acc_counter))
                    self.acc_params.append(p_acc)

                logging.info(
                    "Constructing parameter accumulate update functions (solver=%s)"
                    % solver_mode)
                self.func["train_begin"] = theano.function(
                    [], [], updates=begin_updates)
                self.func["train_step"] = theano.function(
                    [
                        denet.layer.get_epoch(), iteration, learn_rate,
                        momentum, decay, self.input
                    ] + self.yt, [self.train_cost] + self.cost_list,
                    updates=step_updates,
                    givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                    on_unused_input='ignore',
                    allow_input_downcast=True,
                    mode=debug_mode)
                self.func["train_end"] = theano.function([], [],
                                                         updates=end_updates)
            else:
                logging.info(
                    "Constructing parameter update function (solver=%s)" %
                    solver_mode)

                #making
                f_input = theano.In(self.input, borrow=True)
                f_yt = [theano.In(yt, borrow=True) for yt in self.yt]
                self.func["train_step"] = theano.function(
                    [
                        denet.layer.get_epoch(), iteration, learn_rate,
                        momentum, decay, f_input
                    ] + f_yt, [self.train_cost] + self.cost_list,
                    updates=self.updates,
                    givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                    on_unused_input='ignore',
                    allow_input_downcast=True,
                    mode=debug_mode)

                logging.verbose("Exporting graph...")
                with open("graph.txt", "w") as f:
                    theano.printing.debugprint(self.func["train_step"],
                                               file=f,
                                               print_type=True)
Example #17
    def __theano_build__(self):
        E, V, U, W, b, c, embedded = self.E, self.V, self.U, self.W, self.b, self.c, self.embedded

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, c_t1_prev, s_t2_prev, c_t2_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

            # Word embedding layer
            x_e = E[:, x_t]

            # LSTM Layer
            i_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) +
                                       b[0])
            f_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) +
                                       b[1])
            o_t1 = T.nnet.hard_sigmoid(U[2].dot(x_e) + W[2].dot(s_t1_prev) +
                                       b[2])
            g_t1 = T.tanh(U[3].dot(x_e) + W[3].dot(s_t1_prev) + b[3])
            c_t1 = c_t1_prev * f_t1 + g_t1 * i_t1
            s_t1 = T.tanh(c_t1) * o_t1

            i_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) +
                                       b[4])
            f_t2 = T.nnet.hard_sigmoid(U[5].dot(s_t1) + W[5].dot(s_t2_prev) +
                                       b[5])
            o_t2 = T.nnet.hard_sigmoid(U[6].dot(s_t1) + W[6].dot(s_t2_prev) +
                                       b[6])
            g_t2 = T.tanh(U[7].dot(s_t1) + W[7].dot(s_t2_prev) + b[7])
            c_t2 = c_t2_prev * f_t2 + g_t2 * i_t2
            s_t2 = T.tanh(c_t2) * o_t2

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, c_t1, s_t2, c_t2]

        [o, s1, cm1, s2, cm2
         ], updates = theano.scan(forward_prop_step,
                                  sequences=x,
                                  truncate_gradient=self.bptt_truncate,
                                  outputs_info=[
                                      None,
                                      dict(initial=T.zeros(self.hidden_dim)),
                                      dict(initial=T.zeros(self.hidden_dim)),
                                      dict(initial=T.zeros(self.hidden_dim)),
                                      dict(initial=T.zeros(self.hidden_dim)),
                                  ])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        if not embedded:
            self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])
        else:
            self.bptt = theano.function([x, y], [dU, dW, db, dV, dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        if not embedded:
            self.sgd_step = theano.function(
                [x, y, learning_rate,
                 theano.In(decay, value=0.9)], [],
                updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                         (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                         (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                         (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                         (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                         (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                         (self.mE, mE), (self.mU, mU), (self.mW, mW),
                         (self.mV, mV), (self.mb, mb), (self.mc, mc)])
        else:
            self.sgd_step = theano.function(
                [x, y, learning_rate,
                 theano.In(decay, value=0.9)], [],
                updates=[(U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                         (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                         (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                         (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                         (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                         (self.mU, mU), (self.mW, mW), (self.mV, mV),
                         (self.mb, mb), (self.mc, mc)])
Example #18
    print("curvLength test: ", curvLength(one, 0.0, 1.0))
    print(quad(lambda x: fLength(one, x), 0.0, 1.0))

x = theano.tensor.dscalar()
y = theano.tensor.dscalar()
h = (5000 - 0.005 * (x * x + y * y + x * y) + 12.5 *
     (x + y)) * theano.tensor.exp(-abs(0.000001 * (x * x + y * y) - 0.0015 *
                                       (x + y) + 0.7))
fh = theano.function([x, y], h)
gradH = theano.gradient.grad(h, [x, y])
gradHX = theano.function([x, y], gradH[0])
gradHY = theano.function([x, y], gradH[1])
if DEBUG:
    print(fh(0, 0), gradHX(0, 0), gradHY(0, 0), x.dtype)
fhX0 = theano.function([y, theano.In(x, value=0)], h)
gradHX0Y = theano.function([y, theano.In(x, value=0)], gradH[1])

eps = 1e-12
left = 0.0
right = 1600.0
while True:
    mid = (left + right) / 2
    if mid == left or mid == right:
        break
    if gradHX0Y(mid) > 0:
        left = mid
    else:
        right = mid

y0 = left
Example #19
    def _create_iter_funcs(self, layers, objective, update, output_type):
        y_batch = output_type('y_batch')

        objective_kw = self._get_params_for('objective')

        loss_train = objective(layers, target=y_batch, **objective_kw)
        loss_eval = objective(layers,
                              target=y_batch,
                              deterministic=True,
                              **objective_kw)

        output_layer = self._output_layers
        predict_proba = get_output(output_layer, None, deterministic=True)
        if not self.regression:
            predict = predict_proba[0].argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            accuracy = loss_eval

        scores_train = [
            s[1](predict_proba, y_batch) for s in self.scores_train
        ]
        scores_valid = [
            s[1](predict_proba, y_batch) for s in self.scores_valid
        ]

        all_params = self.get_all_params(trainable=True)
        grads = theano.grad(loss_train, all_params)
        for idx, param in enumerate(all_params):
            grad_scale = getattr(param.tag, 'grad_scale', 1)
            if grad_scale != 1:
                grads[idx] *= grad_scale
        update_params = self._get_params_for('update')
        updates = update(grads, all_params, **update_params)

        input_layers = [
            layer for layer in layers.values()
            if isinstance(layer, InputLayer)
        ]

        X_inputs = [
            theano.In(input_layer.input_var, name=input_layer.name)
            for input_layer in input_layers
        ]
        inputs = X_inputs + [theano.In(y_batch, name="y")]

        train_iter = theano.function(
            inputs=inputs,
            outputs=[loss_train] + scores_train,
            updates=updates,
            allow_input_downcast=True,
        )
        eval_iter = theano.function(
            inputs=inputs,
            outputs=[loss_eval, accuracy] + scores_valid,
            allow_input_downcast=True,
        )
        predict_iter = theano.function(
            inputs=X_inputs,
            outputs=predict_proba,
            allow_input_downcast=True,
        )

        return train_iter, eval_iter, predict_iter
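
A hedged usage sketch for the three returned functions, assuming a single InputLayer and NumPy minibatches `Xb`, `yb` of matching dtypes.

# hypothetical call pattern on the returned functions
train_outputs = train_iter(Xb, yb)   # [loss_train] + scores_train
eval_outputs = eval_iter(Xb, yb)     # [loss_eval, accuracy] + scores_valid
probas = predict_iter(Xb)            # predicted probabilities per output layer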
Example #20
out_2 = (1 + T.tanh(x / 2)) / 2

logistic = function([x], out)
logistic_2 = function([x], out_2)

a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(diff)
diff_squared = diff**2
f = function([a, b], [diff, diff_squared, abs_diff])

# setting a default value for an argument
x, y, w = T.dscalars('x', 'y', 'w')
z = (x + y) * w
f = function(
    [x, theano.In(y, value=1),
     theano.In(w, value=2, name='w_by_name')], z)
# print(f(33))
# print(f(33, w_by_name = 10, y = 2))

# Using shared Variables
state = shared(0)
inc = T.iscalar('inc')
accumulator = function([inc], state, updates=[(state, state + inc)])
decrementor = function([inc], state, updates=[(state, state - inc)])

fn_of_state = state * 2 + inc
foo = T.scalar(dtype=state.dtype)

skip_shared = function([inc, foo], fn_of_state, givens=[(state, foo)])
skip_shared(1, 3)
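
A short continuation, assuming the snippet runs top to bottom with a fresh `state`; it exercises the commented-out default-value calls and the shared-variable updates.

# default-valued inputs: y defaults to 1, w defaults to 2 (named 'w_by_name')
print(f(33))                    # (33 + 1) * 2 = 68.0
print(f(33, 2, w_by_name=10))   # (33 + 2) * 10 = 350.0

# shared state is modified through `updates`
print(state.get_value())        # 0
accumulator(10)
print(state.get_value())        # 10
decrementor(3)
print(state.get_value())        # 7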
Example #21
def train(args, trial=11, no_valid=False):
    # Creating unique strings to save for experiments.
    data_valid = "data/"+args.data_name+"_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)
    data_test = data_valid.replace("_valid_size", "_test_size")
    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = "data/"+args.data_name+"_trial_"+str(trial)+"_train_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)

    subStr = "rnn_type_"+args.rnn_type + "_trial_"+str(trial) + "_hiddenSize_"+str(args.hidden_size)+\
    "_numLayers_"+str(args.num_layers)+ \
    "_dropout_"+str(args.dropout)+"_train_size_"+str(args.train_size) + "_transitions_"+str(args.transitions)+\
    "_novalid_"+str(args.no_valid)

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")

        data_valid = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)
        data_test = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_test_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)

    print("on test: " + subStr)
    # Perform folder prefixing
    prefix_path = models_folder + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = plots_output + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(data_train,
                                                     sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(data_valid,
                                                   sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(data_test,
                                                   sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'

    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'

    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'

    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'

    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'

    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'

    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'

    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'

    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX))
    ]

    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8'))
    ]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x,
        x_mask_final,
        y,
        y_mask_final,
        lens,
        vocab_size,
        hidden_size,
        num_layers,
        rnn_type,
        boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations
    fRR = function(inputs=[
        theano.In(batch_index_From, borrow=True),
        theano.In(batch_index_To, borrow=True)
    ],
                   updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                            (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions

    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        #this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        #train_monitor = DataStreamMonitoringPlot(variables=[cost],
        #                    data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(after_epoch=True,
                                             before_epoch=False,
                                             data_stream=dev_stream,
                                             prefix="valid",
                                             fRR=fRR,
                                             sharedVars=sharedSUMVARs,
                                             sharedDataTrain=sharedDataTrain,
                                             sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=True,
                                   after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))
    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
Example #22
vU2_upd = beta2 * vU2 + (1 - beta2) * dU2**2
vW1_upd = beta2 * vW1 + (1 - beta2) * dW1**2
vW2_upd = beta2 * vW2 + (1 - beta2) * dW2**2
vb1_upd = beta2 * vb1 + (1 - beta2) * db1**2
vb2_upd = beta2 * vb2 + (1 - beta2) * db2**2
vV_upd = beta2 * vV + (1 - beta2) * dV**2
vc_upd = beta2 * vc + (1 - beta2) * dc**2

learning_rate_upd = learning_rate * T.cast(T.sqrt(
    (1 - beta2**t_upd) / (1 - beta1**t_upd)),
                                           dtype='float32')

apply_grads = theano.function(
    [
        x, learning_rate,
        theano.In(beta1, value=0.9),
        theano.In(beta2, value=0.99),
        theano.In(epsilon, value=1e-16)
    ], [],
    updates=[
        (U1, U1 - learning_rate_upd * mU1_upd / (T.sqrt(vU1_upd) + epsilon)),
        (U2, U2 - learning_rate_upd * mU2_upd / (T.sqrt(vU2_upd) + epsilon)),
        (W1, W1 - learning_rate_upd * mW1_upd / (T.sqrt(vW1_upd) + epsilon)),
        (W2, W2 - learning_rate_upd * mW2_upd / (T.sqrt(vW2_upd) + epsilon)),
        (b1, b1 - learning_rate_upd * mb1_upd / (T.sqrt(vb1_upd) + epsilon)),
        (b2, b2 - learning_rate_upd * mb2_upd / (T.sqrt(vb2_upd) + epsilon)),
        (V, V - learning_rate_upd * mV_upd / (T.sqrt(vV_upd) + epsilon)),
        (c, c - learning_rate_upd * mc_upd / (T.sqrt(vc_upd) + epsilon)),
        (mU1, mU1_upd), (mU2, mU2_upd), (mW1, mW1_upd), (mW2, mW2_upd),
        (mb1, mb1_upd), (mb2, mb2_upd), (mV, mV_upd), (mc, mc_upd),
        (vU1, vU1_upd), (vU2, vU2_upd), (vW1, vW1_upd), (vW2, vW2_upd),
Example #23
    def __theano_build(self):
        E, U, W, V, b, c = self.E, self.U, self.W, self.V, self.b, self.c
        x = T.fmatrix('x')
        y = T.fvector('y')

        #implementation of ReLU activator
        def ReLU(x):
            return T.switch(x < 0, 0, x)

        def forward_prop_step(x_t, s_prev):
            #Embedding Layer with ReLU non-linearity
            x_e = ReLU(E.dot(x_t))
            # GRU Layer 1
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_prev) + b[0])
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_prev) + b[1])
            c_t = ReLU(U[2].dot(x_e) + W[2].dot(s_prev * r_t) + b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_prev
            #prediction at time t+1
            o_t = V.dot(s_t) + c

            return [o_t, s_t]

        #feed-forward for training example.
        #initializing the hidden state with first 8 steps
        [o, s1], updates1 = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))])

        #using first 8 steps to predict the future trajectory
        loss = T.dot((o[-1] - y), (o[-1] - y))

        # back-propagation through time; truncation is handled when computing o.
        dE = T.grad(loss, E)
        dU = T.grad(loss, U)
        dW = T.grad(loss, W)
        db = T.grad(loss, b)
        dV = T.grad(loss, V)
        dc = T.grad(loss, c)

        #Stochastic Gradient Descent
        #sgd parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        #RMSProp updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        # 1e-6 guards against division by zero
        # gradient-descent update of the parameters
        self.sgd_step = theano.function(
            [x, y, learning_rate,
             theano.In(decay, value=0.9)], [],
            allow_input_downcast=True,
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE), (self.mU, mU), (self.mW, mW),
                     (self.mV, mV), (self.mb, mb), (self.mc, mc)])

        self.predict = theano.function([x], o[-1], allow_input_downcast=True)
        self.loss = theano.function([x, y], loss, allow_input_downcast=True)

        def cost(X, Y):
            return (np.sum([self.loss(x, y) for x, y in zip(X, Y)])) / len(X)

        self.cost = cost
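
The ReLU helper in the example above is written with T.switch. As a quick sanity check (a standalone sketch, assuming a Theano version that ships T.nnet.relu), it agrees with the built-in activation:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
relu_switch = theano.function([x], T.switch(x < 0, 0, x))
relu_builtin = theano.function([x], T.nnet.relu(x))

a = np.array([[-1.0, 2.0], [0.5, -3.0]], dtype=theano.config.floatX)
print(np.allclose(relu_switch(a), relu_builtin(a)))  # True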
Example #24
0
mV_upd = beta1 * mV + (1 - beta1) * dV
mc_upd = beta1 * mc + (1 - beta1) * dc

vU1_upd = beta2 * vU1 + (1 - beta2) * dU1 ** 2
vU2_upd = beta2 * vU2 + (1 - beta2) * dU2 ** 2
vW1_upd = beta2 * vW1 + (1 - beta2) * dW1 ** 2
vW2_upd = beta2 * vW2 + (1 - beta2) * dW2 ** 2
vb1_upd = beta2 * vb1 + (1 - beta2) * db1 ** 2
vb2_upd = beta2 * vb2 + (1 - beta2) * db2 ** 2
vV_upd = beta2 * vV + (1 - beta2) * dV ** 2
vc_upd = beta2 * vc + (1 - beta2) * dc ** 2

learning_rate_upd = learning_rate * T.cast(T.sqrt((1 - beta2 ** t_upd) / (1 - beta1 ** t_upd)), dtype='float32')

apply_grads = theano.function(
    [x, learning_rate, theano.In(beta1, value=0.9), theano.In(beta2, value=0.99),
     theano.In(epsilon, value=1e-16)],
    [],
    updates=[(U1, U1 - learning_rate_upd * mU1_upd / (T.sqrt(vU1_upd) + epsilon)),
             (U2, U2 - learning_rate_upd * mU2_upd / (T.sqrt(vU2_upd) + epsilon)),
             (W1, W1 - learning_rate_upd * mW1_upd / (T.sqrt(vW1_upd) + epsilon)),
             (W2, W2 - learning_rate_upd * mW2_upd / (T.sqrt(vW2_upd) + epsilon)),
             (b1, b1 - learning_rate_upd * mb1_upd / (T.sqrt(vb1_upd) + epsilon)),
             (b2, b2 - learning_rate_upd * mb2_upd / (T.sqrt(vb2_upd) + epsilon)),
             (V, V - learning_rate_upd * mV_upd / (T.sqrt(vV_upd) + epsilon)),
             (c, c - learning_rate_upd * mc_upd / (T.sqrt(vc_upd) + epsilon)),            
             (mU1, mU1_upd),
             (mU2, mU2_upd),
             (mW1, mW1_upd),
             (mW2, mW2_upd),
             (mb1, mb1_upd),
Example #25
0
    def init_function(self):
        sigmoid, tanh = T.nnet.sigmoid, T.tanh
        logging.info('init function...')

        self.seq_idx = T.lvector()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)

        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(
            self.bc, dtype=theano.config.floatX)

        def encode(x_t, h_fore, c_fore):
            v = T.concatenate([h_fore, x_t])
            f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
            i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
            o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
            c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next

        scan_result, _ = theano.scan(fn=encode,
                                     sequences=[self.seq_matrix],
                                     outputs_info=[h, c])
        embedding = scan_result[0][-1]

        self.use_noise = theano.shared(
            np.asarray(0., dtype=theano.config.floatX))

        if self.dropout == 1:
            embedding_for_train = embedding * self.srng.binomial(
                embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
            embedding_for_test = embedding * 0.5
        else:
            embedding_for_train = embedding
            embedding_for_test = embedding

        self.pred_for_train = T.nnet.softmax(
            T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(
            T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param**2)
                       for param in self.params]) - T.sum(self.Vw**2)
        self.loss_sen = -T.tensordot(
            self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        logging.info('getting grads...')
        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                                         dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        logging.info("compiling func of train...")
        self.func_train = theano.function(
            inputs=[
                self.seq_idx, self.solution,
                theano.In(h, value=self.h0),
                theano.In(c, value=self.c0)
            ],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')
        logging.info("compiling func of test...")
        self.func_test = theano.function(inputs=[
            self.seq_idx,
            theano.In(h, value=self.h0),
            theano.In(c, value=self.c0)
        ],
                                         outputs=self.pred_for_test,
                                         on_unused_input='warn')
        self.func_encode = theano.function(inputs=[
            self.seq_idx,
            theano.In(h, value=self.h0),
            theano.In(c, value=self.c0)
        ],
                                           outputs=embedding,
                                           on_unused_input='warn')
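
The three compiled functions above use theano.In(h, value=self.h0) and theano.In(c, value=self.c0) so callers get default initial states but can still override them per call. A minimal sketch of that default-initial-state pattern, using a scan that accumulates a running sum (names are hypothetical):

import numpy as np
import theano
import theano.tensor as T

seq = T.vector('seq')
h = T.scalar('h')  # initial state of the accumulator

outputs, _ = theano.scan(fn=lambda x_t, h_prev: h_prev + x_t,
                         sequences=[seq],
                         outputs_info=[h])

run = theano.function(
    [seq, theano.In(h, value=np.asarray(0., dtype=theano.config.floatX))],
    outputs[-1],
    allow_input_downcast=True)

print(run([1, 2, 3]))        # 6.0 with the default initial state
print(run([1, 2, 3], 10.0))  # 16.0 with an explicit initial state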
Example #26
0
    def build_minibatch(self, batch_size):
        '''
        Build the minibatch training functions.
        The input `x` has dimensions n_steps * batch_size * embed_dim.
        '''
        V, U, W, b, c = self.V, self.U, self.W, self.b, self.c

        x = T.tensor3('x')
        y = T.ivector('y')
        m = T.ivector('mask')
        self.batch_size = batch_size

        def forward_prop_step(x_t, s_t_prev):
            # GRU Layer
            z_t = T.nnet.hard_sigmoid(T.dot(x_t, U[0]) + T.dot(s_t_prev, W[0]) + b[0])
            r_t = T.nnet.hard_sigmoid(T.dot(x_t, U[1]) + T.dot(s_t_prev, W[1]) + b[1])
            c_t = T.tanh(T.dot(x_t, U[2]) + T.dot((s_t_prev*r_t), W[2]) + b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            y_t = T.nnet.softmax(T.dot(s_t, V) + c)

            return [s_t, y_t]


        [s, y_t], _ = theano.scan(
            forward_prop_step,
            sequences=[x],
            truncate_gradient=self.bptt_truncate,
            outputs_info=[dict(initial=T.zeros((batch_size, self.hidden_dim))), None])

        # Final output calculation
        # Theano's softmax returns a matrix with one row; we only need that row
        # p_y = T.nnet.softmax(T.dot(s[-1], V) + c)  # [0]

        y_t = y_t.dimshuffle((1,0,2)).reshape((y_t.shape[0]*y_t.shape[1], y_t.shape[2]))
        y_t1 = y_t[np.nonzero(m)]
        p_y = T.argmax(y_t1, axis=1)
        o_error = T.mean(T.nnet.categorical_crossentropy(y_t1, y))

        # Total cost (could add regularization here)
        self.cost = o_error

        # Assign functions
        self.predict = theano.function([x, m], y_t1)
        self.predict_class = theano.function([x, m], p_y)
        self.ce_error = theano.function([x, y, m], self.cost)

        # Gradients
        dU = T.grad(self.cost, U)
        dW = T.grad(self.cost, W)
        db = T.grad(self.cost, b)
        dV = T.grad(self.cost, V)
        dc = T.grad(self.cost, c)

        self.bptt = theano.function([x, y, m], [dU, dW, db, dV, dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mU = decay * self.mU + (1 - decay) * dU ** 2
        mW = decay * self.mW + (1 - decay) * dW ** 2
        mV = decay * self.mV + (1 - decay) * dV ** 2
        mb = decay * self.mb + (1 - decay) * db ** 2
        mc = decay * self.mc + (1 - decay) * dc ** 2

        self.f_update = theano.function(
            [x, y, m, learning_rate, theano.In(decay, value=0.9)],
            [],
            updates=[
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc)
                    ])
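
The dimshuffle/reshape/mask sequence above turns the time-major scan output into a flat (batch * steps, n_classes) matrix and drops the padded positions. A small numpy illustration of the same reshuffle (shapes and mask values are hypothetical):

import numpy as np

n_steps, batch_size, n_classes = 4, 2, 3
y_t = np.arange(n_steps * batch_size * n_classes,
                dtype='float32').reshape((n_steps, batch_size, n_classes))

# time-major (steps, batch, classes) -> batch-major, then flatten the rows,
# mirroring y_t.dimshuffle((1, 0, 2)).reshape(...) in the Theano graph
flat = y_t.transpose((1, 0, 2)).reshape((batch_size * n_steps, n_classes))

# keep only the rows whose mask entry is non-zero (real, non-padded steps)
mask = np.array([1, 1, 1, 0, 1, 1, 0, 0])
valid = flat[np.nonzero(mask)]
print(valid.shape)  # (5, 3)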
Example #27
0
    def fit(self,
            X,
            bounds=None,
            constraints=None,
            use_gradient=True,
            optimizer=None,
            **kwargs):
        """Fit the distribution parameters to data by minimizing the negative
        log-likelihood of the data.

        Parameters
        ----------
        * `X` [array-like, shape=(n_samples, n_features)]:
            The samples.

        * `bounds` [list of (parameter, (low, high))]:
            The parameter bounds.

        * `constraints`:
            The constraints on the parameters.

        * `use_gradient` [boolean, default=True]:
            Whether to use exact gradients (if `True`) or numerical gradients
            (if `False`).

        * `optimizer` [string]:
            The optimization method.

        Returns
        -------
        * `self` [object]:
            `self`.
        """
        # Map parameters to placeholders
        param_to_placeholder = []
        param_to_index = {}

        for i, v in enumerate(self.parameters_):
            w = T.TensorVariable(v.type)
            param_to_placeholder.append((v, w))
            param_to_index[v] = i

        # Build bounds
        mapped_bounds = None

        if bounds is not None:
            mapped_bounds = [(None, None) for v in param_to_placeholder]

            for b in bounds:
                mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

        # Build constraints
        mapped_constraints = None

        if constraints is not None:
            mapped_constraints = []

            for c in constraints:
                args = c["param"]
                if isinstance(args, SharedVariable):
                    args = (args, )

                # Bind c and args as default arguments so each constraint
                # keeps its own values rather than closing over the loop variables.
                m_c = {
                    "type": c["type"],
                    "fun": lambda x, c=c, args=args:
                        c["fun"](*[x[param_to_index[a]] for a in args])
                }

                if "jac" in c:
                    m_c["jac"] = lambda x, c=c, args=args: c["jac"](
                        *[x[param_to_index[a]] for a in args])

                mapped_constraints.append(m_c)

        # Derive objective and gradient
        objective_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            T.sum(self.nll_),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def objective(x):
            return objective_(X, *x, **kwargs) / len(X)

        if use_gradient:
            gradient_ = theano.function(
                [self.X] + [w for _, w in param_to_placeholder] +
                [theano.In(v, name=v.name) for v in self.observeds_],
                theano.grad(T.sum(self.nll_),
                            [v for v, _ in param_to_placeholder]),
                givens=param_to_placeholder,
                allow_input_downcast=True)

            def gradient(x):
                return np.array(gradient_(X, *x, **kwargs)) / len(X)

        # Solve!
        x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
        r = minimize(objective,
                     jac=gradient if use_gradient else None,
                     x0=x0,
                     method=optimizer,
                     bounds=mapped_bounds,
                     constraints=mapped_constraints)

        if r.success:
            # Assign the solution
            for i, value in enumerate(r.x):
                param_to_placeholder[i][0].set_value(value)

        else:
            print("Parameter fitting failed!")
            print(r)

        return self
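
The compiled objective_ above relies on givens to substitute a plain placeholder for every shared parameter, so scipy's optimizer can evaluate the negative log-likelihood at arbitrary parameter values without mutating the model. A minimal sketch of that substitution for a single parameter; the variable names and the toy objective are hypothetical:

import numpy as np
import theano
import theano.tensor as T

X = T.vector('X')
mu = theano.shared(np.asarray(0., dtype=theano.config.floatX), name='mu')
nll = T.sum((X - mu) ** 2)  # stand-in for the model's negative log-likelihood

mu_ph = T.scalar('mu_ph')   # placeholder the optimizer will drive
objective_ = theano.function([X, mu_ph], nll,
                             givens={mu: mu_ph},
                             allow_input_downcast=True)

print(objective_([1.0, 2.0, 3.0], 2.0))  # 2.0, evaluated at mu=2 while the
                                         # shared variable still holds 0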
Example #28
0
# 5 - theano.function
"""
Note: this code is written for Python 3. If you are using Python 2, modify it accordingly.
"""
from __future__ import print_function
import numpy as np
import theano
import theano.tensor as T

# activation function example
x = T.dmatrix('x')
s = 1 / (1 + T.exp(-x))  # logistic or soft step
logistic = theano.function([x], s)
print(logistic([[0, 1], [-1, -2]]))

# multiply outputs for a function
a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(diff)
diff_squared = diff**2
f = theano.function([a, b], [diff, abs_diff, diff_squared])
print(f(np.ones((2, 2)), np.arange(4).reshape((2, 2))))

# default value and name for a function
x, y, w = T.dscalars('x', 'y', 'w')
z = (x + y) * w
f = theano.function(
    [x, theano.In(y, value=1),
     theano.In(w, value=2, name='weights')], z)
print(f(23, 2, weights=4))
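
Because both y and weights carry defaults, the same function can also be called with just the required argument:

print(f(23))  # uses the defaults y=1 and weights=2, giving (23 + 1) * 2 = 48.0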
Example #29
0
    def fit(self, X, bounds=None, constraints=None, use_gradient=True,
            **kwargs):
        # Map parameters to placeholders
        param_to_placeholder = []
        param_to_index = {}

        for i, v in enumerate(self.parameters_):
            w = T.TensorVariable(v.type)
            param_to_placeholder.append((v, w))
            param_to_index[v] = i

        # Build bounds
        mapped_bounds = None

        if bounds is not None:
            mapped_bounds = [(None, None) for v in param_to_placeholder]

            for b in bounds:
                mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

        # Build constraints
        mapped_constraints = None

        if constraints is not None:
            mapped_constraints = []

            for c in constraints:
                args = c["param"]
                if isinstance(args, SharedVariable):
                    args = (args, )

                # Bind c and args as default arguments so each constraint
                # keeps its own values rather than closing over the loop variables.
                m_c = {
                    "type": c["type"],
                    "fun": lambda x, c=c, args=args:
                        c["fun"](*[x[param_to_index[a]] for a in args])
                }

                if "jac" in c:
                    m_c["jac"] = lambda x, c=c, args=args: c["jac"](
                        *[x[param_to_index[a]] for a in args])

                mapped_constraints.append(m_c)

        # Derive objective and gradient
        objective_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            T.sum(self.nnlf_),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def objective(x):
            return objective_(X, *x, **kwargs) / len(X)

        if use_gradient:
            gradient_ = theano.function(
                [self.X] + [w for _, w in param_to_placeholder] +
                [theano.In(v, name=v.name) for v in self.observeds_],
                theano.grad(T.sum(self.nnlf_),
                            [v for v, _ in param_to_placeholder]),
                givens=param_to_placeholder,
                allow_input_downcast=True)

            def gradient(x):
                return np.array(gradient_(X, *x, **kwargs)) / len(X)

        # Solve!
        x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
        r = minimize(objective,
                     jac=gradient if use_gradient else None,
                     x0=x0,
                     method=self.optimizer,
                     bounds=mapped_bounds,
                     constraints=mapped_constraints)

        if r.success:
            # Assign the solution
            for i, value in enumerate(r.x):
                param_to_placeholder[i][0].set_value(value)

        else:
            print("Parameter fitting failed!")
            print(r)

        return self
Example #30
0
    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            # Word embedding layer
            x_e = E[:, x_t]

            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) +
                                       b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) +
                                       b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) +
                                       b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) +
                                       b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # GRU Layer 3
            z_t3 = T.nnet.hard_sigmoid(U[6].dot(s_t2) + W[6].dot(s_t3_prev) +
                                       b[6])
            r_t3 = T.nnet.hard_sigmoid(U[7].dot(s_t2) + W[7].dot(s_t3_prev) +
                                       b[7])
            c_t3 = T.tanh(U[8].dot(s_t2) + W[8].dot(s_t3_prev * r_t3) + b[8])
            s_t3 = (T.ones_like(z_t3) - z_t3) * c_t3 + z_t3 * s_t3_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t3) + c)[0]

            return [o_t, s_t1, s_t2, s_t3]

        [o, s, s2,
         s3], updates = theano.scan(forward_prop_step,
                                    sequences=x,
                                    truncate_gradient=self.bptt_truncate,
                                    outputs_info=[
                                        None,
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim))
                                    ])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        p_o = printing.Print('o_error')
        # Total cost (could add regularization here)
        cost = p_o(o_error)

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], [o], allow_input_downcast=True)
        self.predict_class = theano.function([x],
                                             prediction,
                                             allow_input_downcast=True)
        self.ce_error = theano.function([x, y],
                                        cost,
                                        allow_input_downcast=True)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc],
                                    allow_input_downcast=True)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        self.sgd_step = theano.function(
            [x, y, learning_rate,
             theano.In(decay, value=0.9)], [],
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE), (self.mU, mU), (self.mW, mW),
                     (self.mV, mV), (self.mb, mb), (self.mc, mc)],
            allow_input_downcast=True)
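
The printing.Print('o_error') wrapper above is an identity op with a printing side effect: it leaves the graph's value unchanged but prints it every time the compiled function runs, which is a handy way to watch the cost during training. A tiny standalone sketch of the pattern:

import theano
import theano.tensor as T
from theano import printing

x = T.vector('x')
x_printed = printing.Print('x value')(x)  # behaves like x, but prints its value
f = theano.function([x], x_printed * 2, allow_input_downcast=True)
f([1, 2, 3])  # prints the contents of x, then returns [2., 4., 6.]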