Example #1
class Leaf:
    """
    special fields
    + params : white list of params to optimize
    + updates : white list of updates to optimize
    """
    is_train = tt.bscalar()

    def __call__(self, *args, **kwargs):
        raise NotImplementedError("implement in a derived class")

    def get_params(self):
        if hasattr(self, "params"):
            return search_shared(self.params)
        return search_shared(getattr(self, s) for s in dir(self))

    def get_updates(self):
        return search_updates(self)

    def optimize(self, cost: tt.Variable, optimizer: Optimizer):
        updates = optimizer.updates(self.get_params(), cost)
        updates.update(self.get_updates())
        return updates

    def state_list(self):
        return list(p.get_value() for p in self.get_params())

    def load_state_list(self, state):
        for p, s in zip(self.get_params(), state):
            p.set_value(s)
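A recurring idiom in these examples is a bscalar acting as a boolean train/test switch inside the computation graph (here, the class-level is_train flag). As a point of reference, this is a minimal, self-contained sketch of that idiom using plain Theano only; the names and the toy dropout-style branch are illustrative, not taken from the snippet above.

import theano
import theano.tensor as tt

is_train = tt.bscalar('is_train')      # int8 scalar used as a boolean flag
x = tt.vector('x')
# toy stand-in for a train-time vs. test-time branch (e.g. dropout scaling)
out = tt.switch(is_train, x * 0.5, x)

f = theano.function([x, is_train], out)
print(f([2.0, 4.0], 1))                # training branch
print(f([2.0, 4.0], 0))                # inference branch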
Example #2
    def __init__(self, data, config, fast_predict=False):
        self.embedding_shapes = data.embedding_shapes
        self.lstm_type = config.lstm_cell
        self.lstm_hidden_size = int(config.lstm_hidden_size)
        self.num_lstm_layers = int(config.num_lstm_layers)
        self.max_grad_norm = float(config.max_grad_norm)

        self.vocab_size = data.word_dict.size()
        self.label_space_size = data.label_dict.size()
        self.unk_id = data.unk_id

        # Initialize layers and parameters
        self.embedding_layer = EmbeddingLayer(data.embedding_shapes,
                                              data.embeddings)
        self.params = [p for p in self.embedding_layer.params]

        self.rnn_layers = [None] * self.num_lstm_layers
        for l in range(self.num_lstm_layers):
            input_dim = self.embedding_layer.output_size if l == 0 else self.lstm_hidden_size
            input_dropout = config.input_dropout_prob if (
                config.per_layer_dropout or l == 0) else 0.0
            recurrent_dropout = config.recurrent_dropout_prob

            self.rnn_layers[l] = get_rnn_layer(self.lstm_type)(
                input_dim,
                self.lstm_hidden_size,
                input_dropout_prob=input_dropout,
                recurrent_dropout_prob=recurrent_dropout,
                fast_predict=fast_predict,
                prefix='lstm_{}'.format(l))
            self.params.extend(self.rnn_layers[l].params)

        self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size,
                                          self.label_space_size)
        self.params.extend(self.softmax_layer.params)

        # Build model
        # Shape of x: [seq_len, batch_size, num_features]
        self.x0 = tensor.ltensor3('x')
        self.y0 = tensor.lmatrix('y')
        self.mask0 = tensor.matrix('mask', dtype=floatX)
        self.is_train = tensor.bscalar('is_train')

        self.x = self.x0.dimshuffle(1, 0, 2)
        self.y = self.y0.dimshuffle(1, 0)
        self.mask = self.mask0.dimshuffle(1, 0)

        self.inputs = [None] * (self.num_lstm_layers + 1)
        self.inputs[0] = self.embedding_layer.connect(self.x)
        self.rev_mask = self.mask[::-1]

        for l, rnn in enumerate(self.rnn_layers):
            outputs = rnn.connect(self.inputs[l],
                                  self.mask if l % 2 == 0 else self.rev_mask,
                                  self.is_train)
            self.inputs[l + 1] = outputs[::-1]

        self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
        self.pred0 = self.pred.reshape([self.x.shape[0],
                                        self.x.shape[1]]).dimshuffle(1, 0)
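The layer stack above alternates direction by transposing the batch-major input to time-major with dimshuffle and feeding every other layer the reversed sequence (and rev_mask). A hedged, Theano-only sketch of just that reversal idiom, with made-up shapes:

import numpy as np
import theano
import theano.tensor as T

x0 = T.tensor3('x')                 # [batch, time, features]
x = x0.dimshuffle(1, 0, 2)          # -> [time, batch, features]
x_rev = x[::-1]                     # reverse along the time axis

f = theano.function([x0], [x, x_rev])
out, out_rev = f(np.arange(12, dtype=theano.config.floatX).reshape(2, 3, 2))
print(out.shape, out_rev.shape)     # (3, 2, 2) (3, 2, 2)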
Example #3
def test_replacements(binomial_model_inference):
    d = tt.bscalar()
    d.tag.test_value = 1
    approx = binomial_model_inference.approx
    p = approx.model.p
    p_t = p ** 3
    p_s = approx.sample_node(p_t)
    if theano.config.compute_test_value != 'off':
        assert p_s.tag.test_value.shape == p_t.tag.test_value.shape
    sampled = [p_s.eval() for _ in range(100)]
    assert any(map(
        operator.ne,
        sampled[1:], sampled[:-1])
    )  # stochastic

    p_d = approx.sample_node(p_t, deterministic=True)
    sampled = [p_d.eval() for _ in range(100)]
    assert all(map(
        operator.eq,
        sampled[1:], sampled[:-1])
    )  # deterministic

    p_r = approx.sample_node(p_t, deterministic=d)
    sampled = [p_r.eval({d: 1}) for _ in range(100)]
    assert all(map(
        operator.eq,
        sampled[1:], sampled[:-1])
    )  # deterministic
    sampled = [p_r.eval({d: 0}) for _ in range(100)]
    assert any(map(
        operator.ne,
        sampled[1:], sampled[:-1])
    )  # stochastic
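The test above drives the deterministic flag through Variable.eval({d: ...}), evaluating the same symbolic expression with the bscalar set to 1 or 0. Stripped of the PyMC3 machinery, the mechanism is just this (plain Theano, illustrative expression):

import theano.tensor as tt

d = tt.bscalar('d')
expr = tt.switch(d, tt.constant(1.0), tt.constant(0.0))
print(expr.eval({d: 1}))   # 1.0
print(expr.eval({d: 0}))   # 0.0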
Example #4
    def random_fn(self):
        """
        Implements posterior distribution from initial latent space

        Parameters
        ----------
        size : number of samples from distribution
    no_rand : whether to use the deterministic distribution

        Returns
        -------
        posterior space (numpy)
        """
        In = theano.In
        size = tt.iscalar('size')
        no_rand = tt.bscalar('no_rand')
        posterior = self.random(size, no_rand)
        fn = theano.function([
            In(size, 'size', 1, allow_downcast=True),
            In(no_rand, 'no_rand', 0, allow_downcast=True)
        ], posterior)

        def inner(size=None, no_rand=False):
            if size is None:
                return fn(1, int(no_rand))[0]
            else:
                return fn(size, int(no_rand))

        return inner
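The key detail in random_fn is the theano.In(...) wrapper, which gives the compiled function default values for size and no_rand and allows the Python ints passed by inner() to be downcast. A self-contained sketch of just that pattern; the output expression is a toy stand-in for self.random(size, no_rand):

import theano
import theano.tensor as tt
from theano import In

size = tt.iscalar('size')
no_rand = tt.bscalar('no_rand')
out = tt.arange(size) * (1 - no_rand)   # toy stand-in for self.random(size, no_rand)

fn = theano.function([In(size, value=1, allow_downcast=True),
                      In(no_rand, value=0, allow_downcast=True)], out)
print(fn())        # falls back to the defaults: size=1, no_rand=0
print(fn(3, 1))    # both arguments supplied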
Example #5
    def test_param_allow_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8
        f = pfunc([
            Param(a, allow_downcast=True),
            Param(b, allow_downcast=False),
            Param(c, allow_downcast=None)
        ], (a + b + c))

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert numpy.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, f, [3], numpy.array([6], dtype='int16'),
                          1)

        # Value too big for a, silently ignored
        assert numpy.all(f([2**20], numpy.ones(1, dtype='int8'), 1) == 2)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, f, [3], [312], 1)

        # Value too big for c, raises TypeError
        self.assertRaises(TypeError, f, [3], [6], 806)
Example #6
    def test_allow_input_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8

        f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
        # Value too big for a, b, or c, silently ignored
        assert f([2**20], [1], 0) == 1
        assert f([3], [312], 0) == 59
        assert f([3], [1], 806) == 42

        g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
        # All values are in range. Since they're not ndarrays (but lists
        # or scalars), they will be converted, and their value checked.
        assert numpy.all(g([3], [6], 0) == 9)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, g, [3], numpy.array([6], dtype='int16'),
                          0)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, g, [3], [312], 0)

        h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
        # Everything here should behave like with False
        assert numpy.all(h([3], [6], 0) == 9)
        self.assertRaises(TypeError, h, [3], numpy.array([6], dtype='int16'),
                          0)
        self.assertRaises(TypeError, h, [3], [312], 0)
Example #7
    def test_param_allow_downcast_int(self):
        a = tensor.wvector("a")  # int16
        b = tensor.bvector("b")  # int8
        c = tensor.bscalar("c")  # int8
        f = pfunc(
            [
                In(a, allow_downcast=True),
                In(b, allow_downcast=False),
                In(c, allow_downcast=None),
            ],
            (a + b + c),
        )

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert np.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        with pytest.raises(TypeError):
            f([3], np.array([6], dtype="int16"), 1)

        # Value too big for a, silently ignored
        assert np.all(f([2**20], np.ones(1, dtype="int8"), 1) == 2)

        # Value too big for b, raises TypeError
        with pytest.raises(TypeError):
            f([3], [312], 1)

        # Value too big for c, raises TypeError
        with pytest.raises(TypeError):
            f([3], [6], 806)
Example #8
    def test_param_allow_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8
        f = pfunc([Param(a, allow_downcast=True),
                   Param(b, allow_downcast=False),
                   Param(c, allow_downcast=None)],
                  (a + b + c))

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert numpy.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, f,
                [3], numpy.array([6], dtype='int16'), 1)

        # Value too big for a, silently ignored
        assert numpy.all(f([2 ** 20], numpy.ones(1, dtype='int8'), 1) == 2)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, f, [3], [312], 1)

        # Value too big for c, raises TypeError
        self.assertRaises(TypeError, f, [3], [6], 806)
Example #9
    def test_allow_input_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8

        f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
        # Value too big for a, b, or c, silently ignored
        assert f([2 ** 20], [1], 0) == 1
        assert f([3], [312], 0) == 59
        assert f([3], [1], 806) == 42

        g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
        # All values are in range. Since they're not ndarrays (but lists
        # or scalars), they will be converted, and their value checked.
        assert numpy.all(g([3], [6], 0) == 9)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, g,
                [3], numpy.array([6], dtype='int16'), 0)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, g, [3], [312], 0)

        h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
        # Everything here should behave like with False
        assert numpy.all(h([3], [6], 0) == 9)
        self.assertRaises(TypeError, h,
                [3], numpy.array([6], dtype='int16'), 0)
        self.assertRaises(TypeError, h, [3], [312], 0)
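The five tests above all exercise the same rule: allow_downcast (set per input) and allow_input_downcast (set per function) decide whether out-of-range Python values are silently truncated into narrow integer types such as the int8 behind bscalar, or rejected with a TypeError; the default (None) rejects them. A distilled, stand-alone sketch assuming only Theano:

import theano
import theano.tensor as T

c = T.bscalar('c')                      # int8 scalar
x = T.wvector('x')                      # int16 vector
out = x + c

f = theano.function([x, c], out, allow_input_downcast=True)
print(f([3], 300))                      # 300 silently wraps into int8

g = theano.function([x, c], out)        # default: no silent downcast
try:
    g([3], 300)                         # out of range for int8
except TypeError as e:
    print('rejected:', e)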
Example #10
    def random_fn(self):
        """
        Implements posterior distribution from initial latent space

        Parameters
        ----------
        size : number of samples from distribution
    no_rand : whether to use the deterministic distribution

        Returns
        -------
        posterior space (numpy)
        """
        In = theano.In
        size = tt.iscalar('size')
        no_rand = tt.bscalar('no_rand')
        posterior = self.random(size, no_rand)
        fn = theano.function([In(size, 'size', 1, allow_downcast=True),
                              In(no_rand, 'no_rand', 0, allow_downcast=True)],
                             posterior)

        def inner(size=None, no_rand=False):
            if size is None:
                return fn(1, int(no_rand))[0]
            else:
                return fn(size, int(no_rand))

        return inner
Example #11
    def __init__(self, n_features):
        self.n_features = n_features
        self.x = T.fvector("x")
        self.y = T.bscalar("y")
        self.W = theano.shared(rng.randn(n_features).astype(
            theano.config.floatX),
                               name="W")
        self.b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX),
                               name="b")
Example #12
    def get_cost(self):
        self.y = T.bscalar('y')

        self.L1 = abs(self.hidden_layer.W).sum() \
                + abs(self.output_layer.W).sum()

        self.L2_sqr = (self.hidden_layer.W ** 2).sum() \
                + (self.output_layer.W ** 2).sum()

        self.params = self.hidden_layer.params + self.output_layer.params
        self.cost = self.negative_log_likelihood(self.y) \
            + self.L2_reg * self.L2_sqr
        #+ self.L1_reg * self.L1
        return self.cost
Example #13
    def get_cost(self):
        self.y = T.bscalar('y')

        self.L1 = abs(self.hidden_layer.W).sum() \
                + abs(self.output_layer.W).sum()

        self.L2_sqr = (self.hidden_layer.W ** 2).sum() \
                + (self.output_layer.W ** 2).sum()

        self.params = self.hidden_layer.params + self.output_layer.params
        self.cost = self.negative_log_likelihood(self.y) \
            + self.L2_reg * self.L2_sqr 
        #+ self.L1_reg * self.L1 
        return self.cost
Example #14
    def __init__(self,
                 input=None,
                 output=None,
                 n_features=500,
                 n_states=10,
                 learning_rate=0.01):

        self.n_features = n_features
        self.n_states = n_states
        # x is a vector
        self.x = input
        if not self.x:
            self.x = T.fvector('x')
        # y is a label(0 1 2 3 ..)
        self.y = output
        if not self.y:
            self.y = T.bscalar('y')
        # test value
        #self.x.tag.test_value = rng.random(n_features).astype(
        #    theano.config.floatX)
        #self.y.tag.test_value = 3
        #self.y = T.cast(y, 'int32')

        self.b = theano.shared(
            numpy.zeros((n_states), dtype=theano.config.floatX),
            name='b',
            borrow=True,
        )
        self.W = theano.shared(
            value=numpy.zeros((n_features, n_states),
                              dtype=theano.config.floatX),
            name='W',
            borrow=True,
        )

        self.p_y_given_x = T.nnet.softmax(T.dot(self.x, self.W) + self.b)
        # get the max index
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.get_y_pred = theano.function(inputs=[self.x],
                                          allow_input_downcast=True,
                                          outputs=self.y_pred)

        self.learning_rate = learning_rate

        self.params = [self.W, self.b]
Example #15
def test_replacements(binomial_model_inference):
    d = tt.bscalar()
    d.tag.test_value = 1
    approx = binomial_model_inference.approx
    p = approx.model.p
    p_t = p**3
    p_s = approx.apply_replacements(p_t)
    sampled = [p_s.eval() for _ in range(100)]
    assert any(map(operator.ne, sampled[1:], sampled[:-1]))  # stochastic

    p_d = approx.apply_replacements(p_t, deterministic=True)
    sampled = [p_d.eval() for _ in range(100)]
    assert all(map(operator.eq, sampled[1:], sampled[:-1]))  # deterministic

    p_r = approx.apply_replacements(p_t, deterministic=d)
    sampled = [p_r.eval({d: 1}) for _ in range(100)]
    assert all(map(operator.eq, sampled[1:], sampled[:-1]))  # deterministic
    sampled = [p_r.eval({d: 0}) for _ in range(100)]
    assert any(map(operator.ne, sampled[1:], sampled[:-1]))  # stochastic
Example #16
def build_trainer(phi_shared, N, loglik_primary_f, logprior_f, hypernet_f,
                  log_det_dtheta_dz_f=None, primary_f=None):
    '''It is assumed every time this is called z_noise will be drawn from a 
    standard Gaussian. phi_shared are weights to hypernet and N is the total
    number of points in the data set.'''
    X = T.matrix('x')
    y = T.matrix('y')  # Assuming multivariate output
    z_noise = T.vector('z')
    prelim = T.bscalar('prelim')

    lr = T.scalar('lr')

    elbo = hypernet_elbo(X, y, loglik_primary_f, logprior_f, hypernet_f,
                         z_noise, N, log_det_dtheta_dz_f=log_det_dtheta_dz_f,
                         prelim=prelim)
    loss = -elbo
    grads = T.grad(loss, phi_shared)
    updates = lasagne.updates.adam(grads, phi_shared, learning_rate=lr)

    trainer = theano.function([X, y, z_noise, lr, prelim], loss, updates=updates)

    # Build get_err in case you want to check Jacobian logic
    elbo_no_J = hypernet_elbo(X, y, loglik_primary_f, logprior_f, hypernet_f,
                              z_noise, N, prelim=prelim)
    err = T.abs_(elbo - elbo_no_J)
    get_err = theano.function([X, y, z_noise, prelim], err)

    theta = hypernet_f(z_noise, prelim=prelim)
    theta_f = theano.function([z_noise, prelim], theta)

    test_loglik = loglik_primary_f(X, y, theta)
    test_f = theano.function([X, y, z_noise, prelim], test_loglik)

    primary_out = None
    if primary_f is not None:
        yp = primary_f(X, theta)
        primary_out = theano.function([X, z_noise, prelim], yp)

    grad_f = theano.function([X, y, z_noise, prelim], grads)
    return trainer, get_err, test_f, primary_out, grad_f, theta_f
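build_trainer threads the prelim bscalar straight through as an ordinary input to every compiled function, alongside a symbolic learning rate. A much smaller hedged sketch of that wiring, with a toy quadratic loss standing in for hypernet_elbo and plain SGD standing in for lasagne's Adam:

import theano
import theano.tensor as T

w = theano.shared(0.0, name='w')          # toy stand-in for phi_shared
x = T.dscalar('x')
prelim = T.bscalar('prelim')
lr = T.dscalar('lr')

# toy loss: an extra penalty term only in the non-preliminary case
loss = T.switch(prelim, (w - x) ** 2, (w - x) ** 2 + 0.1 * w ** 2)
grad = T.grad(loss, w)
trainer = theano.function([x, lr, prelim], loss,
                          updates=[(w, w - lr * grad)])   # plain SGD, not Adam
print(trainer(1.0, 0.05, 1))              # (0 - 1)**2 -> 1.0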
Example #17
def test_mlp(learning_rate=0.001, L1_reg=0.0, L2_reg=1, n_epochs=10000,
             dataset='Carolyn1_filt_turnclass.csv', batch_size=3, n_hidden=20):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """
    datasets = load_data(dataset,'Carolyn1_filt_turnlabels.csv')

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    z = T.bscalar()

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=24,
        n_hidden=n_hidden,
        n_out=2,
        ts=z
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            classifier.ts = True
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                classifier.ts = False
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    classifier.ts = False
                    test_losses = [test_model(i) for i
                                   in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
Example #18
    def __init__(
        self,
        dataset,
        learning_rate=0.001,
        decrease_constant=0,
        hidden_sizes=[500],
        random_seed=1234,
        batch_size=1,
        hidden_activation=T.nnet.sigmoid,
        use_cond_mask=False,
        direct_input_connect="None",
        direct_output_connect=False,
        update_rule="None",
        dropout_rate=0,
        weights_initialization="Uniform",
        mask_distribution=0,
    ):

        input_size = dataset["input_size"]
        self.shuffled_once = False

        class SeedGenerator(object):
            # This subclass's purpose is to maximize randomness and still keep reproducibility

            def __init__(self, random_seed):
                self.rng = np.random.mtrand.RandomState(random_seed)

            def get(self):
                return self.rng.randint(42424242)

        self.seed_generator = SeedGenerator(random_seed)

        self.trng = T.shared_randomstreams.RandomStreams(self.seed_generator.get())

        weights_initialization = getattr(
            WeightsInitializer(self.seed_generator.get()), weights_initialization
        )  # Get the weights initializer by string name

        # Building the model's graph
        input = T.matrix(name="input")
        target = T.matrix(name="target")
        is_train = T.bscalar(name="is_train")

        # Initialize the mask
        self.mask_generator = MaskGenerator(input_size, hidden_sizes, mask_distribution, self.seed_generator.get())

        # Initialize layers
        input_layer = ConditionningMaskedLayer(
            layerIdx=0,
            input=input,
            n_in=input_size,
            n_out=hidden_sizes[0],
            activation=hidden_activation,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
        )
        self.layers = [dropoutLayerDecorator(input_layer, self.trng, is_train, dropout_rate)]
        # Now the hidden layers
        for i in range(1, len(hidden_sizes)):
            previous_layer = self.layers[i - 1]
            hidden_layer = DirectInputConnectConditionningMaskedLayer(
                layerIdx=i,
                input=previous_layer.output,
                n_in=hidden_sizes[i - 1],
                n_out=hidden_sizes[i],
                activation=hidden_activation,
                weights_initialization=weights_initialization,
                mask_generator=self.mask_generator,
                use_cond_mask=use_cond_mask,
                direct_input=input if direct_input_connect == "Full" and previous_layer.output != input else None,
            )
            self.layers += [dropoutLayerDecorator(hidden_layer, self.trng, is_train, dropout_rate)]
        # And the output layer
        outputLayerIdx = len(self.layers)
        previous_layer = self.layers[outputLayerIdx - 1]
        self.layers += [
            DirectOutputInputConnectConditionningMaskedOutputLayer(
                layerIdx=outputLayerIdx,
                input=previous_layer.output,
                n_in=hidden_sizes[outputLayerIdx - 1],
                n_out=input_size,
                activation=T.nnet.sigmoid,
                weights_initialization=weights_initialization,
                mask_generator=self.mask_generator,
                use_cond_mask=use_cond_mask,
                direct_input=input
                if (direct_input_connect == "Full" or direct_input_connect == "Output")
                and previous_layer.output != input
                else None,
                direct_outputs=[
                    (layer.layer_idx, layer.n_in, layer.input) for layerIdx, layer in enumerate(self.layers[1:-1])
                ]
                if direct_output_connect
                else [],
            )
        ]

        # The loss function
        output = self.layers[-1].output
        pre_output = self.layers[-1].lin_output
        log_prob = -T.sum(T.nnet.softplus(-target * pre_output + (1 - target) * pre_output), axis=1)
        loss = (-log_prob).mean()

        # How to update the parameters
        self.parameters = [param for layer in self.layers for param in layer.params]
        parameters_gradient = T.grad(loss, self.parameters)

        # Initialize update_rule
        if update_rule == "None":
            self.update_rule = DecreasingLearningRate(learning_rate, decrease_constant)
        elif update_rule == "adadelta":
            self.update_rule = AdaDelta(decay=decrease_constant, epsilon=learning_rate)
        elif update_rule == "adagrad":
            self.update_rule = AdaGrad(learning_rate=learning_rate)
        elif update_rule == "rmsprop":
            self.update_rule = RMSProp(learning_rate=learning_rate, decay=decrease_constant)
        elif update_rule == "adam":
            self.update_rule = Adam(learning_rate=learning_rate)
        elif update_rule == "adam_paper":
            self.update_rule = Adam_paper(learning_rate=learning_rate)
        updates = self.update_rule.get_updates(zip(self.parameters, parameters_gradient))

        # How to shuffle weights
        masks_updates = [layer_mask_update for layer in self.layers for layer_mask_update in layer.shuffle_update]
        self.update_masks = theano.function(name="update_masks", inputs=[], updates=masks_updates)
        #
        # Functions to train and use the model
        index = T.lscalar()
        self.learn = theano.function(
            name="learn",
            inputs=[index, is_train],
            outputs=loss,
            updates=updates,
            givens={
                input: dataset["train"]["data"][index * batch_size : (index + 1) * batch_size],
                target: dataset["train"]["data"][index * batch_size : (index + 1) * batch_size],
            },
            on_unused_input="ignore",
        )  # ignore for when dropout is absent

        self.use = theano.function(
            name="use", inputs=[input, is_train], outputs=output, on_unused_input="ignore"
        )  # ignore for when dropout is absent

        # Test functions
        self.valid_log_prob = theano.function(
            name="valid_log_prob",
            inputs=[is_train],
            outputs=log_prob,
            givens={input: dataset["valid"]["data"], target: dataset["valid"]["data"]},
            on_unused_input="ignore",
        )  # ignore for when dropout is absent
        self.train_log_prob = theano.function(
            name="train_log_prob",
            inputs=[is_train],
            outputs=log_prob,
            givens={input: dataset["train"]["data"], target: dataset["train"]["data"]},
            on_unused_input="ignore",
        )  # ignore for when dropout is absent
        self.train_log_prob_batch = theano.function(
            name="train_log_prob_batch",
            inputs=[index, is_train],
            outputs=log_prob,
            givens={
                input: dataset["train"]["data"][index * 1000 : (index + 1) * 1000],
                target: dataset["train"]["data"][index * 1000 : (index + 1) * 1000],
            },
            on_unused_input="ignore",
        )  # ignore for when dropout is absent
        self.test_log_prob = theano.function(
            name="test_log_prob",
            inputs=[is_train],
            outputs=log_prob,
            givens={input: dataset["test"]["data"], target: dataset["test"]["data"]},
            on_unused_input="ignore",
        )  # ignore for when dropout is absent

        # Functions for verify gradient
        self.useloss = theano.function(
            name="useloss", inputs=[input, target, is_train], outputs=loss, on_unused_input="ignore"
        )  # ignore for when dropout is absent
        self.learngrad = theano.function(
            name="learn",
            inputs=[index, is_train],
            outputs=parameters_gradient,
            givens={
                input: dataset["train"]["data"][index * batch_size : (index + 1) * batch_size],
                target: dataset["train"]["data"][index * batch_size : (index + 1) * batch_size],
            },
            on_unused_input="ignore",
        )  # ignore for when dropout is absent
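One line worth unpacking in the model above is log_prob = -T.sum(T.nnet.softplus(-target * pre_output + (1 - target) * pre_output), axis=1): for binary targets this is the Bernoulli log-probability under sigmoid(pre_output), written in a numerically stable form (-softplus(-z) equals log sigmoid(z), and -softplus(z) equals log(1 - sigmoid(z))). A small hedged check of that identity, assuming only Theano and NumPy:

import numpy as np
import theano.tensor as T

z = T.dvector('z')          # pre-sigmoid outputs
t = T.dvector('t')          # binary targets
log_prob = -T.nnet.softplus(-t * z + (1 - t) * z)

zv = np.array([0.3, -1.2])
tv = np.array([1.0, 0.0])
p = 1.0 / (1.0 + np.exp(-zv))
ref = tv * np.log(p) + (1 - tv) * np.log(1 - p)
print(np.allclose(log_prob.eval({z: zv, t: tv}), ref))   # True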
Example #19
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    layers = []

    srng = RandomStreams(25252)

    train_flag = T.bscalar('train_flag')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)

    layer0 = ConvLayer(rng,
                       data=layer0_input,
                       image_shape=(batch_size, 1, 28, 28),
                       filter_shape=(nkerns[0], 1, 5, 5))

    layers.append(layer0)

    layer1 = PoolLayer(data=layer0.output)
    layers.append(layer1)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer2 = ConvLayer(rng,
                       data=layer1.output,
                       image_shape=(batch_size, nkerns[0], 12, 12),
                       filter_shape=(nkerns[1], nkerns[0], 5, 5))
    layers.append(layer2)

    layer3 = PoolLayer(data=layer2.output)
    layers.append(layer3)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layerd_input = layer3.output.flatten(2)

    layerd = DropoutLayer(data=layerd_input,
                          n_in=nkerns[1] * 4 * 4,
                          srng=srng,
                          p=.5,
                          train_flag=train_flag)

    # construct a fully-connected sigmoidal layer
    layer4 = FCLayer(rng,
                     data=layerd.output,
                     n_in=nkerns[1] * 4 * 4,
                     n_out=500,
                     activation=relu)
    layers.append(layer4)

    # classify the values of the fully-connected sigmoidal layer
    layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10)
    layers.append(layer5)

    # the cost we minimize during training is the NLL of the model
    cost = layer5.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer5.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](0)
        })

    validate_model = theano.function(
        [index],
        layer5.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](0)
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer5.params + layerd.params + layer4.params + layer3.params + \
             layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    delta_before = []

    for param_i in params:
        delta_before_i = theano.shared(value=numpy.zeros(
            param_i.get_value().shape, dtype=theano.config.floatX),
                                       borrow=True)
        delta_before.append(delta_before_i)

    learning_rate = 0.01
    momentum = 0.9
    weight_decay = 0.0005

    updates = []

    for param_i, grad_i, delta_before_i in zip(params, grads, delta_before):
        delta_i = momentum * delta_before_i - weight_decay * learning_rate * param_i - learning_rate * grad_i
        updates.append((delta_before_i, delta_i))
        updates.append((param_i, param_i + delta_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](1)
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    if gpu_usage is True:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print "Total memory:", info.total
        print "Free memory:", info.free

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
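Note that test_model and validate_model above never take train_flag as an input: it is pinned to a constant through givens with numpy.cast['int8'](0), so the compiled graph bakes in the inference branch. A minimal sketch of that pinning, with made-up names:

import numpy
import theano
import theano.tensor as T

train_flag = T.bscalar('train_flag')
x = T.vector('x')
out = T.switch(train_flag, x * 0.5, x)     # toy train/test branch

predict = theano.function([x], out,
                          givens={train_flag: numpy.cast['int8'](0)})
print(predict([2.0, 4.0]))                 # always the inference branch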
Example #20
    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 n_h,
                 dropout=0,
                 sigma_g=sigmoid,
                 sigma_c=hyperbolic_tangent,
                 sigma_h=hyperbolic_tangent,
                 sigma_y=softmax,
                 dropout_rate=0,
                 obj='c'):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param sigma_g, sigma_c, sigma_h, sigma_y: activation functions
        :param dropout_rate: dropout rate (float)
        :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
        '''

        Wf_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                          np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        Uf_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)),
                          np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
        bf_ = np.zeros(n_h)

        Wi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                          np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        Ui_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)),
                          np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
        bi_ = np.zeros(n_h)

        Wo_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                          np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        Uo_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)),
                          np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
        bo_ = np.zeros(n_h)

        Wc_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                          np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        Uc_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)),
                          np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
        bc_ = np.zeros(n_h)

        Wy_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)),
                          np.sqrt(6. / (n_out + n_h)), (n_out, n_h))
        by_ = np.zeros(n_out)

        h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)),
                          n_h)
        c0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)),
                          n_h)

        # Theano: Created shared variables
        Wf = theano.shared(name='Wf', value=Wf_.astype(theano.config.floatX))
        Uf = theano.shared(name='Uf', value=Uf_.astype(theano.config.floatX))
        bf = theano.shared(name='bf', value=bf_.astype(theano.config.floatX))

        Wi = theano.shared(name='Wi', value=Wi_.astype(theano.config.floatX))
        Ui = theano.shared(name='Ui', value=Ui_.astype(theano.config.floatX))
        bi = theano.shared(name='bi', value=bi_.astype(theano.config.floatX))

        Wo = theano.shared(name='Wo', value=Wo_.astype(theano.config.floatX))
        Uo = theano.shared(name='Uo', value=Uo_.astype(theano.config.floatX))
        bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))

        Wc = theano.shared(name='Wc', value=Wc_.astype(theano.config.floatX))
        Uc = theano.shared(name='Uc', value=Uc_.astype(theano.config.floatX))
        bc = theano.shared(name='bc', value=bc_.astype(theano.config.floatX))

        Wy = theano.shared(name='Wy', value=Wy_.astype(theano.config.floatX))
        by = theano.shared(name='by', value=by_.astype(theano.config.floatX))

        h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))
        c0 = theano.shared(name='c0', value=c0_.astype(theano.config.floatX))

        self.p = [
            Wf, Uf, bf, Wi, Ui, bi, Wo, Uo, bo, Wc, Uc, bc, Wy, by, c0, h0
        ]

        seq_len = T.iscalar('seq_len')
        self.seq_len = seq_len

        self.x = T.vector()
        x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

        if dropout_rate > 0:
            np.random.seed(int(time.time()))

            # for training
            def masked_forward_prop_step(x_t, h_t_prev, c_t_prev):
                f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
                i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
                o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
                c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
                c_t += c_t_prev * f_t
                h_t = o_t * sigma_h(c_t)
                y_t = Wy.dot(h_t) + by
                mask = np.random.binomial(np.ones(n_h, dtype=int),
                                          1.0 - dropout_rate)
                masked_h_t = h_t * T.cast(mask, theano.config.floatX)

                return [y_t, masked_h_t, c_t]

            # for testing
            def forward_prop_step(x_t, h_t_prev, c_t_prev):
                f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
                i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
                o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
                c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
                c_t += c_t_prev * f_t
                h_t = o_t * sigma_h(c_t)
                h_t = (1.0 - dropout_rate) * h_t
                y_t = Wy.dot(h_t) + by

                return [y_t, h_t, c_t]

            [o_train, _, _], _ = theano.scan(masked_forward_prop_step,
                                             sequences=[x_scan],
                                             outputs_info=[None, h0, c0],
                                             n_steps=seq_len)

            [o_test, _, _], _ = theano.scan(forward_prop_step,
                                            sequences=[x_scan],
                                            outputs_info=[None, h0, c0],
                                            n_steps=seq_len)

        else:

            def forward_prop_step(x_t, h_t_prev, c_t_prev):
                f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
                i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
                o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
                c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
                c_t += c_t_prev * f_t
                h_t = o_t * sigma_h(c_t)
                y_t = Wy.dot(h_t) + by

                return [y_t, h_t, c_t]

            [o_train, _, _], _ = theano.scan(forward_prop_step,
                                             sequences=[x_scan],
                                             outputs_info=[None, h0, c0],
                                             n_steps=seq_len)
            o_test = o_train

        if obj == 'c':  # classification task
            self.y = T.bscalar('y')
            self.o_train = sigma_y(o_train[-1])
            self.o_test = sigma_y(o_test[-1])
            #obj function to compute grad, use dropout
            self.cost = T.nnet.categorical_crossentropy(
                self.o_train,
                T.eye(n_out)[self.y])
            #compute accuracy use average of dropout rate
            self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1.,
                                     0.)
            self.prediction = np.argmax(self.o_test)
        elif obj == 'r':  # regression task
            self.y = T.dscalar('y')
            self.o_train = o_train[-1]
            self.o_test = o_test[-1]
            #obj function to compute grad, use dropout
            self.cost = (self.o_train[0] - self.y)**2
            #compute accuracy use average of dropout rate
            self.accuracy = (self.o_test[0] - self.y)**2
            self.prediction = self.o_test[0]

        self.optimiser = sgd_optimizer(self, 'LSTM')
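In the classification branch above, the bscalar label indexes a row of the identity matrix, T.eye(n_out)[self.y], to build the one-hot target for categorical_crossentropy. Isolated, with an illustrative n_out of 4:

import theano.tensor as T

y = T.bscalar('y')
one_hot = T.eye(4)[y]           # row y of a 4x4 identity matrix
print(one_hot.eval({y: 2}))     # [0. 0. 1. 0.]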
Example #21
                                               parse_mode=opts.parse_mode)
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set,
                                         vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev),
                                         parse_mode=opts.parse_mode)
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')  # sequence for sentence one
s2_idxs = T.ivector('s2')  # sequence for sentence two
actual_y = T.ivector('y')  # single for sentence pair label; 0, 1 or 2

# dropout keep prob for post concat, pre MLP
apply_dropout = T.bscalar(
    'apply_dropout')  # dropout.{APPLY_DROPOUT|NO_DROPOUT}
keep_prob = theano.shared(opts.keep_prob)  # recall 1.0 => noop
keep_prob = T.cast(keep_prob,
                   'float32')  # shared weirdity, how to set in init (?)

# keep track of different "layers" that handle their own gradients.
# includes rnns, final concat & softmax and, potentially, special handling for
# tied embeddings
layers = []

# decide set of sequence idxs we'll be processing. there will always the two
# for the forward passes over s1 and s2 and, optionally, two more for the
# reverse pass over s1 & s2 in the bidirectional case.
idxs = [s1_idxs, s2_idxs]
names = ["s1f", "s2f"]
if opts.bidirectional:
Example #22
            non_sequences=[n_words, word_dim],
            sequences=[index1, index2],
            outputs_info=[result_mat],
        )
        return (output[-1])

    def sents_ind_2vec(self, sents):
        # Create Input moddule contain positional encoding scheme
        # the input sents presents the index of words
        # this will convert each fact into a vector as output
        shape_input = sents.shape
        bach_size, n_sents, n_words = shape_input
        positional_encode_matrix = self.positional_encoding_scheme(n_words)
        p_e_m_shuffle = positional_encode_matrix.dimshuffle("x", "x", 0, 1)
        sents_emb = self.words_ind_2vec(sents) * p_e_m_shuffle
        return (sents_emb.sum(axis=2))


# Debug ONLY
if __name__ == "__main__":
    rng = np.random.RandomState(220495)
    arrSents = T.itensor3()
    nn = T.bscalar()
    EMBD = EncodingLayer(32, 10, rng=rng)
    Word2Vec = theano.function(inputs=[arrSents],
                               outputs=EMBD.sents_ind_2vec(arrSents))
    sents = [[[3, 14, 0], [0, 0, 0]], [[3, 14, 0], [1, 2, 6]]]
    Vec = Word2Vec(sents)
    print("Val: ", Vec)
    print("Dim: ", Vec.shape)
Example #23
def test_mlp(learning_rate=0.01, L1_reg=0.000, L2_reg=0.0002, n_epochs=10000,
             dataset='data_nn.csv', batch_size=15, n_hidden1=200, n_hidden2=100):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """
    datasets, grad_test = load_data(dataset,'theano_labels.csv')

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    z = T.bscalar()

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=96,
        n_hidden1=n_hidden1,
        n_hidden2=n_hidden2,
        n_out=4,
        ts=z
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.crossentropy(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]
    #ginput = T.grad(cost, classifier.input)
    ginput = theano.gradient.jacobian(cost, classifier.input)
    #J, updates = theano.scan(lambda i, y,x : T.grad(y[i], x), sequences=T.arange(y.shape[0]), non_sequences=[y,x])
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
        }
    )

    #get_gradient = theano.function(
    #outputs=ginput,
    #givens={
    #    x: train_set_x[0],
    #    y: train_set_y[0],
    #}
    #)
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            classifier.ts = True
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                classifier.ts = False
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    classifier.ts = False
                    test_losses = [test_model(i) for i
                                   in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)


    f = theano.function([classifier.input, y], ginput)
    print(pp(f.maker.fgraph.outputs[0]))
    theano.printing.pydotprint(f.maker.fgraph.outputs[0])
    test_in = theano.shared(numpy.asarray([-0.85932366,  0.58308557,  0.57291266, -2.73567018,  0.5720682 ,
            0.69788969,  0.01782218,  0.89483408,  3.28290884, -0.38769847,
           -0.76087236, -0.50285087, -1.24656495, -0.73529842, -0.99193669,
           -1.95702179,  1.57430759,  0.24463588,  3.06210202,  2.45264677,
           -0.25134517, -0.04829522, -0.55535032,  0.0503641 , -1.91432708,
            0.77470853,  0.7401515 , -2.71915318,  0.4963475 ,  1.00522374,
            0.27958163, -1.35574041,  0.58434732, -0.67177877, -1.07827181,
           -2.11205369, -1.48408336, -0.80823029, -0.95141343, -1.98320406,
            1.46513513, -0.09303374, -0.76959049,  0.85930122, -0.86362607,
           -0.78979288, -0.0444948 ,  0.45982332, -2.31018994,  0.85091726,
            0.77935362, -2.70620804,  0.44422539,  1.24119296,  0.09150836,
           -2.86868139, -1.16801813, -0.50896104, -0.05604379, -0.57235696,
           -1.08455522, -1.17935154, -1.12812324, -1.9744183 ,  0.19983282,
           -0.11654747, -1.15473115, -0.07447867, -0.28972877, -0.94642741,
            0.26084976,  0.46156281,  2.1851348 , -0.77191925, -0.73766559,
            1.8115434 ,  0.83390925, -0.91492798,  0.06507779,  2.07655773,
            2.62112977,  0.04236459, -0.34407471, -0.03113814,  0.67895545,
            1.1023399 ,  0.77840311,  1.18688628,  1.31362216,  0.86287225,
            2.23127128,  1.32033075,  0.07084121,  0.45882767, -0.52361762,
           -0.24316931], dtype=theano.config.floatX))
    test_out = theano.shared(numpy.asarray(4, dtype=theano.config.floatX))
    print(f(test_in.get_value().reshape(1, -1),
            test_out.get_value().astype('int32').reshape(1)))
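# A framework-free sketch of the patience-based early stopping driving the
# loop above; the numbers below are illustrative, not taken from the example.
def early_stopping_demo(validation_curve, patience=10, patience_increase=2,
                        improvement_threshold=0.995):
    best = float('inf')
    for it, loss in enumerate(validation_curve):
        if loss < best:
            if loss < best * improvement_threshold:
                # significant improvement: allow training to run longer
                patience = max(patience, it * patience_increase)
            best = loss
        if patience <= it:
            break
    return best, it

print(early_stopping_demo([0.9, 0.5, 0.4, 0.41, 0.42] + [0.43] * 20))
# -> (0.4, 10): training stops once `patience` iterations pass without
#    pushing the patience horizon further out.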
Exemple #24
0
def sgd_optimization_mnist(learning_rate=0.2, n_epochs=1000, batch_size=5):

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    index = T.bscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(input=x, n_in=img_size, n_out=9)

    cost = classifier.negative_log_likelihood(y)

    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]


    train_model = theano.function(inputs=[index],
            outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:

                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                    (epoch, minibatch_index + 1, n_train_batches,
                    this_validation_loss * 100.))


                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of best'
                       ' model %f %%') %
                        (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
                 (best_validation_loss * 100., test_score * 100.))
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
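# A minimal sketch of the index/givens minibatching pattern used above: the
# whole dataset lives in a shared variable and the compiled function only
# receives a minibatch index (variable names here are illustrative).  Note
# that T.bscalar() is an int8, so arithmetic on a bscalar batch index can
# overflow for larger datasets; T.lscalar() avoids that.
import numpy as np
import theano
import theano.tensor as T

data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(10, 2))
x = T.matrix('x')
index = T.lscalar('index')
batch_size = 2

batch_sum = theano.function(
    inputs=[index],
    outputs=x.sum(),
    givens={x: data[index * batch_size:(index + 1) * batch_size]})

print(batch_sum(0))  # sum of rows 0..1 of the shared dataset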
Exemple #25
0
import theano
import theano.tensor as tensor

# global variables used to toggle training mode, the RNG, etc.
from theano.sandbox.rng_mrg import MRG_RandomStreams
layer_train_rng = MRG_RandomStreams()
layer_train_enable = tensor.bscalar()
layer_train_epoch = tensor.iscalar()
layer_train_it = theano.shared(0)


def get_rng():
    global layer_train_rng
    return layer_train_rng


def set_rng_seed(v):
    global layer_train_rng
    layer_train_rng.seed(v)


def get_train():
    global layer_train_enable
    return layer_train_enable


def get_epoch():
    global layer_train_epoch
    return layer_train_epoch

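# A minimal sketch (not part of the original module) of how these globals are
# typically consumed by a layer: the bscalar from get_train() switches between
# the stochastic training path and the deterministic inference path.
import theano


def dropout(x, p_drop=0.5):
    keep = 1.0 - p_drop
    mask = get_rng().binomial(size=x.shape, p=keep,
                              dtype=theano.config.floatX)
    train_out = x * mask / keep   # inverted dropout while training
    test_out = x                  # identity at inference time
    return tensor.switch(get_train(), train_out, test_out)

# When compiling, the flag is fed as an int8 0/1 value, e.g.
#   f = theano.function([inp, get_train()], dropout(inp))
#   f(batch, numpy.int8(1))   # training mode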
Exemple #26
0
    def __init__(self,
                 dataset,
                 learning_rate=0.001,
                 decrease_constant=0,
                 hidden_sizes=[500],
                 random_seed=1234,
                 batch_size=1,
                 hidden_activation=T.nnet.sigmoid,
                 use_cond_mask=False,
                 direct_input_connect="None",
                 direct_output_connect=False,
                 update_rule="None",
                 dropout_rate=0,
                 weights_initialization="Uniform",
                 mask_distribution=0):

        input_size = dataset['input_size']
        self.shuffled_once = False

        class SeedGenerator(object):
            # This class's purpose is to maximize randomness while keeping reproducibility

            def __init__(self, random_seed):
                self.rng = np.random.mtrand.RandomState(random_seed)

            def get(self):
                return self.rng.randint(42424242)

        self.seed_generator = SeedGenerator(random_seed)

        self.trng = T.shared_randomstreams.RandomStreams(
            self.seed_generator.get())

        weights_initialization = getattr(
            WeightsInitializer(self.seed_generator.get()),
            weights_initialization
        )  # Get the weights initializer by string name

        # Building the model's graph
        input = T.matrix(name="input")
        target = T.matrix(name="target")
        is_train = T.bscalar(name="is_train")

        # Initialize the mask
        self.mask_generator = MaskGenerator(input_size, hidden_sizes,
                                            mask_distribution,
                                            self.seed_generator.get())

        # Initialize layers
        input_layer = ConditionningMaskedLayer(
            layerIdx=0,
            input=input,
            n_in=input_size,
            n_out=hidden_sizes[0],
            activation=hidden_activation,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask)
        self.layers = [
            dropoutLayerDecorator(input_layer, self.trng, is_train,
                                  dropout_rate)
        ]
        # Now the hidden layers
        for i in range(1, len(hidden_sizes)):
            previous_layer = self.layers[i - 1]
            hidden_layer = DirectInputConnectConditionningMaskedLayer(
                layerIdx=i,
                input=previous_layer.output,
                n_in=hidden_sizes[i - 1],
                n_out=hidden_sizes[i],
                activation=hidden_activation,
                weights_initialization=weights_initialization,
                mask_generator=self.mask_generator,
                use_cond_mask=use_cond_mask,
                direct_input=input if direct_input_connect == "Full"
                and previous_layer.output != input else None)
            self.layers += [
                dropoutLayerDecorator(hidden_layer, self.trng, is_train,
                                      dropout_rate)
            ]
        # And the output layer
        outputLayerIdx = len(self.layers)
        previous_layer = self.layers[outputLayerIdx - 1]
        self.layers += [
            DirectOutputInputConnectConditionningMaskedOutputLayer(
                layerIdx=outputLayerIdx,
                input=previous_layer.output,
                n_in=hidden_sizes[outputLayerIdx - 1],
                n_out=input_size,
                activation=T.nnet.sigmoid,
                weights_initialization=weights_initialization,
                mask_generator=self.mask_generator,
                use_cond_mask=use_cond_mask,
                direct_input=input if (direct_input_connect == "Full"
                                       or direct_input_connect == "Output")
                and previous_layer.output != input else None,
                direct_outputs=[
                    (layer.layer_idx, layer.n_in, layer.input)
                    for layerIdx, layer in enumerate(self.layers[1:-1])
                ] if direct_output_connect else [])
        ]

        # The loss function
        output = self.layers[-1].output
        pre_output = self.layers[-1].lin_output
        log_prob = -T.sum(T.nnet.softplus(-target * pre_output +
                                          (1 - target) * pre_output),
                          axis=1)
        loss = (-log_prob).mean()

        # How to update the parameters
        self.parameters = [
            param for layer in self.layers for param in layer.params
        ]
        parameters_gradient = T.grad(loss, self.parameters)

        # Initialize update_rule
        if update_rule == "None":
            self.update_rule = DecreasingLearningRate(learning_rate,
                                                      decrease_constant)
        elif update_rule == "adadelta":
            self.update_rule = AdaDelta(decay=decrease_constant,
                                        epsilon=learning_rate)
        elif update_rule == "adagrad":
            self.update_rule = AdaGrad(learning_rate=learning_rate)
        elif update_rule == "rmsprop":
            self.update_rule = RMSProp(learning_rate=learning_rate,
                                       decay=decrease_constant)
        elif update_rule == "adam":
            self.update_rule = Adam(learning_rate=learning_rate)
        elif update_rule == "adam_paper":
            self.update_rule = Adam_paper(learning_rate=learning_rate)
        updates = self.update_rule.get_updates(
            zip(self.parameters, parameters_gradient))

        # How to shuffle the masks
        masks_updates = [
            layer_mask_update for layer in self.layers
            for layer_mask_update in layer.shuffle_update
        ]
        self.update_masks = theano.function(name='update_masks',
                                            inputs=[],
                                            updates=masks_updates)
        #
        # Functions to train and use the model
        index = T.lscalar()
        self.learn = theano.function(
            name='learn',
            inputs=[index, is_train],
            outputs=loss,
            updates=updates,
            givens={
                input:
                dataset['train']['data'][index * batch_size:(index + 1) *
                                         batch_size],
                target:
                dataset['train']['data'][index * batch_size:(index + 1) *
                                         batch_size]
            },
            on_unused_input='ignore')  # ignore for when dropout is absent

        self.use = theano.function(
            name='use',
            inputs=[input, is_train],
            outputs=output,
            on_unused_input='ignore')  # ignore for when dropout is absent

        # Test functions
        self.valid_log_prob = theano.function(
            name='valid_log_prob',
            inputs=[is_train],
            outputs=log_prob,
            givens={
                input: dataset['valid']['data'],
                target: dataset['valid']['data']
            },
            on_unused_input='ignore')  # ignore for when dropout is absent
        self.train_log_prob = theano.function(
            name='train_log_prob',
            inputs=[is_train],
            outputs=log_prob,
            givens={
                input: dataset['train']['data'],
                target: dataset['train']['data']
            },
            on_unused_input='ignore')  # ignore for when dropout is absent
        self.train_log_prob_batch = theano.function(
            name='train_log_prob_batch',
            inputs=[index, is_train],
            outputs=log_prob,
            givens={
                input:
                dataset['train']['data'][index * 1000:(index + 1) * 1000],
                target:
                dataset['train']['data'][index * 1000:(index + 1) * 1000]
            },
            on_unused_input='ignore')  # ignore for when dropout is absent
        self.test_log_prob = theano.function(
            name='test_log_prob',
            inputs=[is_train],
            outputs=log_prob,
            givens={
                input: dataset['test']['data'],
                target: dataset['test']['data']
            },
            on_unused_input='ignore')  # ignore for when dropout is absent

        # Functions for verify gradient
        self.useloss = theano.function(
            name='useloss',
            inputs=[input, target, is_train],
            outputs=loss,
            on_unused_input='ignore')  # ignore for when dropout is absent
        self.learngrad = theano.function(
            name='learn',
            inputs=[index, is_train],
            outputs=parameters_gradient,
            givens={
                input:
                dataset['train']['data'][index * batch_size:(index + 1) *
                                         batch_size],
                target:
                dataset['train']['data'][index * batch_size:(index + 1) *
                                         batch_size]
            },
            on_unused_input='ignore')  # ignore for when dropout is absent
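# A small numpy check (not from the original code) of the log_prob expression
# above: -softplus(-t*a + (1-t)*a) equals the Bernoulli log-likelihood
# t*log(sigmoid(a)) + (1-t)*log(1-sigmoid(a)) for binary targets t and
# pre-activations a, so loss = (-log_prob).mean() is a binary cross-entropy
# computed directly from the logits.
import numpy as np

def softplus(z):
    return np.maximum(z, 0) + np.log1p(np.exp(-np.abs(z)))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

a = np.array([1.5, -0.3, 0.0])
for t in (np.zeros(3), np.ones(3)):
    lhs = -softplus(-t * a + (1 - t) * a)
    rhs = t * np.log(sigmoid(a)) + (1 - t) * np.log(1.0 - sigmoid(a))
    assert np.allclose(lhs, rhs)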
Exemple #27
0
def test_mlp(
    learning_rate=0.01,
    L1_reg=0.00,
    L2_reg=0.0001,
    cor_reg=0.00,
    cor_scaling=1.0,
    rand_seed=1234,
    dropout=False,
    n_epochs=1000,
    dataset="mnist.pkl.gz",
    batch_size=20,
    n_hidden=500,
    save_correlations=False,
):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print "... building the model"

    # allocate symbolic variables for the data
    cor_reg_var = theano.shared(cor_reg)  # symbolic variable storing cor_reg value
    alpha = T.dscalar("alpha")  # scaling factor for weight decay
    is_train = T.bscalar("is_train")  # boolean for switching between training and prediction
    index = T.lscalar()  # index to a [mini]batch
    perm = T.lvector()  # permutation of the indices of the training samples
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(rand_seed)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10, dropout=dropout)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    if cor_reg == 0:
        cost = classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr
    else:
        cost = (
            classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr
            + cor_reg_var * classifier.cor_sqr_sum
        )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    if save_correlations:
        validate_model = theano.function(
            inputs=[index],
            outputs=[classifier.errors(y), classifier.activation_correlation],
            givens={
                x: valid_set_x[index * batch_size : (index + 1) * batch_size],
                y: valid_set_y[index * batch_size : (index + 1) * batch_size],
                is_train: np.cast["int8"](0),
            },
        )
    else:
        validate_model = theano.function(
            inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size : (index + 1) * batch_size],
                y: valid_set_y[index * batch_size : (index + 1) * batch_size],
                is_train: np.cast["int8"](0),
            },
        )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index, perm],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[perm[index * batch_size : (index + 1) * batch_size]],
            y: train_set_y[perm[index * batch_size : (index + 1) * batch_size]],
            is_train: np.cast["int8"](1),
        },
    )
    # end-snippet-5

    # update the symbolic cor_reg variable
    update_cor_reg = theano.function(inputs=[alpha], outputs=cor_reg_var, updates=[(cor_reg_var, cor_reg_var * alpha)])

    ###############
    # TRAIN MODEL #
    ###############
    print "... training"

    best_validation_loss = np.inf
    best_epoch = 0
    start_time = timeit.default_timer()

    # Open file for writing validation losses, and write the header
    valid_loss_filename = "ValidationLoss_Epoch%i_Batch%i_Cor%f_Drop%i_Scale%f.csv" % (
        n_epochs,
        n_epochs * n_train_batches,
        cor_reg,
        dropout,
        cor_scaling,
    )
    valid_loss_filepath = os.path.join(os.path.split(__file__)[0], "..", "output", "MLP", valid_loss_filename)
    valid_loss_outfile = open(valid_loss_filepath, "w")
    valid_loss_outfile.write("Epoch,Iteration,Error\n")
    if save_correlations:
        flat_corr_filename = "FlatCorrelations_Epoch%i_Batch%i_Cor%f_Drop%i_Scale%f.csv" % (
            n_epochs,
            n_epochs * n_train_batches,
            cor_reg,
            dropout,
            cor_scaling,
        )
        flat_corr_filepath = os.path.join(os.path.split(__file__)[0], "..", "output", "MLP", flat_corr_filename)
        flat_corr_outfile = open(flat_corr_filepath, "w")

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        index_perm = rng.permutation(train_set_x.get_value(borrow=True).shape[0])  # generate new permutation of indices

        # perform 1 epoch of training
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index, index_perm)

        print "Hidden layer after training:\n"
        print classifier.hiddenLayer.output.get_value()

        # compute zero-one loss on validation set
        if save_correlations:  # compute and save the average pairwise correlations
            validation_losses = []
            mean_correlations = 0  # contains mean correlation matrix once loop is finished
            for i in xrange(n_valid_batches):
                valid_loss, valid_corr = validate_model(i)
                validation_losses.append(valid_loss)
                mean_correlations += 1.0 * valid_corr / n_valid_batches  # iteratively constructs mean to save memory
            this_validation_loss = np.mean(validation_losses)
            flat_mean_correlation = flatten_correlation_matrix(mean_correlations)
            flat_corr_outfile.write(str(epoch) + "," + ",".join(map(str, flat_mean_correlation)) + "\n")
        else:
            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)

        # Write this epoch's validation error to the file
        valid_loss_outfile.write(("%i,%i,%f\n") % (epoch, epoch * n_train_batches, this_validation_loss))

        # ********COMMENT THIS OUT WHEN RUNNING MULTIPLE PARAMS OVERNIGHT********
        print (
            "epoch %i (iteration %i), validation error %f %%, cor_reg %f"
            % (epoch, epoch * n_train_batches, this_validation_loss * 100.0, cor_reg_var.get_value())
        )
        # current_time = timeit.default_timer()
        # print('epoch %i (iteration %i), validation error %f %%, cor_reg %f, time elapsed %.2fm' % (epoch, epoch*n_train_batches, this_validation_loss * 100., cor_reg_var.get_value(), (current_time - start_time) / 60.))
        print "Hidden layer after validation:\n"
        print classifier.hiddenLayer.output.get_value()

        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            best_validation_loss = this_validation_loss
            best_epoch = epoch

        # Update the value of cor_reg for the next epoch
        # Only makes a difference when cor_scaling != 1, because the update is multiplicative
        if cor_scaling != 1:
            old_cor_reg = update_cor_reg(cor_scaling)

    valid_loss_outfile.close()
    if save_correlations:
        flat_corr_outfile.close()

    end_time = timeit.default_timer()
    print (
        ("Optimization complete. Best validation score of %f %% " "obtained following epoch %i (iteration %i)")
        % (best_validation_loss * 100.0, best_epoch, best_epoch * n_train_batches)
    )

    print "Training process ran for %.2fm" % ((end_time - start_time) / 60.0)
Exemple #28
0
 def __init__(self):
     super(DropoutModel, self).__init__()
     self.is_train = T.bscalar('is_train')
Exemple #29
0
    def optimiser(self, num_samples, update, update_kwargs, saved_update=None):

        latent_dim = T.bscalar('latent_dim')

        batch = T.matrix('batch')
        batch_rep = T.repeat(batch, num_samples, axis=0)

        h_regular_rep = self.recognition_model.get_samples_latents_regular(
            batch_rep)
        h_overdisp_rep = self.recognition_model.get_samples_latents_overdisp(
            batch_rep, latent_dim)
        h_rep = T.set_subtensor(h_regular_rep[:, latent_dim], h_overdisp_rep)

        log_p_h = self.generative_model.log_p_h(h_rep)
        log_p_x = self.generative_model.log_p_x(h_rep, batch_rep)
        entropies_h = self.recognition_model.entropies_latents(
            h_rep, batch_rep)

        imp_wts = self.recognition_model.importance_weights_latents(
            h_overdisp_rep, batch_rep, latent_dim)

        elbos_rep = imp_wts * (log_p_h + log_p_x + entropies_h)
        elbos_matrix = elbos_rep.reshape((batch.shape[0], num_samples))

        elbo = T.sum(T.mean(elbos_matrix, axis=1, keepdims=True))

        params = self.generative_model.get_params(
        ) + self.recognition_model.get_params()[:-1]
        grads = T.grad(-elbo, params)

        tau = self.recognition_model.get_params()[-1]

        all_grads, _ = theano.scan(
            lambda s, E: T.grad(-T.sum(E[s]), params),
            sequences=[T.arange(elbos_matrix.T.shape[0])],
            non_sequences=[elbos_matrix.T],
        )

        variance = T.sum([T.sum(T.var(g, axis=0)) for g in all_grads])

        grad_tau = T.grad(variance, tau)

        grads += [grad_tau]

        params = self.generative_model.get_params(
        ) + self.recognition_model.get_params()

        update_kwargs['loss_or_grads'] = grads
        update_kwargs['params'] = params

        updates = update(**update_kwargs)

        if saved_update is not None:
            for u, v in zip(updates, saved_update.keys()):
                u.set_value(v.get_value())

        optimiser = theano.function(
            inputs=[batch, latent_dim],
            outputs=elbo,
            updates=updates,
        )

        return optimiser, updates
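# A small numpy illustration (not from the original code) of the
# repeat-then-reshape pattern above: every row of `batch` is repeated
# num_samples times, per-sample terms are computed on the repeated rows, and
# reshape((batch_size, num_samples)) groups each row's samples back together
# so they can be averaged along axis=1.
import numpy as np

batch = np.arange(3)[:, None]                      # 3 "datapoints"
num_samples = 4
batch_rep = np.repeat(batch, num_samples, axis=0)  # shape (12, 1)
per_sample = batch_rep[:, 0].astype(float)         # stand-in for elbos_rep
per_row = per_sample.reshape(3, num_samples).mean(axis=1)
print(per_row)  # [0. 1. 2.] -- one averaged value per original datapoint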
Exemple #30
0
    def __init__(self, rng, n_in, n_out, n_h, n_layers, f_act=leaky_relu, obj='single', dropout_rate = 0):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param n_layers: Number of hidden layers (int)
        :param f_act: Hidden-to-hidden activation function
        :param f_out: Output activation function
        '''
        if obj=='single':
            f_out = softmax
        elif obj=='multi':
            f_out = sigmoid
        self.x = T.vector()

        # construct hidden layers
        assert(n_layers>=1)
        first_hiddenLayer = HiddenLayer(
            rng=rng,
            input=self.x,
            predict_input=self.x,
            n_in=n_in,
            n_out=n_h,
            activation=f_act,
            dropout_rate = dropout_rate,
            nametag='0'
        )

        self.hidden_layers = [first_hiddenLayer]
        self.p = first_hiddenLayer.params[:]

        for i in range(n_layers-1):
            cur_hiddenLayer = ResNetLayer(
                rng=rng,
                input=self.hidden_layers[-1].output,
                predict_input=self.hidden_layers[-1].predict_output,
                n_h=n_h,
                activation=f_act,
                dropout_rate = dropout_rate,
                nametag=str(i+1)
                )
            self.hidden_layers.append(cur_hiddenLayer)
            self.p.extend(cur_hiddenLayer.params[:])

        # params for output layer

        self.outputLayer = HiddenLayer(
            rng=rng,
            input=self.hidden_layers[-1].output,
            predict_input=self.hidden_layers[-1].predict_output,
            n_in=n_h,
            n_out=n_out,
            activation=f_out,
            dropout_rate = 0,
            nametag='o'
        )
        self.p.extend(self.outputLayer.params[:])

        self.n_layers = n_layers + 1
        self.obj = obj
        if obj=='single':
            self.y = T.bscalar('y')
            self.o = self.outputLayer.output
            self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
            self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
            self.prediction = np.argmax(self.o)
        elif obj=='multi':
            self.y = T.bvector('y')
            self.o = self.outputLayer.output
            self.cost = T.nnet.binary_crossentropy(self.o, self.y).mean()
            self.prediction = T.argsort(self.o)
            self.accuracy = self.y[T.argmax(self.o)]
            self.accuracy3 = (1.0/3.0) * (self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])
            self.accuracy5 = (1.0/5.0) * (self.y[self.prediction[-5]]+self.y[self.prediction[-4]]+self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])

        self.optimiser = sgd_optimizer(self, 'ResNet')
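# A numpy sketch of the top-k accuracy used in the multi-label branch above:
# T.argsort ranks the outputs in ascending order, so accuracy@k is the mean of
# the targets at the k highest-scoring indices (toy values below).
import numpy as np

o = np.array([0.1, 0.7, 0.2, 0.9, 0.4])   # predicted scores
y = np.array([0,   1,   0,   0,   1])     # binary targets
pred = np.argsort(o)                      # ascending, like self.prediction
top3 = pred[-3:]                          # indices of the 3 largest scores
acc3 = y[top3].mean()                     # == (y[pred[-3]] + y[pred[-2]] + y[pred[-1]]) / 3.0
print(top3, acc3)                         # [4 1 3] 0.666...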
Exemple #31
0
    def SGD(self,
            training_data,
            no_improvement_in,
            mini_batch_size,
            eta,
            validation_data,
            test_data,
            lmbda=0.0,
            monitor_test=False):
        """Train the network using mini-batch stochastic gradient descent."""
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        test_x, test_y = test_data

        # compute number of minibatches for training, validation and testing
        num_training_batches = int(size(training_data) / mini_batch_size)
        num_validation_batches = int(size(validation_data) / mini_batch_size)
        num_test_batches = int(size(test_data) / mini_batch_size)

        # define the (regularized) cost function, symbolic gradients, and updates
        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
        cost = self.layers[-1].cost(self)+\
               0.5*lmbda*l2_norm_squared/num_training_batches
        grads = T.grad(cost, self.params)
        updates = [(param, param - eta * grad)
                   for param, grad in zip(self.params, grads)]

        # define functions to train a mini-batch, and to compute the
        # accuracy in validation and test mini-batches.
        i = T.lscalar()  # mini-batch index
        n_class = T.bscalar()  # number of the class
        train_mb = theano.function(
            [i],
            cost,
            updates=updates,
            givens={
                self.x:
                training_x[i * self.mini_batch_size:(i + 1) *
                           self.mini_batch_size],
                self.y:
                training_y[i * self.mini_batch_size:(i + 1) *
                           self.mini_batch_size]
            })
        validate_mb_accuracy = theano.function(
            [i],
            self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                validation_x[i * self.mini_batch_size:(i + 1) *
                             self.mini_batch_size],
                self.y:
                validation_y[i * self.mini_batch_size:(i + 1) *
                             self.mini_batch_size]
            })
        test_mb_accuracy = theano.function(
            [i],
            self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) *
                       self.mini_batch_size],
                self.y:
                test_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            })
        test_mb_accuracies_by_class = theano.function(
            [i, n_class],
            self.layers[-1].accuracies_by_class(self.y, n_class),
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) *
                       self.mini_batch_size],
                self.y:
                test_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            },
            on_unused_input='ignore')
        test_mb_per_by_class = theano.function(
            [i, n_class],
            self.layers[-1].per_by_class(self.y, n_class),
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) *
                       self.mini_batch_size],
                self.y:
                test_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            },
            on_unused_input='ignore')
        self.test_mb_predictions = theano.function(
            [i],
            self.layers[-1].y_out,
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            })
        # Do the actual training
        best_test_accuracy = 0.0
        best_validation_accuracy = 0.0
        epoch = 0
        it = -1
        # Keep track of the test accuracy
        if monitor_test:
            test_acc = []
            test_acc_by_class = []
        while True:
            it += 1
            epoch += 1
            for minibatch_index in range(num_training_batches):
                iteration = num_training_batches * epoch + minibatch_index
                cost_ij = train_mb(minibatch_index)
                if (iteration + 1) % num_training_batches == 0:
                    validation_accuracy = np.mean([
                        validate_mb_accuracy(j)
                        for j in range(num_validation_batches)
                    ])
                    if validation_accuracy >= best_validation_accuracy:
                        #print("This is the best total validation accuracy to date.")
                        best_validation_accuracy = validation_accuracy
                        best_iteration = iteration
                        test_accuracies_by_class = [0] * self.layers[-1].n_out
                        test_per_by_class = [0] * self.layers[-1].n_out
                        it = -1
                    if test_data:
                        test_accuracy = np.mean([
                            test_mb_accuracy(j)
                            for j in range(num_test_batches)
                        ])
                        if monitor_test:
                            test_acc.append(test_accuracy)
                        if test_accuracy > best_test_accuracy:
                            best_test_accuracy = test_accuracy
                            best_iteration_acc = iteration
                        # If you want to track your progress visually, uncomment the following lines:
                        #print("Epoch {0}: test accuracy {1:.2%}".format(
                        #       epoch, test_accuracy))
                        if monitor_test:
                            for i in range(self.layers[-1].n_out):
                                test_accuracies_by_class[i] = np.mean([test_mb_accuracies_by_class(j,i) for j in range(num_test_batches)]) / \
                                                              np.mean([test_mb_per_by_class(j,i) for j in range(num_test_batches)])
                            test_acc_by_class.append(test_accuracies_by_class)

            if it >= no_improvement_in:
                break
        print("Finished training network after {} epochs.".format(epoch))
        print("Best test accuracy of {0:.2%} obtained at iteration {1}".format(
            best_test_accuracy, best_iteration_acc))
        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".
              format(best_validation_accuracy, best_iteration))

        if monitor_test:
            return test_acc, np.transpose(test_acc_by_class)
Exemple #32
0
    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 n_h,
                 f_act=leaky_relu,
                 f_out=softmax,
                 orth_init=True,
                 dropout_rate=0,
                 obj='c'):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param f_act: Hidden-to-hidden activation function
        :param f_out: Output activation function
        :param orth_init: if true, the initialize transition matrix to be orthogonal (bool)
        :param dropout_rate: dropout rate (float)
        :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
        '''
        if orth_init:
            Whh_ = rvs(rng, n_h)
        else:
            Whh_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)),
                               np.sqrt(6. / (n_h + n_h)), (n_h, n_h))

        Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                           np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        bh_ = np.zeros(n_h)
        Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)),
                           np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
        bo_ = np.zeros(n_out)
        h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)),
                          n_h)

        # Theano: Created shared variables
        Whh = theano.shared(name='Whh',
                            value=Whh_.astype(theano.config.floatX))
        Whi = theano.shared(name='Whi',
                            value=Whi_.astype(theano.config.floatX))
        bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
        Woh = theano.shared(name='Woh',
                            value=Woh_.astype(theano.config.floatX))
        bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
        h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

        self.p = [Whh, Whi, Woh, bh, bo, h0]

        seq_len = T.iscalar('seq_len')
        self.seq_len = seq_len
        self.dropout_rate = dropout_rate
        self.x = T.vector()
        x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

        if dropout_rate > 0:
            np.random.seed(int(time.time()))

            # for training
            def masked_forward_prop_step(x_t, h_t_prev):
                h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                mask = np.random.binomial(np.ones(n_h, dtype=int),
                                          1 - dropout_rate)
                masked_h_t = h_t * T.cast(mask, theano.config.floatX)

                return [o_t, masked_h_t]

            # for testing
            def forward_prop_step(x_t, h_t_prev):
                h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                h_t = (1.0 - dropout_rate) * h_t
                return [o_t, h_t]

            [o_train, _], _ = theano.scan(masked_forward_prop_step,
                                          sequences=[x_scan],
                                          outputs_info=[None, h0],
                                          n_steps=seq_len)

            [o_test, _], _ = theano.scan(forward_prop_step,
                                         sequences=[x_scan],
                                         outputs_info=[None, h0],
                                         n_steps=seq_len)

        else:

            def forward_prop_step(x_t, h_t_prev):
                h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                return [o_t, h_t]

            [o_train, _], _ = theano.scan(forward_prop_step,
                                          sequences=[x_scan],
                                          outputs_info=[None, h0],
                                          n_steps=seq_len)
            o_test = o_train

        if obj == 'c':  # classification task
            self.y = T.bscalar('y')
            self.o_train = f_out(o_train[-1])
            self.o_test = f_out(o_test[-1])
            # objective used to compute gradients (training-path output, with dropout)
            self.cost = T.nnet.categorical_crossentropy(
                self.o_train,
                T.eye(n_out)[self.y])
            # accuracy uses the test-path output (activations scaled by 1 - dropout_rate)
            self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1.,
                                     0.)
            self.prediction = np.argmax(self.o_test)
        elif obj == 'r':  # regression task
            self.y = T.dscalar('y')
            self.o_train = o_train[-1]
            self.o_test = o_test[-1]
            # objective used to compute gradients (training-path output, with dropout)
            self.cost = (self.o_train[0] - self.y)**2
            # squared error on the test-path output (activations scaled by 1 - dropout_rate)
            self.accuracy = (self.o_test[0] - self.y)**2
            self.prediction = self.o_test[0]

        _, self.Sigma, _ = T.nlinalg.SVD(full_matrices=1,
                                         compute_uv=1)(self.p[0])
        self.max_singular = T.max(self.Sigma)
        self.min_singular = T.min(self.Sigma)

        self.optimiser = sgd_optimizer(self, 'RNN')
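# A small numpy check (not from the original code) of the dropout convention
# above: units are zeroed with probability dropout_rate at training time and
# activations are scaled by (1 - dropout_rate) at test time, so the test-time
# forward pass matches the training-time expectation.
import numpy as np

rng_check = np.random.RandomState(0)
h, p = np.ones(100000), 0.3                      # activations, dropout rate
masked = h * rng_check.binomial(1, 1.0 - p, size=h.shape)
print(masked.mean())                             # ~0.70 (stochastic)
print(((1.0 - p) * h).mean())                    # exactly 0.70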
Exemple #33
0
    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 n_h,
                 n_r,
                 margin=1.0,
                 sig_mean=1.0,
                 f_act=leaky_relu,
                 f_out=softmax,
                 obj='c'):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param n_r: Number of reflection vectors (int)
        :param f_act: Hidden-to-hidden activation function
        :param f_out: Output activation function
        :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
        '''
        U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
        norms_U_ = np.linalg.norm(U_, axis=0)
        U_ = 1. / norms_U_ * U_

        V_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
        norms_V_ = np.linalg.norm(V_, axis=0)
        V_ = 1. / norms_V_ * V_

        #Sig_ = np.ones( n_h)
        P_ = np.zeros(n_h)

        Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                           np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        bh_ = np.zeros(n_h)
        Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)),
                           np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
        bo_ = np.zeros(n_out)
        h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)),
                          n_h)

        # Theano: Created shared variables
        Whi = theano.shared(name='Whi',
                            value=Whi_.astype(theano.config.floatX))
        U = theano.shared(name='U', value=U_.astype(theano.config.floatX))
        V = theano.shared(name='V', value=V_.astype(theano.config.floatX))
        #Sig = theano.shared(name='Sig', value=Sig_.astype(theano.config.floatX))
        P = theano.shared(name='P', value=P_.astype(theano.config.floatX))
        bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
        Woh = theano.shared(name='Woh',
                            value=Woh_.astype(theano.config.floatX))
        bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
        h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

        #self.p = [U, V, Sig, Whi, Woh, bh, bo, h0]
        self.p = [U, V, P, Whi, Woh, bh, bo, h0]
        seq_len = T.iscalar('seq_len')
        self.seq_len = seq_len

        self.x = T.vector()
        #x_scan = T.shape_padright(self.x)
        x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)
        if n_h != n_r:  # Number of reflection vectors is less than the hidden dimension

            def forward_prop_step(x_t, h_t_prev):
                Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
                h_t = f_act(Whi.dot(x_t) + svd_H_wy(U, V, Sig, h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                return [o_t, h_t]
        else:

            def forward_prop_step(x_t, h_t_prev):
                Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
                Hu1SigHv1 = T.set_subtensor(Sig[-1],
                                            Sig[-1] * U[-1, -1] * V[-1, -1])
                h_t = f_act(
                    Whi.dot(x_t) +
                    svd_H_wy(U[:, :-1], V[:, :-1], Hu1SigHv1, h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                return [o_t, h_t]

        [o_scan, _], _ = theano.scan(forward_prop_step,
                                     sequences=[x_scan],
                                     outputs_info=[None, h0],
                                     n_steps=seq_len)

        if obj == 'c':  # classification task
            self.y = T.bscalar('y')
            self.o = f_out(o_scan[-1])
            # objective used to compute gradients
            self.cost = T.nnet.categorical_crossentropy(
                self.o,
                T.eye(n_out)[self.y])
            # 0/1 accuracy of the prediction
            self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
            self.prediction = np.argmax(self.o)
        elif obj == 'r':  # regression task
            self.y = T.dscalar('y')
            self.o = o_scan[-1]
            # objective used to compute gradients
            self.cost = (self.o[0] - self.y)**2
            # squared error used as the accuracy measure
            self.accuracy = (self.o[0] - self.y)**2
            self.prediction = self.o[0]

        self.max_singular = 2 * margin * (sigmoid(T.max(self.p[2])) -
                                          0.5) + sig_mean
        self.min_singular = 2 * margin * (sigmoid(T.min(self.p[2])) -
                                          0.5) + sig_mean

        self.optimiser = sgd_optimizer(self, 'svdRNN')
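# A quick numpy check (not from the original code) of the singular-value
# parameterisation above: Sig = 2*margin*(sigmoid(P) - 0.5) + sig_mean keeps
# every singular value strictly inside (sig_mean - margin, sig_mean + margin).
import numpy as np

def sigmoid_np(z):
    return 1.0 / (1.0 + np.exp(-z))

margin, sig_mean = 1.0, 1.0
P_check = np.linspace(-10, 10, 7)
Sig_check = 2 * margin * (sigmoid_np(P_check) - 0.5) + sig_mean
assert np.all((Sig_check > sig_mean - margin) & (Sig_check < sig_mean + margin))
print(Sig_check)   # all values stay within (0, 2) for these settings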
Exemple #34
0
 def __init__(self, n_features):
     self.n_features = n_features
     self.x = T.fvector("x")
     self.y = T.bscalar("y")
     self.W = theano.shared(rng.randn(n_features).astype(theano.config.floatX), name="W")
     self.b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
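# A minimal sketch (the forward pass, cost and update rule below are
# assumptions, not the original code) of how the symbolic variables declared
# in this __init__ could be wired into a trainable logistic unit.
import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(0)
n_features = 4
x = T.fvector("x")
y = T.bscalar("y")
W = theano.shared(rng.randn(n_features).astype(theano.config.floatX), name="W")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")

p_1 = T.nnet.sigmoid(T.dot(W, x) + b)        # P(y = 1 | x)
cost = T.nnet.binary_crossentropy(p_1, y)
gW, gb = T.grad(cost, [W, b])
lr = numpy.asarray(0.1, dtype=theano.config.floatX)
train = theano.function([x, y], cost,
                        updates=[(W, W - lr * gW), (b, b - lr * gb)])
predict = theano.function([x], p_1 > 0.5)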
Exemple #35
0
    valid_x = theano.shared(valid_x, borrow=True)

    train_y = T.cast(theano.shared(train_y, borrow=True), dtype='int32')
    valid_y = T.cast(theano.shared(valid_y, borrow=True), dtype='int32')

    # allocate learning rate and momentum shared variables
    learning_rate = theano.shared(
        np.array(learning_rate_schedule[0], dtype=theano.config.floatX))
    momentum = theano.shared(
        np.array(momentum_schedule[0], dtype=theano.config.floatX))

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels
    dropout_active = T.bscalar(
        'dropout_active')  # a flag to enable and disable dropout

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print 'Building the model ...'

    # Reshape matrix of rasterized images of shape (batch_size, 48 * 48)
    # to a 4D tensor, compatible with our ConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 48, 48))

    # layer10 = ConvPoolLayer(
    #     rng,
    #     input=layer0_input,
    #     image_shape=(batch_size, 1, 98, 98),
    #     filter_shape=(nkerns[0], 1, 4, 4),
Exemple #36
0
                                               max_egs=int(opts.num_from_train),
                                               parse_mode=opts.parse_mode)
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev),
                                         parse_mode=opts.parse_mode)
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')  # sequence for sentence one
s2_idxs = T.ivector('s2')  # sequence for sentence two
actual_y = T.ivector('y')  # single for sentence pair label; 0, 1 or 2

# dropout keep prob for post concat, pre MLP
apply_dropout = T.bscalar('apply_dropout')  # dropout.{APPLY_DROPOUT|NO_DROPOUT}
keep_prob = theano.shared(opts.keep_prob)  # recall 1.0 => noop
keep_prob = T.cast(keep_prob, 'float32')  # shared weirdity, how to set in init (?)

# keep track of different "layers" that handle their own gradients.
# includes rnns, final concat & softmax and, potentially, special handling for
# tied embeddings
layers = []

# decide the set of sequence idxs we'll be processing. there will always be two
# for the forward passes over s1 and s2 and, optionally, two more for the
# reverse pass over s1 & s2 in the bidirectional case.
idxs = [s1_idxs, s2_idxs]
names = ["s1f", "s2f"]
if opts.bidirectional:
    idxs.extend([s1_idxs[::-1], s2_idxs[::-1]])
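# A minimal sketch addressing the "how to set in init (?)" comment above: keep
# a handle on the shared variable before the cast, so its value can still be
# changed later with set_value (names below are illustrative, not from opts).
import numpy as np
import theano
import theano.tensor as T

keep_prob_shared = theano.shared(np.float32(0.8), name='keep_prob')
keep_prob_sym = T.cast(keep_prob_shared, 'float32')  # use this in the graph

keep_prob_shared.set_value(np.float32(1.0))          # 1.0 => dropout is a no-op
print(keep_prob_shared.get_value())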
Exemple #37
0
    def __init__(self,
                 rng,
                 n_in,
                 n_per_base,
                 n_out,
                 n_layer=1,
                 basefuncs1=None,
                 basefuncs2=None,
                 gradient=None,
                 with_shortcuts=False):
        """Initialize the parameters for the multilayer function graph

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
            which the datapoints lie

        :type n_layer: int
        :param n_layer: number of hidden layers

        :type n_per_base: int
        :param n_per_base: number of nodes per basis function; see FGLayer

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
            which the labels lie

        :type basefuncs1: [int]
        :param basefuncs1: see FGLayer

        :type basefuncs2: [int]
        :param basefuncs2: see FGLayer

        :type gradient: string
        :param gradient: gradient descent algorithm: "sgd+" (also used when None),
            "sgd", "adam", "adadelta", "rmsprop" or "nag"

        :type with_shortcuts: bool
        :param with_shortcuts: whether to use shortcut connections
            (the output layer is connected to all hidden layers)
        """
        self.input = T.matrix('input')  # the data is presented as vector input
        self.labels = T.matrix(
            'labels')  # the labels are presented as a vector of continuous values
        self.rng = rng
        self.n_layers = n_layer
        self.hidden_layers = []
        self.params = []
        self.n_in = n_in
        self.n_out = n_out
        self.with_shortcuts = with_shortcuts
        self.fixL0 = False

        for l in range(n_layer):
            if l == 0:
                layer_input = self.input
                n_input = n_in
            else:
                layer_input = self.hidden_layers[l - 1].output
                n_input = self.hidden_layers[l - 1].n_out

            hiddenLayer = FGLayer(
                rng=rng,
                inp=layer_input,
                n_in=n_input,
                n_per_base=n_per_base,
                basefuncs1=basefuncs1,
                basefuncs2=basefuncs2,
                layer_idx=l,
            )
            self.hidden_layers.append(hiddenLayer)
            self.params.extend(hiddenLayer.params)

        div_thresh = T.scalar("div_thresh")

        # The linear output layer: either it takes as input the output of ALL previous layers
        if self.with_shortcuts:
            output_layer_inp = T.concatenate(
                [l.output for l in reversed(self.hidden_layers)], axis=1)
            output_layer_n_in = sum([l.n_out for l in self.hidden_layers])
        else:  # or just the output of the last hidden layer
            output_layer_inp = self.hidden_layers[-1].output
            output_layer_n_in = self.hidden_layers[-1].n_out
        self.output_layer = DivisionRegression(rng=rng,
                                               inp=output_layer_inp,
                                               n_in=output_layer_n_in,
                                               n_out=n_out,
                                               div_thresh=div_thresh)

        self.params.extend(self.output_layer.params)

        self.evalfun = theano.function(
            inputs=[self.input, In(div_thresh, value=0.0001)],
            outputs=self.output_layer.output)

        L1_reg = T.scalar('L1_reg')
        L2_reg = T.scalar('L2_reg')
        fixL0 = T.bscalar('fixL0')
        self.L1 = self.output_layer.L1 + sum(
            [l.L1 for l in self.hidden_layers])
        self.L2_sqr = self.output_layer.L2_sqr + sum(
            [l.L2_sqr for l in self.hidden_layers])
        self.penalty = self.output_layer.penalty

        self.loss = self.output_layer.loss
        self.errors = self.loss
        self.cost = (self.loss(self.labels) + L1_reg * self.L1 +
                     L2_reg * self.L2_sqr + self.penalty)

        #Extrapol penalty
        self.extrapol_cost = self.output_layer.extrapol_loss

        learning_rate = T.scalar('learning_rate')

        def process_updates(par, newp):
            # print par.name
            if par.name == "W":
                # if fixL0 is True, then keep small weights at 0
                return par, ifelse(
                    fixL0, T.switch(T.abs_(par) < 0.001, par * 0, newp), newp)
            return par, newp

        print "Gradient:", gradient
        update = None
        if gradient == 'sgd+' or gradient == 'sgd' or gradient == None:
            gparams = [T.grad(self.cost, param) for param in self.params]
            update = OrderedDict([
                (param, param - (learning_rate * gparam).clip(-1.0, 1.0))
                for param, gparam in zip(self.params, gparams)
            ])
        elif gradient == 'adam':
            update = Lupdates.adam(self.cost,
                                   self.params,
                                   learning_rate,
                                   epsilon=1e-04)
        elif gradient == 'adadelta':
            update = Lupdates.adadelta(self.cost, self.params, learning_rate)
        elif gradient == 'rmsprop':
            update = Lupdates.rmsprop(self.cost, self.params, learning_rate)
        elif gradient == 'nag':
            update = Lupdates.nesterov_momentum(self.cost, self.params,
                                                learning_rate)
        else:
            raise ValueError("unknown gradient " + gradient)

        #Extrapol sanity gradient computation:

        extrapol_updates = Lupdates.adam(self.extrapol_cost,
                                         self.params,
                                         learning_rate,
                                         epsilon=1e-04)

        updates = [process_updates(*up) for up in update.items()]
        self.train_model = theano.function(
            inputs=[
                self.input, self.labels, L1_reg, L2_reg, fixL0, learning_rate,
                div_thresh
            ],
            outputs=self.cost,
            updates=updates,
        )
        # avoid too large outputs in extrapolation domain
        self.remove_extrapol_error = theano.function(
            inputs=[self.input, learning_rate, div_thresh],
            outputs=self.extrapol_cost,
            updates=extrapol_updates,
        )

        self.test_model = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
        self.validate_model = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
        self.L1_loss = theano.function(
            inputs=[],
            outputs=self.L1,
        )
        self.MSE = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
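
A rough usage sketch for the functions compiled above; model, x_batch, y_batch and the hyper-parameter values are placeholders, and the argument order follows the inputs= lists in the example:

# one training step: data, labels, L1/L2 weights, fixL0 flag (0/1), learning rate, division threshold
cost = model.train_model(x_batch, y_batch, 0.001, 0.001, 0, 0.01, 1e-4)

# occasionally damp overly large outputs in the extrapolation domain
extrapol_cost = model.remove_extrapol_error(x_extra, 0.01, 1e-4)

# evaluation: div_thresh was declared with In(..., value=0.0001), so it can be omitted
val_err = model.validate_model(x_val, y_val)
mse = model.MSE(x_test, y_test)
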
Exemple #38
0
    def __init__(self, dataset,
                 learning_rate=0.001,
                 decrease_constant=0,
                 hidden_sizes=[500],
                 random_seed=1234,
                 batch_size=1,
                 hidden_activation=T.nnet.sigmoid,
                 use_cond_mask=False,
                 direct_input_connect="None",
                 direct_output_connect=False,
                 update_rule="None",
                 dropout_rate=0,
                 weights_initialization="Uniform",
                 mask_distribution=0):

        input_size = dataset['input_size']
        self.shuffled_once = False

        self.seed_generator = SeedGenerator(random_seed)

        self.trng = T.shared_randomstreams.RandomStreams(self.seed_generator.get())

        # Get the weights initializer by string name
        weights_initialization = getattr(
            WeightsInitializer(self.seed_generator.get()), weights_initialization)

        # Building the model's graph
        input = T.matrix(name="input")
        target = T.matrix(name="target")
        is_train = T.bscalar(name="is_train")

        # Initialize the mask
        self.mask_generator = MaskGenerator(
            input_size, hidden_sizes, mask_distribution, self.seed_generator.get())

        # Initialize layers
        input_layer = ConditionningMaskedLayer(layerIdx=0,
                                               input=input,
                                               n_in=input_size,
                                               n_out=hidden_sizes[0],
                                               activation=hidden_activation,
                                               weights_initialization=weights_initialization,
                                               mask_generator=self.mask_generator,
                                               use_cond_mask=use_cond_mask)
        self.layers = [dropoutLayerDecorator(input_layer, self.trng, is_train, dropout_rate)]
        # Now the hidden layers
        for i in range(1, len(hidden_sizes)):
            previous_layer = self.layers[i - 1]
            hidden_layer = DirectInputConnectConditionningMaskedLayer(layerIdx=i,
                                                                      input=previous_layer.output,
                                                                      n_in=hidden_sizes[i - 1],
                                                                      n_out=hidden_sizes[i],
                                                                      activation=hidden_activation,
                                                                      weights_initialization=weights_initialization,
                                                                      mask_generator=self.mask_generator,
                                                                      use_cond_mask=use_cond_mask,
                                                                      direct_input=input if direct_input_connect == "Full" and previous_layer.output != input else None)
            self.layers += [dropoutLayerDecorator(hidden_layer, self.trng, is_train, dropout_rate)]
        # And the output layer
        outputLayerIdx = len(self.layers)
        previous_layer = self.layers[outputLayerIdx - 1]
        self.layers += [DirectOutputInputConnectConditionningMaskedOutputLayer(layerIdx=outputLayerIdx,
                                                                               input=previous_layer.output,
                                                                               n_in=hidden_sizes[
                                                                                   outputLayerIdx - 1],
                                                                               n_out=input_size,
                                                                               activation=T.nnet.sigmoid,
                                                                               weights_initialization=weights_initialization,
                                                                               mask_generator=self.mask_generator,
                                                                               use_cond_mask=use_cond_mask,
                                                                               direct_input=input if (
                                                                                   direct_input_connect == "Full" or direct_input_connect == "Output") and previous_layer.output != input else None,
                                                                               direct_outputs=[(layer.layer_idx, layer.n_in, layer.input) for layerIdx, layer in enumerate(self.layers[1:-1])] if direct_output_connect else [])]

        # The loss function
        output = self.layers[-1].output
        pre_output = self.layers[-1].lin_output
        log_prob = - \
            T.sum(T.nnet.softplus(-target * pre_output + (1 - target) * pre_output), axis=1)
        # log_prob = T.sum(target * T.log(output) + (1 - target) * T.log(1 - output), axis=1)
        loss = (-log_prob).mean()

        # How to update the parameters
        self.parameters = [param for layer in self.layers for param in layer.params]
        parameters_gradient = T.grad(loss, self.parameters)

        # Initialize update_rule
        if update_rule == "None":
            self.update_rule = DecreasingLearningRate(learning_rate, decrease_constant)
        elif update_rule == "adadelta":
            self.update_rule = AdaDelta(decay=decrease_constant, epsilon=learning_rate)
        elif update_rule == "adagrad":
            self.update_rule = AdaGrad(learning_rate=learning_rate)
        elif update_rule == "rmsprop":
            self.update_rule = RMSProp(learning_rate=learning_rate, decay=decrease_constant)
        elif update_rule == "adam":
            self.update_rule = Adam(learning_rate=learning_rate)
        elif update_rule == "adam_paper":
            self.update_rule = Adam_paper(learning_rate=learning_rate)
        updates = self.update_rule.get_updates(list(zip(self.parameters, parameters_gradient)))

        # How to shuffle weights
        masks_updates = [
            layer_mask_update for layer in self.layers for layer_mask_update in layer.shuffle_update]
        self.update_masks = theano.function(name='update_masks',
                                            inputs=[],
                                            updates=masks_updates)
        #
        # Functions to train and use the model
        index = T.lscalar()
        self.learn = theano.function(name='learn',
                                     inputs=[index, is_train],
                                     outputs=loss,
                                     updates=updates,
                                     givens={input: dataset['train']['data'][
                                         index * batch_size:(index + 1) * batch_size], target: dataset['train']['data'][index * batch_size:(index + 1) * batch_size]},
                                     on_unused_input='ignore')  # ignore for when dropout is absent

        self.use = theano.function(name='use',
                                   inputs=[input, is_train],
                                   outputs=output,
                                   on_unused_input='ignore')  # ignore for when dropout is absent

        # Test functions
        self.valid_log_prob = theano.function(name='valid_log_prob',
                                              inputs=[is_train],
                                              outputs=log_prob,
                                              givens={
                                                  input: dataset['valid']['data'], target: dataset['valid']['data']},
                                              on_unused_input='ignore')  # ignore for when dropout is absent
        self.train_log_prob = theano.function(name='train_log_prob',
                                              inputs=[is_train],
                                              outputs=log_prob,
                                              givens={
                                                  input: dataset['train']['data'], target: dataset['train']['data']},
                                              on_unused_input='ignore')  # ignore for when dropout is absent
        self.train_log_prob_batch = theano.function(name='train_log_prob_batch',
                                                    inputs=[index, is_train],
                                                    outputs=log_prob,
                                                    givens={input: dataset['train']['data'][
                                                        index * 1000:(index + 1) * 1000], target: dataset['train']['data'][index * 1000:(index + 1) * 1000]},
                                                    on_unused_input='ignore')  # ignore for when dropout is absent
        self.test_log_prob = theano.function(name='test_log_prob',
                                             inputs=[is_train],
                                             outputs=log_prob,
                                             givens={
                                                 input: dataset['test']['data'], target: dataset['test']['data']},
                                             on_unused_input='ignore')  # ignore for when dropout is absent

        # Functions for verify gradient
        self.useloss = theano.function(name='useloss',
                                       inputs=[input, target, is_train],
                                       outputs=loss,
                                       on_unused_input='ignore')  # ignore for when dropout is absent
        self.learngrad = theano.function(name='learn',
                                         inputs=[index, is_train],
                                         outputs=parameters_gradient,
                                         givens={input: dataset['train']['data'][
                                             index * batch_size:(index + 1) * batch_size], target: dataset['train']['data'][index * batch_size:(index + 1) * batch_size]},
                                         on_unused_input='ignore')  # ignore for when dropout is absent

        #
        # adding functions to extract embeddings from each layer
        self.embedding_funcs = [theano.function(name='embedding-{}'.format(i),
                                                inputs=[input, is_train],
                                                outputs=layer.output,
                                                # givens={input: dataset['train']['data'][
                                                # index * batch_size:(index + 1) * batch_size]},
                                                on_unused_input='ignore')
                                for i, layer in enumerate(self.layers[:-1])]

        #
        # NOTE: the predict method (for decoding) is possible only when there are no skip
        # connections to the output layer
        if direct_input_connect == 'None' and not direct_output_connect:
            print('No skip connections! defining decoding function')
            pred_threshold = T.vector()
            last_layer_embeddings = T.matrix(name="ll-embeddings")
            output_probs = T.matrix(name="output-probs")
            # T.dot(last_layer_embeddings, self.layers[-1].W) + self.layers[-1].b
            pred_probs = output
            predictions = T.switch(pred_probs < pred_threshold, 0, 1)
            thresholded_output = T.switch(output_probs < pred_threshold, 0, 1)

            self.predict_probs = theano.function(name='predict_probs',
                                                 inputs=[last_layer_embeddings, is_train],
                                                 outputs=pred_probs,
                                                 givens={self.layers[-1].input:
                                                         last_layer_embeddings},
                                                 on_unused_input='ignore')
            self.threshold_probs = theano.function(name='threshold_probs',
                                                   inputs=[output_probs, pred_threshold],
                                                   outputs=thresholded_output,
                                                   on_unused_input='ignore')
            self.predict_func = theano.function(name='predict',
                                                inputs=[last_layer_embeddings, pred_threshold],
                                                outputs=predictions,
                                                givens={self.layers[-1].input:
                                                        last_layer_embeddings},
                                                on_unused_input='ignore')
        else:
            self.predict_func = None
            print('Skip connections detected! decoding will fail!')
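
A hedged sketch of a training loop over the functions compiled above; model, n_epochs and n_batches are placeholders, and is_train is passed as 1/0 to switch dropout on and off:

import numpy as np

for epoch in range(n_epochs):
    model.update_masks()                   # resample the connectivity masks
    losses = []
    for i in range(n_batches):
        losses.append(model.learn(i, 1))   # is_train=1: dropout active
    # evaluate the mean log-likelihood with dropout disabled
    valid_ll = model.valid_log_prob(0).mean()
    print(epoch, np.mean(losses), valid_ll)
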
Exemple #39
0
"""Wrapper for creating Theano functions for training/inference mode"""

import theano
from theano import tensor as T

# 1 = training mode
# 0 = inference mode
train_mode = T.bscalar('train_mode')

def function(inputs=[], outputs=[], default_mode=0, **kwargs):
    inputs = list(inputs)
    outputs_list = list(outputs) if type(outputs) in (list, tuple) \
                   else [outputs]
    use_train_mode = train_mode in theano.gof.graph.ancestors(
            inputs + outputs_list)
    extra_args = [train_mode] if use_train_mode else []

    f = theano.function(
            list(inputs)+extra_args,
            outputs,
            on_unused_input='warn',
            **kwargs)
    def g(*args):
        if default_mode is None:
            # args[-1] is the train_mode value supplied by the caller
            if use_train_mode:
                # f() takes train_mode as its last input, pass arguments through
                return f(*args)
            else:
                # f() does not take train_mode, drop the last argument
                return f(*(args[:-1]))
        else:
            # NOTE: the example is truncated here; the branch below is an assumed
            # completion that feeds the fixed default_mode when the graph needs it
            if use_train_mode:
                return f(*(list(args) + [default_mode]))
            return f(*args)
    return g
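
A small usage sketch for the wrapper, assuming the completion above; the graph y is a toy that depends on train_mode and x_val is a placeholder array:

x = T.matrix('x')
y = T.switch(train_mode, 0.5 * x, x)   # toy graph that references train_mode

# with default_mode=None the caller supplies the mode as the last argument
g = function([x], y, default_mode=None)
out_train = g(x_val, 1)
out_infer = g(x_val, 0)

# with a fixed default_mode the wrapper fills the flag in (assumed completion above)
f = function([x], y, default_mode=0)
out = f(x_val)
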
Exemple #40
0
    def __init__(self, layer_sizes, n_samples, alpha, learning_rate, v_prior,
                 batch_size, X_train, y_train, N_train):

        layer_sizes = copy.copy(layer_sizes)
        layer_sizes[0] = layer_sizes[0] + 1
        print(layer_sizes)
        self.batch_size = batch_size
        self.N_train = N_train
        self.X_train = X_train
        self.y_train = y_train

        self.rate = learning_rate

        # We create the network

        self.network = network.Network(layer_sizes, n_samples, v_prior,
                                       N_train)

        # index to a batch

        index = T.lscalar()
        self.indexes = T.vector('index', dtype='int32')
        indexes_train = theano.shared(value=np.array(range(0, N_train),
                                                     dtype=np.int32),
                                      borrow=True)

        self.x = T.tensor3('x', dtype=theano.config.floatX)
        self.y = T.matrix('y', dtype=theano.config.floatX)
        self.lr = T.fscalar()

        # The logarithm of the values for the likelihood factors
        sampl = T.bscalar()
        self.fwpass = theano.function(outputs=self.network.output(
            self.x, False, samples=sampl, use_indices=False),
                                      inputs=[self.x, sampl],
                                      allow_input_downcast=True)

        ll_train = self.network.log_likelihood_values(self.x, self.y,
                                                      self.indexes, 0.0, 1.0)


        self.estimate_marginal_ll = (
            -1.0 * N_train / (self.x.shape[1] * alpha) *
            T.sum(LogSumExp(alpha * (T.sum(ll_train, 2)
                                     - self.network.log_f_hat()
                                     - self.network.log_f_hat_z()), 0)
                  + T.log(1.0 / n_samples))
            - self.network.log_normalizer_q()
            - 1.0 * N_train / self.x.shape[1] * self.network.log_normalizer_q_z()
            + self.network.log_Z_prior())

        # We create a theano function for updating q
        upd = adam(self.estimate_marginal_ll,
                   self.network.params,
                   indexes_train[index * batch_size:(index + 1) * batch_size],
                   self.rate,
                   rescale_local=np.float32(N_train / batch_size))

        self.process_minibatch = theano.function(
            [index], self.estimate_marginal_ll,
            updates=upd,
            givens={
                self.x: T.tile(self.X_train[index * batch_size:(index + 1) * batch_size],
                               [n_samples, 1, 1]),
                self.y: self.y_train[index * batch_size:(index + 1) * batch_size],
                self.indexes: indexes_train[index * batch_size:(index + 1) * batch_size],
            })

        # We create a theano function for making predictions

        self.error_minibatch_train = theano.function(
            [index],
            T.sum((T.mean(
                self.network.output(self.x, self.indexes), 0,
                keepdims=True)[0, :, :] - self.y)**2) / layer_sizes[-1],
            givens={
                self.x:
                T.tile(
                    self.X_train[index * batch_size:(index + 1) * batch_size],
                    [n_samples, 1, 1]),
                self.y:
                self.y_train[index * batch_size:(index + 1) * batch_size],
                self.indexes:
                indexes_train[index * batch_size:(index + 1) * batch_size]
            })

        self.ll_minibatch_train = theano.function(
            [index],
            T.sum(LogSumExp(T.sum(ll_train, 2), 0) + T.log(1.0 / n_samples)),
            givens={
                self.x: T.tile(self.X_train[index * batch_size:(index + 1) * batch_size],
                               [n_samples, 1, 1]),
                self.y: self.y_train[index * batch_size:(index + 1) * batch_size],
                self.indexes: indexes_train[index * batch_size:(index + 1) * batch_size],
            })
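
A rough sketch of driving the minibatch functions above; bnn is an instance of this class and n_epochs / n_batches are placeholders:

import numpy as np

for epoch in range(n_epochs):
    for i in range(n_batches):
        energy = bnn.process_minibatch(i)   # one Adam step on the energy estimate
    train_err = np.mean([bnn.error_minibatch_train(i) for i in range(n_batches)])
    train_ll = np.mean([bnn.ll_minibatch_train(i) for i in range(n_batches)])
    print(epoch, energy, train_err, train_ll)
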
Exemple #41
0
    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 n_h,
                 n_r,
                 f_act=leaky_relu,
                 f_out=softmax,
                 obj='c'):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param n_r: Number of reflection vectors (int)
        :param f_act: Hidden-to-hidden activation function
        :param f_out: Output activation function
        :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
        '''
        U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
        norms = np.linalg.norm(U_, axis=0)
        U_ = 1. / norms * U_

        Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                           np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
        bh_ = np.zeros(n_h)
        Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)),
                           np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
        bo_ = np.zeros(n_out)
        h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)),
                          n_h)

        # Theano: Created shared variables
        Whi = theano.shared(name='Whi',
                            value=Whi_.astype(theano.config.floatX))
        U = theano.shared(name='U', value=U_.astype(theano.config.floatX))
        bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
        Woh = theano.shared(name='Woh',
                            value=Woh_.astype(theano.config.floatX))
        bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
        h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

        self.p = [U, Whi, Woh, bh, bo, h0]

        seq_len = T.iscalar('seq_len')
        self.seq_len = seq_len

        self.x = T.vector()
        #x_scan = T.shape_padright(self.x)
        x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)
        if n_h != n_r:  # Number of reflection vectors is less than the hidden dimension

            def forward_prop_step(x_t, h_t_prev):
                h_t = f_act(Whi.dot(x_t) + H_wy(U, h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                return [o_t, h_t]
        else:

            def forward_prop_step(x_t, h_t_prev):
                h_t_prev = T.set_subtensor(h_t_prev[-1],
                                           h_t_prev[-1] * U[-1, -1])
                h_t = f_act(Whi.dot(x_t) + H_wy(U[:, :-1], h_t_prev) + bh)
                o_t = Woh.dot(h_t) + bo
                return [o_t, h_t]

        ## For loop version below (when n_r < n_h)
        # def forward_prop_step(x_t, h_t_prev):
        #     Wh = h_t_prev
        #     for i in range(n_r):
        #         Wh -= 2. * U[:, n_r - i - 1] * T.dot(U[:, n_r - i - 1], Wh)
        #     h_t = f_act(Whi.dot(x_t) + Wh + bh)
        #     o_t = Woh.dot(h_t) + bo
        #     return [o_t, h_t]

        [o_scan, _], _ = theano.scan(forward_prop_step,
                                     sequences=[x_scan],
                                     outputs_info=[None, h0],
                                     n_steps=seq_len)

        if obj == 'c':  # classification task
            self.y = T.bscalar('y')
            self.o = f_out(o_scan[-1])
            # objective used to compute the gradient
            self.cost = T.nnet.categorical_crossentropy(
                self.o,
                T.eye(n_out)[self.y])
            # accuracy: 1.0 if the argmax prediction matches the label, else 0.0
            self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
            self.prediction = np.argmax(self.o)
        elif obj == 'r':  # regression task
            self.y = T.dscalar('y')
            self.o = o_scan[-1]
            # objective used to compute the gradient (squared error)
            self.cost = (self.o[0] - self.y)**2
            # squared error reported as the accuracy metric as well
            self.accuracy = (self.o[0] - self.y)**2
            self.prediction = self.o[0]

        self.optimiser = sgd_optimizer(self, 'oRNN')
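
For reference, H_wy in the recurrence applies a product of Householder reflections built from the columns of U; a NumPy sketch of the same operation, mirroring the commented for-loop version above (householder_product is an illustrative name):

import numpy as np

def householder_product(U, h):
    # apply (I - 2 u u^T) for each unit-norm column u of U, last column first
    out = h.copy()
    n_r = U.shape[1]
    for i in range(n_r):
        u = U[:, n_r - i - 1]
        out = out - 2.0 * u * np.dot(u, out)
    return out

rng = np.random.RandomState(0)
U = rng.normal(size=(6, 3))
U /= np.linalg.norm(U, axis=0)          # unit-norm reflection vectors, as in U_
h = rng.normal(size=6)
h_rot = householder_product(U, h)
print(np.allclose(np.linalg.norm(h_rot), np.linalg.norm(h)))  # reflections preserve norm
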
Exemple #42
0
    def make_node(self, *inputs):
        inputs = [tt.as_tensor_variable(i) for i in inputs]
        outputs = [tt.bscalar()]
        return gof.Apply(self, inputs, outputs)
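
A make_node like the one above normally sits inside a custom Op; a minimal, hypothetical Op whose single output is a bscalar might look like this (AllPositive and its perform logic are illustrative only):

import numpy as np
import theano
import theano.tensor as tt
from theano import gof

class AllPositive(gof.Op):
    __props__ = ()

    def make_node(self, *inputs):
        inputs = [tt.as_tensor_variable(i) for i in inputs]
        outputs = [tt.bscalar()]
        return gof.Apply(self, inputs, outputs)

    def perform(self, node, inputs, output_storage):
        # write an int8 scalar into the pre-allocated output slot
        output_storage[0][0] = np.int8(all(np.all(i > 0) for i in inputs))

x = tt.vector('x')
f = theano.function([x], AllPositive()(x))
print(f([1.0, 2.0]))  # 1
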
Exemple #43
0
    def optimiser(self, num_samples, update, update_kwargs, saved_update=None):

        overdisp_dim = T.bscalar('overdisp_dim')
        overdisp_s = T.bscalar('overdisp_s')

        batch = T.matrix('batch')
        batch_rep = T.repeat(batch, num_samples, axis=0)

        h_regular_rep = self.recognition_model.get_samples_latents_regular(
            batch_rep)
        h_overdisp = self.recognition_model.get_samples_latents_overdisp(
            batch, overdisp_dim)
        h_rep = T.set_subtensor(
            h_regular_rep[overdisp_s::num_samples, overdisp_dim], h_overdisp)

        log_p_h = self.generative_model.log_p_h(h_rep)
        log_p_x = self.generative_model.log_p_x(h_rep, batch_rep)
        entropies_h = self.recognition_model.entropies_latents(
            h_rep, batch_rep)

        log_w_rep = log_p_x + entropies_h + log_p_h
        log_w_matrix = log_w_rep.reshape((batch.shape[0], num_samples))

        v = self.recognition_model.importance_weights_latents(
            h_overdisp, batch, overdisp_dim)

        log_u_matrix = T.repeat(v, num_samples).reshape(
            (batch.shape[0], num_samples)) + log_w_matrix
        log_u_rep = log_u_matrix.flatten()

        log_u_minus_max = log_u_matrix - T.max(
            log_u_matrix, axis=1, keepdims=True)
        u_matrix = T.exp(log_u_minus_max)
        u_normalized_matrix = u_matrix / T.sum(u_matrix, axis=1, keepdims=True)
        u_normalized_rep = T.reshape(u_normalized_matrix, log_w_rep.shape)

        params = self.generative_model.get_params(
        ) + self.recognition_model.get_params()[:-1]

        dummy_vec = T.vector(dtype=theano.config.floatX)
        grads = theano.clone(T.grad(-T.dot(log_u_rep, dummy_vec), params),
                             replace={dummy_vec: u_normalized_rep})

        tau = self.recognition_model.get_params()[-1]

        all_grads, _ = theano.scan(
            lambda s, log_u, u_norm: theano.clone(
                T.grad(-T.dot(log_u[s], dummy_vec), params),
                replace={dummy_vec: u_norm[s]}),
            sequences=[T.arange(log_u_matrix.T.shape[0])],
            non_sequences=[log_u_matrix.T, u_normalized_matrix.T],
        )

        variance = T.sum([T.sum(T.var(g, axis=0)) for g in all_grads])

        grad_tau = T.grad(variance, tau)

        grads += [grad_tau]

        params = self.generative_model.get_params(
        ) + self.recognition_model.get_params()

        update_kwargs['loss_or_grads'] = grads
        update_kwargs['params'] = params

        updates = update(**update_kwargs)

        if saved_update is not None:
            for shared_var, saved_var in zip(updates, saved_update.keys()):
                shared_var.set_value(saved_var.get_value())

        optimiser = theano.function(
            inputs=[batch, overdisp_dim, overdisp_s],
            outputs=T.dot(log_u_rep, u_normalized_rep),
            updates=updates,
        )

        return optimiser, updates
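
The update / update_kwargs arguments follow a Lasagne-style signature update(loss_or_grads, params, ...); a hedged usage sketch, with model, batch_data and the hyper-parameters as placeholders:

from lasagne import updates as lasagne_updates

optimiser, opt_updates = model.optimiser(
    num_samples=10,
    update=lasagne_updates.adam,
    update_kwargs={'learning_rate': 1e-3},
)

# overdisp_dim picks the over-dispersed latent dimension, overdisp_s the sample slot it replaces
objective = optimiser(batch_data, 0, 0)
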