Example #1
def adadelta(lr, tparams, grads, cost, *args):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
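A minimal sketch of how the two returned functions might be driven, mirroring the f_grad_shared / f_update pattern used in Example #16 below. The toy quadratic cost, the single parameter w, and the numpy_floatX helper (the usual cast-to-floatX utility) are illustrative assumptions, not part of the original module.

from collections import OrderedDict

import numpy
import theano
import theano.tensor as T

def numpy_floatX(data):
    # assumed helper: cast to the configured Theano float type
    return numpy.asarray(data, dtype=theano.config.floatX)

# One shared parameter and a toy cost: minimize ||w - x||^2 over batches x
w = theano.shared(numpy_floatX(numpy.zeros(3)), name='w')
tparams = OrderedDict([('w', w)])
x = T.matrix('x')
cost = ((w - x) ** 2).sum()
grads = T.grad(cost, wrt=list(tparams.values()))
lr = T.scalar(name='lr')

f_grad_shared, f_update = adadelta(lr, tparams, grads, cost, x)

for _ in range(10):
    batch = numpy_floatX(numpy.random.randn(4, 3))
    loss = f_grad_shared(batch)  # compute cost and accumulate grad statistics
    f_update(1.)                 # apply the ADADELTA step (lr is unused here)
print(w.get_value())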
Example #2
    def lstm_layer(self, state_below, dim_proj, mask=None):
        """
        Recurrence with an LSTM hidden unit

        state_below : Is the input. This may be a single sample with
                      multiple timesteps, or a batch
        dim_proj : The dimensionality of the hidden units (projection)
        mask : The mask applied to the input for batching
        """
        # Make sure that we've initialized the tparams
        assert len(self.tparams) > 0
        # State below : steps x samples
        # Recurrence over dim 0
        nsteps = state_below.shape[0]
        # Check if the input is a batch or a single sample
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1

        if mask is None:
            warnings.warn("You seem to be supplying single samples for \
                           recurrence. You may see speedup gains with using \
                           batches instead.")
            # Fall back to an all-ones mask so the scan below still runs
            # (mirrors the default-mask handling in the GRU layer further
            # down in this listing)
            mask = T.alloc(numpy_floatX(1.), nsteps, n_samples)

        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n * dim:(n + 1) * dim]
            return _x[:, n * dim:(n + 1) * dim]

        def _step(m_, x_, h_, c_):
            preact = T.dot(h_, self.tparams[_p(self.prefix, 'U')])
            preact += x_

            i = T.nnet.sigmoid(_slice(preact, 0, dim_proj))
            f = T.nnet.sigmoid(_slice(preact, 1, dim_proj))
            o = T.nnet.sigmoid(_slice(preact, 2, dim_proj))
            c = T.tanh(_slice(preact, 3, dim_proj))
            c = f * c_ + i * c
            c = m_[:, None] * c + (1. - m_)[:, None] * c_

            h = o * T.tanh(c)
            h = m_[:, None] * h + (1. - m_)[:, None] * h_

            return h, c

        state_below = (T.dot(state_below, self.tparams[_p(self.prefix, 'W')]) +
                       self.tparams[_p(self.prefix, 'b')])
        rval, updates = theano.scan(_step,
                                    sequences=[mask, state_below],
                                    outputs_info=[T.alloc(numpy_floatX(0.),
                                                          n_samples,
                                                          dim_proj),
                                                  T.alloc(numpy_floatX(0.),
                                                          n_samples,
                                                          dim_proj)],
                                    name=_p(self.prefix, '_layers'),
                                    n_steps=nsteps)
        return rval[0]
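The gating inside _step can be seen in isolation with a plain-numpy sketch of a single LSTM step (mask handling omitted). The packed 4*d gate layout mirrors the _slice calls above; the random W, U, b below are stand-ins, not the layer's real parameters.

import numpy

def sigmoid(a):
    return 1. / (1. + numpy.exp(-a))

n, d = 2, 5                       # batch size, dim_proj
rng = numpy.random.RandomState(0)
x_t = rng.randn(n, d)             # input at this timestep
h_prev = numpy.zeros((n, d))      # previous hidden state
c_prev = numpy.zeros((n, d))      # previous cell state
W = rng.randn(d, 4 * d)           # input-to-gates weights (i, f, o, c packed)
U = rng.randn(d, 4 * d)           # hidden-to-gates weights
b = numpy.zeros(4 * d)

preact = h_prev.dot(U) + x_t.dot(W) + b
i = sigmoid(preact[:, 0 * d:1 * d])      # input gate
f = sigmoid(preact[:, 1 * d:2 * d])      # forget gate
o = sigmoid(preact[:, 2 * d:3 * d])      # output gate
c_tilde = numpy.tanh(preact[:, 3 * d:4 * d])
c = f * c_prev + i * c_tilde
h = o * numpy.tanh(c)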
Example #3
 def shared_dataset(data_xy, borrow=True):
     """ Load the dataset into shared variables """
     data_x, data_y = data_xy
     assert len(data_x) == len(data_y)
     shared_x = theano.shared(numpy_floatX(data_x), borrow=borrow)
     shared_y = theano.shared(numpy_floatX(data_y), borrow=borrow)
     # Cast the labels as int32, so that they can be used as indices
     return shared_x, T.cast(shared_y, 'int32')
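A minimal usage sketch, assuming numpy_floatX is the usual cast-to-floatX helper and the dataset is a pair of equally long arrays (features, integer labels):

import numpy
import theano

def numpy_floatX(data):
    # assumed helper: cast to the configured Theano float type
    return numpy.asarray(data, dtype=theano.config.floatX)

data_x = numpy.random.rand(100, 20)            # 100 samples, 20 features
data_y = numpy.random.randint(0, 5, size=100)  # integer class labels
train_x, train_y = shared_dataset((data_x, data_y))
# train_x is a float shared variable; train_y is a symbolic int32 cast of a
# shared variable, so it can be used directly as indices in a cost expression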
Example #4
    def build_decode(self):
        # Input to start the recurrence with
        trng = RandomStreams(self.random_seed)
        use_noise = theano.shared(numpy_floatX(0.))
        x = T.matrix('x', dtype='int64')
        # Number of steps we want the recurrence to run for
        n_timesteps = T.iscalar('n_timesteps')
        n_samples = x.shape[1]

        # The mask for the first layer has to be all 1s.
        # It does not make sense to complete a sentence for which
        # the mask is 1 1 0 (because it's already complete).
        mask = T.matrix('mask', dtype=theano.config.floatX)
        # This is a dummy mask, we want to consider all hidden states for
        # the second layer when decoding
        mask_2 = T.alloc(numpy_floatX(1.),
                         n_timesteps,
                         n_samples)
        emb = self.tparams['Wemb'][x.flatten()].reshape([x.shape[0],
                                                         x.shape[1],
                                                         self.dim_proj])

        def output_to_input_transform(output):
            """
            output : The previous hidden state (Nxd)
            """
            # N X V
            pre_soft = T.dot(output, self.tparams['U']) + self.tparams['b']
            pred = T.nnet.softmax(pre_soft)
            # N x 1
            pred_argmax = pred.argmax(axis=1)
            # N x d (flatten is probably redundant)
            new_input = self.tparams['Wemb'][pred_argmax.flatten()].reshape(
                [n_samples, self.dim_proj])
            return new_input

        proj_1 = self.layers['lstm_1'].lstm_layer(
            emb, self.dim_proj, mask=mask, n_steps=n_timesteps,
            output_to_input_func=output_to_input_transform)
        if self.use_dropout:
            proj_1 = dropout_layer(proj_1, use_noise, trng)
        proj = self.layers['lstm_2'].lstm_layer(proj_1, self.dim_proj, mask=mask_2)
        if self.use_dropout:
            proj = dropout_layer(proj, use_noise, trng)

        pre_s = T.dot(proj, self.tparams['U']) + self.tparams['b']
        # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
        # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
        # -1 is a proxy for infer dim based on input (numpy style)
        pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
        # Softmax will receive all-0s for previously padded entries
        # (T*N) x V
        pred_r = T.nnet.softmax(pre_s_r)
        # T x N
        pred = T.reshape(pred_r, pre_s.shape).argmax(axis=2)
        self.f_decode = theano.function([x, mask, n_timesteps], pred, name='f_decode')

        return use_noise, x, mask, n_timesteps
Example #5
 def shared_dataset(data_xy, borrow=True):
     """ Load the dataset into shared variables """
     data_x, data_y = data_xy
     assert len(data_x) == len(data_y)
     shared_x = theano.shared(numpy_floatX(data_x),
                              borrow=borrow)
     shared_y = theano.shared(numpy_floatX(data_y),
                              borrow=borrow)
     # Cast the labels as int32, so that they can be used as indices
     return shared_x, T.cast(shared_y, 'int32')
 def shared_dataset(dataset, borrow=True):
     """ Load the dataset into shared variables """
     shared_bucket = {}
     for b, b_data in dataset.iteritems():
         # Make sure we have the same number of entries
         assert b_data[0].shape[-1] == b_data[1].shape[-1] == \
             b_data[2].shape[-1]
         # Make sure the batch size is correct
         assert b_data[0].shape[1] == b
         shared_x = theano.shared(numpy_floatX(b_data[0]), borrow=borrow)
         shared_y = theano.shared(numpy_floatX(b_data[1]), borrow=borrow)
         shared_m = theano.shared(numpy_floatX(b_data[2]), borrow=borrow)
         shared_bucket[b] = [shared_x, T.cast(shared_y, 'int32'), shared_m]
     return shared_bucket
 def shared_dataset(dataset, borrow=True):
     """ Load the dataset into shared variables """
     shared_bucket = {}
     for b, b_data in dataset.iteritems():
         # Make sure we have the same number of entries
         assert b_data[0].shape[-1] == b_data[1].shape[-1] == \
             b_data[2].shape[-1]
         # Make sure the batch size is correct
         assert b_data[0].shape[1] == b
         shared_x = theano.shared(numpy_floatX(b_data[0]),
                                  borrow=borrow)
         shared_y = theano.shared(numpy_floatX(b_data[1]),
                                  borrow=borrow)
         shared_m = theano.shared(numpy_floatX(b_data[2]),
                                  borrow=borrow)
         shared_bucket[b] = [shared_x, T.cast(shared_y, 'int32'), shared_m]
     return shared_bucket
Example #8
def weight_decay(U, decay_c):
    """
    cost is a Theano expression
    U is a Theano variable
    decay_c is a scalar
    """
    #TODO: Assert the datatypes
    decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
    weight_decay = 0.
    weight_decay += (U ** 2).sum()
    weight_decay *= decay_c
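A minimal sketch of adding the returned penalty to an existing cost, in the same spirit as the cost += L2_reg * model.L2 line in Example #16. The small softmax regression and the numpy_floatX helper are illustrative assumptions.

import numpy
import theano
import theano.tensor as T

def numpy_floatX(data):
    # assumed helper: cast to the configured Theano float type
    return numpy.asarray(data, dtype=theano.config.floatX)

U = theano.shared(numpy_floatX(0.01 * numpy.random.randn(20, 5)), name='U')
x = T.matrix('x')
y = T.ivector('y')
p_y = T.nnet.softmax(T.dot(x, U))
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])
# Add the (already scaled) L2 penalty on U to the objective
cost = cost + weight_decay(U, 1e-4)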
Example #9
def weight_decay(U, decay_c):
    """
    cost is a Theano expression
    U is a Theano variable
    decay_c is a scalar
    """
    #TODO: Assert the datatypes
    decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
    weight_decay = 0.
    weight_decay += (U**2).sum()
    weight_decay *= decay_c
Example #10
def norm_init(n_in, n_out, scale=0.01, ortho=True):
    """
    Initialize weights from a scaled standard normal distribution
    Falls back to orthogonal weights if n_in = n_out

    n_in : The input dimension
    n_out : The output dimension
    scale : Scale for the normal distribution
    ortho : Fall back to ortho weights when n_in = n_out
    """
    if n_in == n_out and ortho:
        return ortho_weight(n_in)
    else:
        return numpy_floatX(scale * numpy.random.randn(n_in, n_out))
    def pred_error(self, data, iterator, verbose=False):
        """
        Errors for samples for a trained model
        """
        valid_err = 0
        for _, valid_index in iterator:
            x, mask, y = pad_and_mask([data[0][t] for t in valid_index],
                                      numpy.array(data[1])[valid_index],
                                      maxlen=None)
            preds = self.f_pred(x, mask)
            targets = numpy.array(data[1])[valid_index]
            valid_err += (preds == targets).sum()
        valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

        return valid_err
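A minimal sketch of using norm_init above to create recurrent and input weight matrices, assuming ortho_weight (SVD-based orthogonal init) and numpy_floatX are the usual helpers available alongside it:

import theano

# Square case falls back to an orthogonal matrix; the rectangular case is a
# scaled draw from a standard normal distribution
U = theano.shared(norm_init(512, 512), name='lstm_U')
W = theano.shared(norm_init(300, 512, scale=0.01), name='lstm_W')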
Example #12
    def create_unigram_noise_dist(self, wordcount):
        """
        Creates a Unigram noise distribution for NCE

        :type wordcount: dict
        :param wordcount: A dictionary containing frequency counts for words
        """
        counts = numpy.sort(wordcount.values())[::-1]
        # Don't count the UNK and PAD symbols in the second count
        freq = [0, sum(counts[self.n_words:])] \
            + list(counts[:(self.n_words-2)])
        assert len(freq) == self.n_words
        sum_freq = sum(freq)
        noise_distribution = [float(k) / sum_freq for k in freq]
        self.noise_distribution = init_tparams(
            OrderedDict([('noise_d', numpy_floatX(noise_distribution)
                          .reshape(self.n_words,))])
        )['noise_d']
Example #13
def xavier_init(rng, n_in, n_out, activation, size=None):
    """
    Returns a matrix (n_in X n_out) based on the
    Xavier initialization technique
    """

    if activation not in [T.tanh, T.nnet.sigmoid, T.nnet.relu]:
        warnings.warn("You are using the Xavier init with an \
                       activation function that is not sigmoidal or relu")
    # Default value for size
    if size is None:
        size = (n_in, n_out)
    W_values = numpy_floatX(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=size,
        ))
    if activation == T.nnet.sigmoid:
        return W_values * 4
    if activation == T.nnet.relu:
        return W_values * numpy.sqrt(2.)
    return W_values
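A minimal sketch of initializing a dense layer with xavier_init, assuming numpy_floatX is the usual cast-to-floatX helper; the layer sizes are arbitrary:

import numpy
import theano
import theano.tensor as T

def numpy_floatX(data):
    # assumed helper: cast to the configured Theano float type
    return numpy.asarray(data, dtype=theano.config.floatX)

rng = numpy.random.RandomState(1234)
W = theano.shared(xavier_init(rng, 128, 256, T.tanh), name='W')
b = theano.shared(numpy_floatX(numpy.zeros(256)), name='b')
x = T.matrix('x')
h = T.tanh(T.dot(x, W) + b)  # a tanh hidden layer built on the init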
Example #14
def xavier_init(rng, n_in, n_out, activation, size=None):
    """
    Returns a matrix (n_in X n_out) based on the
    Xavier initialization technique
    """

    if activation not in [T.tanh, T.nnet.sigmoid, T.nnet.relu]:
        warnings.warn("You are using the Xavier init with an \
                       activation function that is not sigmoidal or relu")
    # Default value for size
    if size is None:
        size = (n_in, n_out)
    W_values = numpy_floatX(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=size,
        ))
    if activation == T.nnet.sigmoid:
        return W_values * 4
    if activation == T.nnet.relu:
        return W_values * numpy.sqrt(2.)
    return W_values
    def build_model(self, encoder='lstm', use_dropout=True):
        use_noise = theano.shared(numpy_floatX(0.))
        x = T.matrix('x', dtype='int64')
        mask = T.matrix('mask', dtype=theano.config.floatX)
        y = T.vector('y', dtype='int64')

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                         n_samples,
                                                         self.dim_proj])
        proj = self.layers['lstm'].lstm_layer(emb, self.dim_proj, mask=mask)
        # TODO: What happens when the encoder is not an LSTM
        # This should cleanly fall back to a normal hidden unit
        if encoder == 'lstm':
            # Mean-pool the hidden states over time, using the mask so that
            # padded positions do not contribute to the average
            proj = (proj * mask[:, :, None]).sum(axis=0)
            proj = proj / mask.sum(axis=0)[:, None]
        if use_dropout:
            trng = RandomStreams(self.random_seed)
            proj = dropout_layer(proj, use_noise, trng)

        pred = T.nnet.softmax(T.dot(proj, self.tparams['U'])
                              + self.tparams['b'])

        self.f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
        self.f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

        off = 1e-8
        if pred.dtype == 'float16':
            off = 1e-6

        cost = -T.log(pred[T.arange(n_samples), y] + off).mean()

        return use_noise, x, mask, y, cost
Example #16
def sgd_optimization_nplm_mlp(learning_rate=1., L1_reg=0.0, L2_reg=0.0001,
                              n_epochs=1000, dataset='../../data/settimes',
                              batch_size=1000, n_in=150, n_h1=750, n_h2=150,
                              context_size=4, use_nce=False, nce_k=100,
                              use_dropout=False, dropout_p=0.5):
    SEED = 1234

    st_data = SeTimes(dataset, emb_dim=n_in)
    print("... Creating the partitions")
    train, valid = st_data.load_data(context_size=context_size)
    print("... Done creating partitions")

    print("... Building the model")
    # Symbolic variables for input and output for a batch
    x = T.imatrix('x')
    y = T.ivector('y')
    y_flat = T.ivector('y_flat')
    lr = T.scalar(name='lr')
    k = T.scalar(name='k')

    emb_x = st_data.dictionary.tparams['Wemb'][x.flatten()] \
        .reshape([x.shape[0], context_size * n_in])

    rng = numpy.random.RandomState(SEED)
    trng = RandomStreams(SEED)
    use_noise = theano.shared(numpy_floatX(0.))

    nce_q = st_data.dictionary.noise_distribution
    nce_samples = T.matrix('noise_s')

    model = NPLM(
        rng=rng,
        input=emb_x,
        n_in=context_size * n_in,
        n_h1=n_h1,
        n_h2=n_h2,
        n_out=st_data.dictionary.num_words(),
        use_nce=use_nce
    )

    tparams = OrderedDict()
    for i, nplm_m in enumerate(model.params):
        tparams['nplm_' + str(i)] = nplm_m
    tparams['Wemb'] = st_data.dictionary.Wemb

    # Cost to minimize
    if use_nce:
        #cost = model.loss(y, nce_samples, nce_q)
        cost = model.loss(y, y_flat, nce_samples, nce_q, k)
    else:
        # MLE via softmax
        cost = model.loss(y)
    # Add L2 reg to the cost
    cost += L2_reg * model.L2

    grads = T.grad(cost, wrt=list(tparams.values()))

    if use_nce:
        f_cost = theano.function([x, y, y_flat, nce_samples, k],
                                 cost, name='f_cost')
        f_grad_shared, f_update = sgd(lr, tparams, grads,
                                      cost, x, y, y_flat, nce_samples, k)
    else:
        f_cost = theano.function([x, y], cost, name='f_cost')
        f_grad_shared, f_update = gd(lr, tparams, grads,
                                     cost, x, y)

    print("... Optimization")
    kf_valid = get_minibatches_idx(len(valid[0]), batch_size)
    print("%d training examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))

    disp_freq = 10
    valid_freq = len(train[0]) // batch_size
    save_freq = len(train[0]) // batch_size

    uidx = 0
    estop = False
    start_time = time.time()
    total_output_words = st_data.dictionary.num_words()
    for eidx in range(n_epochs):
        n_samples = 0
        # Shuffle the training data and get the minibatch indices
        kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
        for _, train_index in kf:
            uidx += 1
            use_noise.set_value(1.)

            x_batch = [train[0][t] for t in train_index]
            y_batch = [train[1][t] for t in train_index]
            y_f_batch = [train[1][t] + i * st_data.dictionary.num_words()
                         for i, t in enumerate(train_index)]
            # Convert x and y into numpy objects
            x_batch = numpy.asarray(x_batch, dtype='int32')
            y_batch = numpy.asarray(y_batch, dtype='int32')
            y_f_batch = numpy.asarray(y_f_batch, dtype='int32')

            local_batch_size = x_batch.shape[0]
            if use_nce:
                # Create noise samples to be passed as well
                # Expected size is (bs, k)
                # Don't sample UNK and PAD
                noisy_samples = numpy.zeros((local_batch_size,
                                             st_data.dictionary.num_words()),
                                            dtype='float32')
                # The following marks nce_k entries per row (repeats
                # permitted) with the value 1. These represent the noise
                # samples drawn from the vocabulary
                noisy_samples[
                    numpy.arange(local_batch_size).reshape(local_batch_size,
                                                           1),
                    numpy.random.randint(2, total_output_words,
                                         size=(local_batch_size, nce_k))
                ] = 1.
                loss = f_grad_shared(x_batch, y_batch, y_f_batch,
                                     noisy_samples, nce_k)
            else:
                loss = f_grad_shared(x_batch, y_batch)
            f_update(learning_rate)

            if numpy.isnan(loss) or numpy.isinf(loss):
                print('bad cost detected: ', loss)
                return 1., 1.

            if numpy.mod(uidx, disp_freq) == 0:
                print('Epoch', eidx, 'Update', uidx, 'Cost', loss)


    end_time = time.time()
    print('Training took %.1fs' % (end_time - start_time))
    f_grad_shared.profile.print_summary()
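The fancy-indexing trick used above to build the NCE noise-sample matrix can be seen in isolation with a small numpy-only sketch (toy sizes, not tied to this dataset):

import numpy

batch_size, vocab_size, k = 3, 10, 4
noisy_samples = numpy.zeros((batch_size, vocab_size), dtype='float32')
# For every row, pick k random word ids in [2, vocab_size) (skipping the
# UNK and PAD ids) and set those positions to 1. Repeats are permitted,
# so a row may end up with fewer than k ones.
noisy_samples[
    numpy.arange(batch_size).reshape(batch_size, 1),
    numpy.random.randint(2, vocab_size, size=(batch_size, k))
] = 1.
print(noisy_samples)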
Example #17
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_h1,
                 n_h2,
                 n_out,
                 use_dropout=False,
                 trng=None,
                 dropout_p=0.5,
                 use_noise=theano.shared(numpy_floatX(0.)),
                 use_nce=False):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_h1: int
        :param n_h1: number of units in the first hidden layer

        :type n_h2: int
        :param n_h2: number of units in the second hidden layer

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # The first hidden layer
        # The input is the concatenated word embeddings for all
        # words in the context input and the batch
        self.h1 = DenseLayer(rng=rng,
                             input=input,
                             n_in=n_in,
                             n_out=n_h1,
                             activation=T.nnet.relu)

        # Use dropout if specified
        h2_input = self.h1.output
        if (use_dropout):
            assert trng is not None
            h2_input = dropout_layer(self.h1.output, use_noise, trng,
                                     dropout_p)

        # The second hidden layer
        self.h2 = DenseLayer(rng=rng,
                             input=h2_input,
                             n_in=n_h1,
                             n_out=n_h2,
                             activation=T.nnet.relu)

        # Apply dropout if specified
        log_reg_input = self.h2.output
        if (use_dropout):
            log_reg_input = dropout_layer(self.h2.output, use_noise, trng,
                                          dropout_p)

        # The logistic regression layer
        self.log_regression_layer = LogisticRegression(input=log_reg_input,
                                                       n_in=n_h2,
                                                       n_out=n_out)

        # Use L2 regularization, for the log-regression layer only
        self.L2 = reg.L2([self.log_regression_layer.W])
        # Get the NLL loss function from the logistic regression layer
        if use_nce:
            self.loss = self.log_regression_layer.nce_loss
        else:
            self.loss = self.log_regression_layer.loss

        # Bundle params (to be used for computing gradients)
        self.params = self.h1.params + self.h2.params + \
            self.log_regression_layer.params

        # Keep track of the input (for debugging only)
        self.input = input
Example #18
    def build_model(self):
        trng = RandomStreams(self.random_seed)
        use_noise = theano.shared(numpy_floatX(0.))
        # Simply encode this
        x = T.matrix('x', dtype='int64')
        y = T.matrix('y', dtype='int64')
        y_prime = T.roll(y, -1, 0)
        # Since we are simply predicting the next word, the statement
        # above shifts the content of y by 1 in the time dimension
        # (axis 0, assuming TxN)
        mask_x = T.matrix('mask_x', dtype=theano.config.floatX)
        mask_y = T.matrix('mask_y', dtype=theano.config.floatX)

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # Convert word indices to their embeddings
        # Resulting dims are (T x N x dim_proj)
        emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                         n_samples,
                                                         self.dim_proj])
        # Compute the hidden states
        # Note that these contain hidden states for elements which were
        # padded in input. The cost for these time steps are removed
        # before the calculation of the cost.
        enc_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask_x)
        # Use dropout on non-recurrent connections (Zaremba et al.)
        if self.use_dropout:
            enc_proj_1 = dropout_layer(enc_proj_1, use_noise, trng)
        enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1, self.dim_proj, mask=mask_x)
        if self.use_dropout:
            enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

        # Use the final state of the encoder as the initial hidden state of the decoder
        src_embedding = enc_proj_2[-1]
        # Run decoder LSTM (as written, this reuses the encoder layers and
        # the source-side embeddings and mask)
        dec_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask_x)
        # Use dropout on non-recurrent connections (Zaremba et al.)
        if self.use_dropout:
            dec_proj_1 = dropout_layer(dec_proj_1, use_noise, trng)
        dec_proj_2 = self.layers['enc_lstm_2'].lstm_layer(dec_proj_1, self.dim_proj, mask=mask_x)
        if self.use_dropout:
            dec_proj_2 = dropout_layer(dec_proj_2, use_noise, trng)

        pre_s = T.dot(dec_proj_2, self.tparams['U']) + self.tparams['b']
        # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
        # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
        # -1 is a proxy for infer dim based on input (numpy style)
        pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
        pred_r = T.nnet.softmax(pre_s_r)

        off = 1e-8
        if pred_r.dtype == 'float16':
            off = 1e-6

        # Note the use of flatten here. We can't directly index a 3-tensor
        # and hence we use the (T*N)xV view which is indexed by the flattened
        # label matrix, dim = (T*N)x1
        # Also, the cost (before calculating the mean) is multiplied (element-wise)
        # with the mask to eliminate the cost of elements that do not really exist.
        # i.e. Do not include the cost for elements which are padded
        cost = -T.sum(T.log(pred_r[T.arange(pred_r.shape[0]), y.flatten()] + off) * mask_y.flatten()) / T.sum(mask_y)

        self.f_cost = theano.function([x, y, mask_x, mask_y], cost, name='f_cost')

        return use_noise, x, y, mask_x, mask_y, cost
Example #19
def rmsprop(lr, tparams, grads, cost, *args):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.items()
    ]
    running_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad' % k)
        for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input,
                                    cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir' % k)
        for k, p in tparams.items()
    ]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg**2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
Example #20
def adadelta(lr, tparams, grads, cost, *args):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.items()
    ]
    running_up2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
        for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input,
                                    cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [
        -T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #21
    def __init__(self, rng, input, n_in, n_h1, n_h2, n_out,
                 use_dropout=False, trng=None, dropout_p=0.5,
                 use_noise=theano.shared(numpy_floatX(0.)),
                 use_nce=False):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_h1: int
        :param n_h1: number of units in the first hidden layer

        :type n_h2: int
        :param n_h2: number of units in the second hidden layer

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # The first hidden layer
        # The input is the concatenated word embeddings for all
        # words in the context input and the batch
        self.h1 = DenseLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_h1,
            activation=T.nnet.relu
        )

        # Use dropout if specified
        h2_input = self.h1.output
        if (use_dropout):
            assert trng is not None
            h2_input = dropout_layer(self.h1.output, use_noise,
                                     trng, dropout_p)

        # The second hidden layer
        self.h2 = DenseLayer(
            rng=rng,
            input=h2_input,
            n_in=n_h1,
            n_out=n_h2,
            activation=T.nnet.relu
        )

        # Apply dropout if specified
        log_reg_input = self.h2.output
        if (use_dropout):
            log_reg_input = dropout_layer(self.h2.output, use_noise,
                                          trng, dropout_p)

        # The logistic regression layer
        self.log_regression_layer = LogisticRegression(
            input=log_reg_input,
            n_in=n_h2,
            n_out=n_out
        )

        # Use L2 regularization, for the log-regression layer only
        self.L2 = reg.L2([self.log_regression_layer.W])
        # Get the NLL loss function from the logistic regression layer
        if use_nce:
            self.loss = self.log_regression_layer.nce_loss
        else:
            self.loss = self.log_regression_layer.loss

        # Bundle params (to be used for computing gradients)
        self.params = self.h1.params + self.h2.params + \
            self.log_regression_layer.params

        # Keep track of the input (for debugging only)
        self.input = input
Example #22
    def build_model(self):
        trng = RandomStreams(self.random_seed)
        use_noise = theano.shared(numpy_floatX(0.))
        x = T.matrix('x', dtype='int64')
        # Since we are simply predicting the next word, the
        # following statement shifts the content of the x by 1
        # in the time dimension for prediction (axis 0, assuming TxN)
        y = T.roll(x, -1, 0)
        mask = T.matrix('mask', dtype=theano.config.floatX)

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # Convert word indices to their embeddings
        # Resulting dims are (T x N x dim_proj)
        emb = self.tparams['Wemb'][x.flatten()].reshape(
            [n_timesteps, n_samples, self.dim_proj])
        # Dropout input if necessary
        if self.use_dropout:
            emb = dropout_layer(emb, use_noise, trng)

        # Compute the hidden states
        # Note that these contain hidden states for elements which were
        # padded in input. The cost for these time steps are removed
        # before the calculation of the cost.
        proj_1 = self.layers['lstm_1'].lstm_layer(
            emb, mask=mask, restore_final_to_initial_hidden=True)
        # Use dropout on non-recurrent connections (Zaremba et al.)
        if self.use_dropout:
            proj_1 = dropout_layer(proj_1, use_noise, trng)
        proj = self.layers['lstm_2'].lstm_layer(
            proj_1, mask=mask, restore_final_to_initial_hidden=True)
        if self.use_dropout:
            proj = dropout_layer(proj, use_noise, trng)

        pre_s_lstm = self.layers['logit_lstm'].logit_layer(proj)
        pre_s_input = self.layers['logit_prev_word'].logit_layer(emb)
        pre_s = self.layers['logit'].logit_layer(
            T.tanh(pre_s_lstm + pre_s_input))
        # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
        # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
        # -1 is a proxy for infer dim based on input (numpy style)
        pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
        pred_r = T.nnet.softmax(pre_s_r)

        off = 1e-8
        if pred_r.dtype == 'float16':
            off = 1e-6

        # Note the use of flatten here. We can't directly index a 3-tensor
        # and hence we use the (T*N)xV view which is indexed by the flattened
        # label matrix, dim = (T*N)x1
        # Also, the cost (before calculating the mean) is multiplied (element-wise)
        # with the mask to eliminate the cost of elements that do not really exist.
        # i.e. Do not include the cost for elements which are padded
        cost = -T.sum(
            T.log(pred_r[T.arange(pred_r.shape[0]),
                         y.flatten()] + off) * mask.flatten()) / T.sum(mask)

        self.f_cost = theano.function([x, mask], cost, name='f_cost')

        return use_noise, x, mask, cost
Example #23
    def build_decode(self):
        # Input to start the recurrence with
        trng = RandomStreams(self.random_seed)
        use_noise = theano.shared(numpy_floatX(0.))
        x = T.matrix('x', dtype='int64')
        # Number of steps we want the recurrence to run for
        n_timesteps = T.iscalar('n_timesteps')
        n_samples = x.shape[1]

        # The mask for the first layer has to be all 1s.
        # It does not make sense to complete a sentence for which
        # The mask is 1 1 0 (because it's already complete).
        mask = T.matrix('mask', dtype=theano.config.floatX)
        # This is a dummy mask, we want to consider all hidden states for
        # the second layer when decoding
        mask_2 = T.alloc(numpy_floatX(1.), n_timesteps, n_samples)
        emb = self.tparams['Wemb'][x.flatten()].reshape(
            [x.shape[0], x.shape[1], self.dim_proj])

        def output_to_input_transform(output, emb):
            """
            output : The previous hidden state (Nxd)
            """
            # N X V
            pre_soft_lstm = self.layers['logit_lstm'].logit_layer(output)
            pre_soft_input = self.layers['logit_prev_word'].logit_layer(emb)
            pre_soft = self.layers['logit'].logit_layer(
                T.tanh(pre_soft_lstm + pre_soft_input))
            pred = T.nnet.softmax(pre_soft)
            # N x 1
            pred_argmax = pred.argmax(axis=1)
            # N x d (flatten is probably redundant)
            new_input = self.tparams['Wemb'][pred_argmax.flatten()].reshape(
                [n_samples, self.dim_proj])
            return new_input

        proj_1 = self.layers['lstm_1'].lstm_layer(
            emb,
            self.dim_proj,
            mask=mask,
            n_steps=n_timesteps,
            output_to_input_func=output_to_input_transform)
        if self.use_dropout:
            proj_1 = dropout_layer(proj_1, use_noise, trng)
        proj = self.layers['lstm_2'].lstm_layer(proj_1,
                                                self.dim_proj,
                                                mask=mask_2)
        if self.use_dropout:
            proj = dropout_layer(proj, use_noise, trng)

        pre_s_lstm = self.layers['logit_lstm'].logit_layer(proj)
        pre_s_input = self.layers['logit_prev_word'].logit_layer(emb)
        pre_s = self.layers['logit'].logit_layer(
            T.tanh(pre_s_lstm + pre_s_input))
        # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
        # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
        # -1 is a proxy for infer dim based on input (numpy style)
        pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
        # Softmax will receive all-0s for previously padded entries
        # (T*N) x V
        pred_r = T.nnet.softmax(pre_s_r)
        # T x N
        pred = (T.reshape(pred_r, pre_s.shape)[:, :, 2:]).argmax(axis=2) + 2
        self.f_decode = theano.function([x, mask, n_timesteps],
                                        pred,
                                        name='f_decode')

        return use_noise, x, mask, n_timesteps
Example #24
    def gru_layer(self,
                  state_below,
                  mask=None,
                  n_steps=None,
                  output_to_input_func=None,
                  restore_final_to_initial_hidden=False):
        """
        Recurrence with a GRU hidden unit

        state_below : Is the input. This may be a single sample with
                      multiple timesteps, or a batch
        mask : The mask applied to the input for batching
        n_steps : The number of steps for which this recurrence should be run
                  This is only required with partial input. For any step
                  where no input is available, the output_to_input_func is
                  applied to the previous output and is then used as input
        output_to_input_func : The function to be applied to generate input
                               when partial input is available
        restore_final_to_initial_hidden : Use the final hidden state as the initial
                                  hidden state for the next batch
                                  WARNING : Assumes that batches are of the
                                  same size since the size of the initial
                                  state is fixed to Nxd
                                  TODO: Possibly think about averaging
                                  final states to make this number of sample
                                  independent
        """
        # Make sure that we've initialized the tparams
        assert len(self.tparams) > 0
        # State below : steps x samples x dim_proj
        # If n_steps is not provided, infer it
        if n_steps is None:
            nsteps = state_below.shape[0]
        else:
            # If n_steps is provided, this is the incomplete input setting
            # Make sure that a function is provided to transform output
            # from previous time step to input
            # TODO: This output function may require input from several time
            # steps instead of just the previous one. Make this modification
            nsteps = n_steps
            if output_to_input_func is None:
                raise Exception('n_steps was given to the GRU but no output \
                                 to input function was specified')

        # Hack to make sure that the theano ifelse compiles
        if output_to_input_func is None:
            output_to_input_func = dummy_func

        # Check if the input is a batch or a single sample
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1
            warnings.warn("You seem to be supplying single samples for \
                           recurrence. You may see speedup gains with using \
                           batches instead.")

        # Initialize mask if not specified
        if mask is None:
            if state_below.ndim == 3:
                mask = T.alloc(numpy_floatX(1.), nsteps, n_samples)
            else:
                mask = T.alloc(numpy_floatX(1.), n_samples)

        # Initialize initial hidden state if not specified
        # Restore final hidden state to new initial hidden state
        if restore_final_to_initial_hidden and self.h_final is not None:
            h0 = self.h_final
        else:
            h0 = T.alloc(numpy_floatX(0.), n_samples, self.dim_proj)

        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n * dim:(n + 1) * dim]
            return _x[:, n * dim:(n + 1) * dim]

        # TODO: Make the same default-mask initialization change to the
        # LSTM module
        def _step(t_, h_, mask, state_below, state_below_h_c):
            """
            m_ is the mask for this timestep (N x 1)
            x_ is the input for this time step (pre-multiplied with the
              weight matrices). ie.
              x_ = (X.W + b)[t]
            h_ is the previous hidden state
            c_ is the previous LSTM context
            """
            preact = T.dot(h_, self.tparams[_p(self.prefix, 'U')])
            x_ = ifelse(
                T.lt(t_, state_below.shape[0]), state_below[t_],
                T.dot(output_to_input_func(h_), self.tparams[_p(
                    self.prefix, 'W')]) + self.tparams[_p(self.prefix, 'b')])
            preact += x_

            # The input to the sigmoid is preact[:, :, 0:d]
            # Similar slices are used for the rest of the gates
            r = T.nnet.sigmoid(_slice(preact, 0, self.dim_proj))
            z = T.nnet.sigmoid(_slice(preact, 1, self.dim_proj))

            # The proposal hidden state
            preact_h = T.dot(h_, self.tparams[_p(self.prefix, 'U_h')])
            preact_h = preact_h * r

            h_c_ = ifelse(
                T.lt(t_, state_below_h_c.shape[0]), state_below_h_c[t_],
                T.dot(output_to_input_func(h_), self.tparams[_p(
                    self.prefix, 'W_h')]) +
                self.tparams[_p(self.prefix, 'b_h')])
            # TODO : xx_
            preact_h = preact_h + h_c_
            h = T.tanh(preact_h)

            h = z * h_ + (1 - z) * h
            # None adds a dimension to the mask (N,) -> (N, 1)
            # Where the mask value is 1, use the value of the current
            # context, otherwise use the one from the previous
            # context when the mask value is 0
            # This will ensure that values generated for absent
            # elements marked with <PAD> will not be used
            # Similarly, Where the mask value is 1, use the value of the current
            # hidden state, otherwise use the one from the previous
            # state when the mask value is 0
            h = ifelse(T.lt(t_, state_below.shape[0]),
                       mask[t_][:, None] * h + (1. - mask[t_])[:, None] * h_,
                       h)

            return h

        # Transformation to calculate the candidate hidden state
        # (applied to the raw input, before it is overwritten below)
        state_below_h_c = (
            T.dot(state_below, self.tparams[_p(self.prefix, 'W_h')]) +
            self.tparams[_p(self.prefix, 'b_h')])
        state_below = (T.dot(state_below, self.tparams[_p(self.prefix, 'W')]) +
                       self.tparams[_p(self.prefix, 'b')])
        rval, updates = theano.scan(
            _step,
            sequences=[T.arange(nsteps)],
            outputs_info=[h0],
            non_sequences=[mask, state_below, state_below_h_c],
            name=_p(self.prefix, '_layers'),
            n_steps=nsteps)
        # Save the final state to be used as the next initial hidden state
        if restore_final_to_initial_hidden:
            self.h_final = rval[0][-1]

        # Returns a list of the hidden states (t elements of N x dim_proj)
        return rval[0]
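A single GRU step in plain numpy, mirroring the gating in _step above (z gates the previous state, 1 - z the candidate); the random W, U, W_h, U_h below are stand-ins for the layer's parameters, and mask/partial-input handling is omitted:

import numpy

def sigmoid(a):
    return 1. / (1. + numpy.exp(-a))

n, d = 2, 5                       # batch size, dim_proj
rng = numpy.random.RandomState(0)
x_t = rng.randn(n, d)             # input at this timestep
h_prev = numpy.zeros((n, d))      # previous hidden state
W = rng.randn(d, 2 * d)           # input-to-gates weights (r, z packed)
U = rng.randn(d, 2 * d)           # hidden-to-gates weights
b = numpy.zeros(2 * d)
W_h = rng.randn(d, d)             # input-to-candidate weights
U_h = rng.randn(d, d)             # hidden-to-candidate weights
b_h = numpy.zeros(d)

preact = h_prev.dot(U) + x_t.dot(W) + b
r = sigmoid(preact[:, 0:d])       # reset gate
z = sigmoid(preact[:, d:2 * d])   # update gate
h_tilde = numpy.tanh(h_prev.dot(U_h) * r + x_t.dot(W_h) + b_h)
h = z * h_prev + (1. - z) * h_tilde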
Example #25
def rmsprop(lr, tparams, grads, cost, *args):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update