def adadelta(lr, tparams, grads, cost, *args):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    *args : Theano variables
        Model inputs (e.g. x, mask, y) needed to compute the cost
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
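# Illustrative sketch (not part of the original module): wiring adadelta into
# a training step. `tparams` is assumed to be an OrderedDict of Theano shared
# variables, and `x`, `mask`, `y`, `cost` to come from a build_model-style
# function as elsewhere in this code; the batch arrays are hypothetical.
grads = T.grad(cost, wrt=list(tparams.values()))
lr = T.scalar(name='lr')
f_grad_shared, f_update = adadelta(lr, tparams, grads, cost, x, mask, y)

# One update: compute the cost and accumulate gradients, then apply the step.
loss = f_grad_shared(x_batch, mask_batch, y_batch)
f_update(0.01)  # adadelta ignores the learning rate (on_unused_input='ignore')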
def lstm_layer(self, state_below, dim_proj, mask=None):
    """
    Recurrence with an LSTM hidden unit

    state_below : Is the input. This may be a single sample with
                  multiple timesteps, or a batch
    dim_proj : The dimensionality of the hidden units (projection)
    mask : The mask applied to the input for batching
    """
    # Make sure that we've initialized the tparams
    assert len(self.tparams) > 0

    # State below : steps x samples
    # Recurrence over dim 0
    nsteps = state_below.shape[0]

    # Check if the input is a batch or a single sample
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    if mask is None:
        warnings.warn("You seem to be supplying single samples for "
                      "recurrence. You may see speedup gains with using "
                      "batches instead.")
        # Default to an all-ones mask so the scan below still has a
        # mask sequence to iterate over
        mask = T.alloc(numpy_floatX(1.), nsteps, n_samples)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = T.dot(h_, self.tparams[_p(self.prefix, 'U')])
        preact += x_

        i = T.nnet.sigmoid(_slice(preact, 0, dim_proj))
        f = T.nnet.sigmoid(_slice(preact, 1, dim_proj))
        o = T.nnet.sigmoid(_slice(preact, 2, dim_proj))
        c = T.tanh(_slice(preact, 3, dim_proj))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (T.dot(state_below, self.tparams[_p(self.prefix, 'W')]) +
                   self.tparams[_p(self.prefix, 'b')])

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj),
                                              T.alloc(numpy_floatX(0.),
                                                      n_samples,
                                                      dim_proj)],
                                name=_p(self.prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
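# Illustrative sketch (not part of the original module): the pre-activations
# used by _step are stored as one matrix of width 4 * dim_proj, and _slice
# picks out each gate's block in the order i, f, o, c. A small numpy
# illustration with hypothetical shapes, assuming the module's usual imports:
dim_proj_demo = 4
preact_demo = numpy.arange(2 * 4 * dim_proj_demo).reshape(2, 4 * dim_proj_demo)
i_block = preact_demo[:, 0 * dim_proj_demo:1 * dim_proj_demo]  # input gate
f_block = preact_demo[:, 1 * dim_proj_demo:2 * dim_proj_demo]  # forget gate
o_block = preact_demo[:, 2 * dim_proj_demo:3 * dim_proj_demo]  # output gate
c_block = preact_demo[:, 3 * dim_proj_demo:4 * dim_proj_demo]  # candidate cell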
def shared_dataset(data_xy, borrow=True):
    """
    Load the dataset into shared variables
    """
    data_x, data_y = data_xy
    assert len(data_x) == len(data_y)
    shared_x = theano.shared(numpy_floatX(data_x), borrow=borrow)
    shared_y = theano.shared(numpy_floatX(data_y), borrow=borrow)
    # Cast the labels as int32, so that they can be used as indices
    return shared_x, T.cast(shared_y, 'int32')
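# Illustrative sketch (not part of the original module): once the data lives
# in shared variables, minibatches can be selected on-device with `givens`.
# `x`, `y`, `cost`, `updates` and `train_xy` are assumed to come from the
# surrounding model-building code; the batch size is hypothetical.
index = T.lscalar('index')
batch_size = 64
train_x, train_y = shared_dataset(train_xy)
train_fn = theano.function(
    [index], cost, updates=updates,
    givens={
        x: train_x[index * batch_size:(index + 1) * batch_size],
        y: train_y[index * batch_size:(index + 1) * batch_size],
    })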
def build_decode(self):
    # Input to start the recurrence with
    trng = RandomStreams(self.random_seed)
    use_noise = theano.shared(numpy_floatX(0.))
    x = T.matrix('x', dtype='int64')
    # Number of steps we want the recurrence to run for
    n_timesteps = T.iscalar('n_timesteps')
    n_samples = x.shape[1]

    # The mask for the first layer has to be all 1s.
    # It does not make sense to complete a sentence for which
    # the mask is 1 1 0 (because it's already complete).
    mask = T.matrix('mask', dtype=theano.config.floatX)
    # This is a dummy mask, we want to consider all hidden states for
    # the second layer when decoding
    mask_2 = T.alloc(numpy_floatX(1.), n_timesteps, n_samples)

    emb = self.tparams['Wemb'][x.flatten()].reshape([x.shape[0],
                                                     x.shape[1],
                                                     self.dim_proj])

    def output_to_input_transform(output):
        """
        output : The previous hidden state (Nxd)
        """
        # N x V
        pre_soft = T.dot(output, self.tparams['U']) + self.tparams['b']
        pred = T.nnet.softmax(pre_soft)
        # N x 1
        pred_argmax = pred.argmax(axis=1)
        # N x d (flatten is probably redundant)
        new_input = self.tparams['Wemb'][pred_argmax.flatten()].reshape(
            [n_samples, self.dim_proj])
        return new_input

    proj_1 = self.layers['lstm_1'].lstm_layer(
        emb, self.dim_proj, mask=mask, n_steps=n_timesteps,
        output_to_input_func=output_to_input_transform)
    if self.use_dropout:
        proj_1 = dropout_layer(proj_1, use_noise, trng)

    proj = self.layers['lstm_2'].lstm_layer(proj_1, self.dim_proj,
                                            mask=mask_2)
    if self.use_dropout:
        proj = dropout_layer(proj, use_noise, trng)

    pre_s = T.dot(proj, self.tparams['U']) + self.tparams['b']
    # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
    # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
    # -1 is a proxy for infer dim based on input (numpy style)
    pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
    # Softmax will receive all-0s for previously padded entries
    # (T*N) x V
    pred_r = T.nnet.softmax(pre_s_r)
    # T x N
    pred = T.reshape(pred_r, pre_s.shape).argmax(axis=2)

    self.f_decode = theano.function([x, mask, n_timesteps], pred,
                                    name='f_decode')

    return use_noise, x, mask, n_timesteps
def shared_dataset(dataset, borrow=True):
    """
    Load the dataset into shared variables
    """
    shared_bucket = {}
    for b, b_data in dataset.items():
        # Make sure we have the same number of entries
        assert b_data[0].shape[-1] == b_data[1].shape[-1] == \
            b_data[2].shape[-1]
        # Make sure the batch size is correct
        assert b_data[0].shape[1] == b
        shared_x = theano.shared(numpy_floatX(b_data[0]), borrow=borrow)
        shared_y = theano.shared(numpy_floatX(b_data[1]), borrow=borrow)
        shared_m = theano.shared(numpy_floatX(b_data[2]), borrow=borrow)
        # Cast the labels as int32, so that they can be used as indices
        shared_bucket[b] = [shared_x, T.cast(shared_y, 'int32'), shared_m]
    return shared_bucket
def weight_decay(U, decay_c):
    """
    Returns an L2 weight-decay term to be added to the cost

    U is a Theano variable (the weights to penalize)
    decay_c is a scalar
    """
    # TODO: Assert the datatypes
    decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
    weight_decay = 0.
    weight_decay += (U ** 2).sum()
    weight_decay *= decay_c
    return weight_decay
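# Illustrative sketch (not part of the original module): adding the penalty
# returned above to the training cost. `cost` and `tparams['U']` are assumed
# to come from a build_model-style function; the strength is hypothetical.
decay_c = 1e-4
if decay_c > 0.:
    cost += weight_decay(tparams['U'], decay_c)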
def norm_init(n_in, n_out, scale=0.01, ortho=True):
    """
    Initialize weights from a scaled standard normal distribution
    Falls back to orthogonal weights if n_in = n_out

    n_in : The input dimension
    n_out : The output dimension
    scale : Scale for the normal distribution
    ortho : Fall back to ortho weights when n_in = n_out
    """
    if n_in == n_out and ortho:
        return ortho_weight(n_in)
    else:
        return numpy_floatX(scale * numpy.random.randn(n_in, n_out))
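# ortho_weight is referenced above but not shown in this section. A minimal
# sketch, assuming the usual SVD-based construction from the Theano LSTM
# tutorial: draw a square Gaussian matrix and keep an orthonormal factor.
def ortho_weight(ndim):
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return numpy_floatX(u)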
def pred_error(self, data, iterator, verbose=False):
    """
    Errors for samples for a trained model
    """
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = pad_and_mask([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index],
                                  maxlen=None)
        preds = self.f_pred(x, mask)
        targets = numpy.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
    return valid_err
def create_unigram_noise_dist(self, wordcount):
    """
    Creates a Unigram noise distribution for NCE

    :type wordcount: dict
    :param wordcount: A dictionary containing frequency counts for words
    """
    counts = numpy.sort(list(wordcount.values()))[::-1]
    # Don't count the UNK and PAD symbols in the second count
    freq = [0, sum(counts[self.n_words:])] \
        + list(counts[:(self.n_words - 2)])
    assert len(freq) == self.n_words
    sum_freq = sum(freq)
    noise_distribution = [float(k) / sum_freq for k in freq]
    self.noise_distribution = init_tparams(
        OrderedDict([('noise_d', numpy_floatX(noise_distribution)
                      .reshape(self.n_words,))])
    )['noise_d']
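# Illustrative sketch (not part of the original module): drawing k noise words
# per example from the unigram distribution built above, assuming init_tparams
# wrapped the array in a Theano shared variable. `model`, `batch_size` and `k`
# are hypothetical.
k = 100
noise_probs = model.noise_distribution.get_value()
noise_samples = numpy.random.choice(len(noise_probs),
                                    size=(batch_size, k),
                                    p=noise_probs)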
def xavier_init(rng, n_in, n_out, activation, size=None):
    """
    Returns a matrix (n_in x n_out) based on the Xavier
    initialization technique
    """
    if activation not in [T.tanh, T.nnet.sigmoid, T.nnet.relu]:
        warnings.warn("You are using the Xavier init with an "
                      "activation function that is not sigmoidal or relu")

    # Default value for size
    if size is None:
        size = (n_in, n_out)

    W_values = numpy_floatX(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=size,
        ))

    if activation == T.nnet.sigmoid:
        return W_values * 4
    if activation == T.nnet.relu:
        return W_values * numpy.sqrt(2.)
    return W_values
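# Illustrative sketch (not part of the original module): using xavier_init to
# create a dense layer's weight matrix; the dimensions are hypothetical.
rng = numpy.random.RandomState(1234)
W = theano.shared(xavier_init(rng, n_in=512, n_out=512, activation=T.tanh),
                  name='W', borrow=True)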
def build_model(self, encoder='lstm', use_dropout=True):
    use_noise = theano.shared(numpy_floatX(0.))
    x = T.matrix('x', dtype='int64')
    mask = T.matrix('mask', dtype=theano.config.floatX)
    y = T.vector('y', dtype='int64')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                     n_samples,
                                                     self.dim_proj])
    proj = self.layers['lstm'].lstm_layer(emb, self.dim_proj, mask=mask)

    # TODO: What happens when the encoder is not an LSTM?
    # This should cleanly fall back to a normal hidden unit
    if encoder == 'lstm':
        # Mean-pool the hidden states over time while ignoring padding:
        # sum the masked states and divide by each sample's true length
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]

    if use_dropout:
        trng = RandomStreams(self.random_seed)
        proj = dropout_layer(proj, use_noise, trng)

    pred = T.nnet.softmax(T.dot(proj, self.tparams['U']) + self.tparams['b'])

    self.f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    self.f_pred = theano.function([x, mask], pred.argmax(axis=1),
                                  name='f_pred')

    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost = -T.log(pred[T.arange(n_samples), y] + off).mean()

    return use_noise, x, mask, y, cost
def sgd_optimization_nplm_mlp(learning_rate=1., L1_reg=0.0, L2_reg=0.0001,
                              n_epochs=1000, dataset='../../data/settimes',
                              batch_size=1000, n_in=150, n_h1=750, n_h2=150,
                              context_size=4, use_nce=False, nce_k=100,
                              use_dropout=False, dropout_p=0.5):
    SEED = 1234
    st_data = SeTimes(dataset, emb_dim=n_in)

    print("... Creating the partitions")
    train, valid = st_data.load_data(context_size=context_size)
    print("... Done creating partitions")

    print("... Building the model")
    # Symbolic variables for input and output for a batch
    x = T.imatrix('x')
    y = T.ivector('y')
    y_flat = T.ivector('y_flat')
    lr = T.scalar(name='lr')
    k = T.scalar(name='k')

    emb_x = st_data.dictionary.tparams['Wemb'][x.flatten()] \
        .reshape([x.shape[0], context_size * n_in])

    rng = numpy.random.RandomState(SEED)
    trng = RandomStreams(SEED)
    use_noise = theano.shared(numpy_floatX(0.))

    nce_q = st_data.dictionary.noise_distribution
    nce_samples = T.matrix('noise_s')

    model = NPLM(
        rng=rng,
        input=emb_x,
        n_in=context_size * n_in,
        n_h1=n_h1,
        n_h2=n_h2,
        n_out=st_data.dictionary.num_words(),
        use_nce=use_nce
    )

    tparams = OrderedDict()
    for i, nplm_m in enumerate(model.params):
        tparams['nplm_' + str(i)] = nplm_m
    tparams['Wemb'] = st_data.dictionary.Wemb

    # Cost to minimize
    if use_nce:
        # cost = model.loss(y, nce_samples, nce_q)
        cost = model.loss(y, y_flat, nce_samples, nce_q, k)
    else:
        # MLE via softmax
        cost = model.loss(y)

    # Add L2 reg to the cost
    cost += L2_reg * model.L2

    grads = T.grad(cost, wrt=list(tparams.values()))

    if use_nce:
        f_cost = theano.function([x, y, y_flat, nce_samples, k], cost,
                                 name='f_cost')
        f_grad_shared, f_update = sgd(lr, tparams, grads, cost,
                                      x, y, y_flat, nce_samples, k)
    else:
        f_cost = theano.function([x, y], cost, name='f_cost')
        f_grad_shared, f_update = gd(lr, tparams, grads, cost, x, y)

    print("... Optimization")

    kf_valid = get_minibatches_idx(len(valid[0]), batch_size)

    print("%d training examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))

    disp_freq = 10
    valid_freq = len(train[0]) // batch_size
    save_freq = len(train[0]) // batch_size

    uidx = 0
    estop = False
    start_time = time.time()
    total_output_words = st_data.dictionary.num_words()

    for eidx in range(n_epochs):
        n_samples = 0

        # Shuffle and get training stuff
        kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

        for _, train_index in kf:
            uidx += 1
            use_noise.set_value(1.)

            x_batch = [train[0][t] for t in train_index]
            y_batch = [train[1][t] for t in train_index]
            y_f_batch = [train[1][t] + i * st_data.dictionary.num_words()
                         for i, t in enumerate(train_index)]

            # Convert x and y into numpy objects
            x_batch = numpy.asarray(x_batch, dtype='int32')
            y_batch = numpy.asarray(y_batch, dtype='int32')
            y_f_batch = numpy.asarray(y_f_batch, dtype='int32')
            local_batch_size = x_batch.shape[0]

            if use_nce:
                # Create noise samples to be passed as well
                # Expected size is (bs, k)
                # Don't sample UNK and PAD
                noisy_samples = numpy.zeros((local_batch_size,
                                             st_data.dictionary.num_words()),
                                            dtype='float32')
                # The following marks approximately nce_k vocabulary entries
                # per row (repeats permitted) with the value 1. These
                # represent the noise samples drawn from the vocab.
                noisy_samples[
                    numpy.arange(local_batch_size).reshape(local_batch_size, 1),
                    numpy.random.randint(2, total_output_words,
                                         size=(local_batch_size, nce_k))
                ] = 1.
                loss = f_grad_shared(x_batch, y_batch, y_f_batch,
                                     noisy_samples, nce_k)
            else:
                loss = f_grad_shared(x_batch, y_batch)

            f_update(learning_rate)

            if numpy.isnan(loss) or numpy.isinf(loss):
                print('bad cost detected: ', loss)
                return 1., 1.

            if numpy.mod(uidx, disp_freq) == 0:
                print('Epoch', eidx, 'Update', uidx, 'Cost', loss)

    end_time = time.time()
    print('Training took %.1fs' % (end_time - start_time))
    f_grad_shared.profile.print_summary()
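# get_minibatches_idx is used above but not shown in this section. A minimal
# sketch consistent with how it is called here (it must yield
# (minibatch_index, example_indices) pairs), modeled on the Theano LSTM
# tutorial helper of the same name:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)
    minibatches = []
    start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    if start != n:
        # Put the remaining examples in a final, smaller minibatch
        minibatches.append(idx_list[start:])
    return zip(range(len(minibatches)), minibatches)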
def __init__(self, rng, input, n_in, n_h1, n_h2, n_out, use_dropout=False,
             trng=None, dropout_p=0.5,
             use_noise=theano.shared(numpy_floatX(0.)), use_nce=False):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_h1: int
    :param n_h1: number of units in the first hidden layer

    :type n_h2: int
    :param n_h2: number of units in the second hidden layer

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # The first hidden layer
    # The input is the concatenated word embeddings for all
    # words in the context input and the batch
    self.h1 = DenseLayer(rng=rng, input=input, n_in=n_in, n_out=n_h1,
                         activation=T.nnet.relu)

    # Use dropout if specified
    h2_input = self.h1.output
    if use_dropout:
        assert trng is not None
        h2_input = dropout_layer(self.h1.output, use_noise, trng, dropout_p)

    # The second hidden layer
    self.h2 = DenseLayer(rng=rng, input=h2_input, n_in=n_h1, n_out=n_h2,
                         activation=T.nnet.relu)

    # Apply dropout if specified
    log_reg_input = self.h2.output
    if use_dropout:
        log_reg_input = dropout_layer(self.h2.output, use_noise, trng,
                                      dropout_p)

    # The logistic regression layer
    self.log_regression_layer = LogisticRegression(input=log_reg_input,
                                                   n_in=n_h2,
                                                   n_out=n_out)

    # Use L2 regularization, for the log-regression layer only
    self.L2 = reg.L2([self.log_regression_layer.W])

    # Get the NLL loss function from the logistic regression layer
    if use_nce:
        self.loss = self.log_regression_layer.nce_loss
    else:
        self.loss = self.log_regression_layer.loss

    # Bundle params (to be used for computing gradients)
    self.params = self.h1.params + self.h2.params + \
        self.log_regression_layer.params

    # Keep track of the input (for debugging only)
    self.input = input
def build_model(self):
    trng = RandomStreams(self.random_seed)
    use_noise = theano.shared(numpy_floatX(0.))

    # Source and target sequences (T x N matrices of word indices)
    x = T.matrix('x', dtype='int64')
    y = T.matrix('y', dtype='int64')
    # Since we are simply predicting the next word, the
    # following statement shifts the content of y by 1
    # in the time dimension for prediction (axis 0, assuming TxN)
    y_prime = T.roll(y, -1, 0)

    mask_x = T.matrix('mask_x', dtype=theano.config.floatX)
    mask_y = T.matrix('mask_y', dtype=theano.config.floatX)

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Convert word indices to their embeddings
    # Resulting dims are (T x N x dim_proj)
    emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                     n_samples,
                                                     self.dim_proj])

    # Compute the hidden states
    # Note that these contain hidden states for elements which were
    # padded in input. The cost for these time steps is removed
    # before the calculation of the cost.
    enc_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj,
                                                      mask=mask_x)
    # Use dropout on non-recurrent connections (Zaremba et al.)
    if self.use_dropout:
        enc_proj_1 = dropout_layer(enc_proj_1, use_noise, trng)

    enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1,
                                                      self.dim_proj,
                                                      mask=mask_x)
    if self.use_dropout:
        enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

    # Use the final state of the encoder as the initial hidden state
    # of the decoder
    src_embedding = enc_proj_2[-1]

    # Run decoder LSTM
    # TODO: This block is unfinished: it currently re-runs the encoder
    # layers on the source embeddings instead of decoding from
    # `src_embedding` and the target-side embeddings.
    dec_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj,
                                                      mask=mask_x)
    if self.use_dropout:
        dec_proj_1 = dropout_layer(dec_proj_1, use_noise, trng)

    dec_proj_2 = self.layers['enc_lstm_2'].lstm_layer(dec_proj_1,
                                                      self.dim_proj,
                                                      mask=mask_x)
    if self.use_dropout:
        dec_proj_2 = dropout_layer(dec_proj_2, use_noise, trng)

    pre_s = T.dot(dec_proj_2, self.tparams['U']) + self.tparams['b']
    # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
    # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
    # -1 is a proxy for infer dim based on input (numpy style)
    pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
    pred_r = T.nnet.softmax(pre_s_r)

    off = 1e-8
    if pred_r.dtype == 'float16':
        off = 1e-6

    # Note the use of flatten here. We can't directly index a 3-tensor
    # and hence we use the (T*N)xV view which is indexed by the flattened
    # label matrix, dim = (T*N)x1
    # Also, the cost (before calculating the mean) is multiplied
    # (element-wise) with the mask to eliminate the cost of elements that
    # do not really exist, i.e. do not include the cost for elements
    # which are padded
    cost = -T.sum(T.log(pred_r[T.arange(pred_r.shape[0]),
                               y.flatten()] + off) *
                  mask_y.flatten()) / T.sum(mask_y)

    self.f_cost = theano.function([x, y, mask_x, mask_y], cost,
                                  name='f_cost')

    return use_noise, x, y, mask_x, mask_y, cost
def rmsprop(lr, tparams, grads, cost, *args):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    *args : Theano variables
        Model inputs (e.g. x, mask, y) needed to compute the cost
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    grad_input = list(args)
    f_grad_shared = theano.function(grad_input, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
def build_model(self):
    trng = RandomStreams(self.random_seed)
    use_noise = theano.shared(numpy_floatX(0.))

    x = T.matrix('x', dtype='int64')
    # Since we are simply predicting the next word, the
    # following statement shifts the content of x by 1
    # in the time dimension for prediction (axis 0, assuming TxN)
    y = T.roll(x, -1, 0)
    mask = T.matrix('mask', dtype=theano.config.floatX)

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Convert word indices to their embeddings
    # Resulting dims are (T x N x dim_proj)
    emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                     n_samples,
                                                     self.dim_proj])

    # Dropout input if necessary
    if self.use_dropout:
        emb = dropout_layer(emb, use_noise, trng)

    # Compute the hidden states
    # Note that these contain hidden states for elements which were
    # padded in input. The cost for these time steps is removed
    # before the calculation of the cost.
    proj_1 = self.layers['lstm_1'].lstm_layer(
        emb, mask=mask, restore_final_to_initial_hidden=True)
    # Use dropout on non-recurrent connections (Zaremba et al.)
    if self.use_dropout:
        proj_1 = dropout_layer(proj_1, use_noise, trng)

    proj = self.layers['lstm_2'].lstm_layer(
        proj_1, mask=mask, restore_final_to_initial_hidden=True)
    if self.use_dropout:
        proj = dropout_layer(proj, use_noise, trng)

    pre_s_lstm = self.layers['logit_lstm'].logit_layer(proj)
    pre_s_input = self.layers['logit_prev_word'].logit_layer(emb)
    pre_s = self.layers['logit'].logit_layer(
        T.tanh(pre_s_lstm + pre_s_input))

    # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
    # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
    # -1 is a proxy for infer dim based on input (numpy style)
    pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
    pred_r = T.nnet.softmax(pre_s_r)

    off = 1e-8
    if pred_r.dtype == 'float16':
        off = 1e-6

    # Note the use of flatten here. We can't directly index a 3-tensor
    # and hence we use the (T*N)xV view which is indexed by the flattened
    # label matrix, dim = (T*N)x1
    # Also, the cost (before calculating the mean) is multiplied
    # (element-wise) with the mask to eliminate the cost of elements that
    # do not really exist, i.e. do not include the cost for elements
    # which are padded
    cost = -T.sum(
        T.log(pred_r[T.arange(pred_r.shape[0]), y.flatten()] + off) *
        mask.flatten()) / T.sum(mask)

    self.f_cost = theano.function([x, mask], cost, name='f_cost')

    return use_noise, x, mask, cost
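# Illustrative sketch (not part of the original module): the masked cost above
# is the average negative log-likelihood per non-padded token, so validation
# perplexity can be read off directly from f_cost. `model`, `x_val` and
# `mask_val` are hypothetical.
nll = model.f_cost(x_val, mask_val)
perplexity = numpy.exp(nll)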
def build_decode(self):
    # Input to start the recurrence with
    trng = RandomStreams(self.random_seed)
    use_noise = theano.shared(numpy_floatX(0.))
    x = T.matrix('x', dtype='int64')
    # Number of steps we want the recurrence to run for
    n_timesteps = T.iscalar('n_timesteps')
    n_samples = x.shape[1]

    # The mask for the first layer has to be all 1s.
    # It does not make sense to complete a sentence for which
    # the mask is 1 1 0 (because it's already complete).
    mask = T.matrix('mask', dtype=theano.config.floatX)
    # This is a dummy mask, we want to consider all hidden states for
    # the second layer when decoding
    mask_2 = T.alloc(numpy_floatX(1.), n_timesteps, n_samples)

    emb = self.tparams['Wemb'][x.flatten()].reshape(
        [x.shape[0], x.shape[1], self.dim_proj])

    def output_to_input_transform(output, emb):
        """
        output : The previous hidden state (Nxd)
        """
        # N x V
        pre_soft_lstm = self.layers['logit_lstm'].logit_layer(output)
        pre_soft_input = self.layers['logit_prev_word'].logit_layer(emb)
        pre_soft = self.layers['logit'].logit_layer(
            T.tanh(pre_soft_lstm + pre_soft_input))
        pred = T.nnet.softmax(pre_soft)
        # N x 1
        pred_argmax = pred.argmax(axis=1)
        # N x d (flatten is probably redundant)
        new_input = self.tparams['Wemb'][pred_argmax.flatten()].reshape(
            [n_samples, self.dim_proj])
        return new_input

    proj_1 = self.layers['lstm_1'].lstm_layer(
        emb, self.dim_proj, mask=mask, n_steps=n_timesteps,
        output_to_input_func=output_to_input_transform)
    if self.use_dropout:
        proj_1 = dropout_layer(proj_1, use_noise, trng)

    proj = self.layers['lstm_2'].lstm_layer(proj_1, self.dim_proj,
                                            mask=mask_2)
    if self.use_dropout:
        proj = dropout_layer(proj, use_noise, trng)

    pre_s_lstm = self.layers['logit_lstm'].logit_layer(proj)
    pre_s_input = self.layers['logit_prev_word'].logit_layer(emb)
    pre_s = self.layers['logit'].logit_layer(
        T.tanh(pre_s_lstm + pre_s_input))

    # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
    # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
    # -1 is a proxy for infer dim based on input (numpy style)
    pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
    # Softmax will receive all-0s for previously padded entries
    # (T*N) x V
    pred_r = T.nnet.softmax(pre_s_r)
    # T x N; skip the first two vocabulary entries (PAD and UNK) when
    # taking the argmax, then shift the indices back
    pred = (T.reshape(pred_r, pre_s.shape)[:, :, 2:]).argmax(axis=2) + 2

    self.f_decode = theano.function([x, mask, n_timesteps], pred,
                                    name='f_decode')

    return use_noise, x, mask, n_timesteps
def gru_layer(self, state_below, mask=None, n_steps=None,
              output_to_input_func=None,
              restore_final_to_initial_hidden=False):
    """
    Recurrence with a GRU hidden unit

    state_below : Is the input. This may be a single sample with
                  multiple timesteps, or a batch
    mask : The mask applied to the input for batching
    n_steps : The number of steps for which this recurrence should be run.
              This is only required with partial input. For any step where
              no input is available, the output_to_input_func is applied to
              the previous output and is then used as input
    output_to_input_func : The function to be applied to generate input
                           when only partial input is available
    restore_final_to_initial_hidden : Use the final hidden state as the
        initial hidden state for the next batch
        WARNING : Assumes that batches are of the same size since the size
        of the initial state is fixed to Nxd
        TODO: Possibly think about averaging final states to make this
        independent of the number of samples
    """
    # Make sure that we've initialized the tparams
    assert len(self.tparams) > 0

    # State below : steps x samples x dim_proj
    # If n_steps is not provided, infer it
    if n_steps is None:
        nsteps = state_below.shape[0]
    else:
        # If n_steps is provided, this is the incomplete input setting.
        # Make sure that a function is provided to transform output
        # from the previous time step into input
        # TODO: This output function may require input from several time
        # steps instead of just the previous one. Make this modification
        nsteps = n_steps
        if output_to_input_func is None:
            raise Exception('n_steps was given to the GRU but no output '
                            'to input function was specified')

    # Hack to make sure that the theano ifelse compiles
    if output_to_input_func is None:
        output_to_input_func = dummy_func

    # Check if the input is a batch or a single sample
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1
        warnings.warn("You seem to be supplying single samples for "
                      "recurrence. You may see speedup gains with using "
                      "batches instead.")

    # Initialize mask if not specified
    if mask is None:
        if state_below.ndim == 3:
            mask = T.alloc(numpy_floatX(1.), nsteps, n_samples)
        else:
            mask = T.alloc(numpy_floatX(1.), n_samples)

    # Initialize initial hidden state if not specified
    # Restore final hidden state to new initial hidden state
    if restore_final_to_initial_hidden and self.h_final is not None:
        h0 = self.h_final
    else:
        h0 = T.alloc(numpy_floatX(0.), n_samples, self.dim_proj)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # TODO: Make the same mask-initialization change to the LSTM module
    def _step(t_, h_, mask, state_below, state_below_h_c):
        """
        t_ is the index of the current timestep
        h_ is the previous hidden state
        mask is the full mask (T x N); mask[t_] is this timestep's mask
        state_below is the input, pre-multiplied with the weight
        matrices, i.e. state_below[t_] = (X.W + b)[t_]
        state_below_h_c is the same for the candidate hidden state
        """
        preact = T.dot(h_, self.tparams[_p(self.prefix, 'U')])
        x_ = ifelse(
            T.lt(t_, state_below.shape[0]),
            state_below[t_],
            T.dot(output_to_input_func(h_),
                  self.tparams[_p(self.prefix, 'W')]) +
            self.tparams[_p(self.prefix, 'b')])
        preact += x_

        # The input to the sigmoid is preact[:, :, 0:d]
        # Similar slices are used for the rest of the gates
        r = T.nnet.sigmoid(_slice(preact, 0, self.dim_proj))
        z = T.nnet.sigmoid(_slice(preact, 1, self.dim_proj))

        # The proposal (candidate) hidden state
        preact_h = T.dot(h_, self.tparams[_p(self.prefix, 'U_h')])
        preact_h = preact_h * r

        h_c_ = ifelse(
            T.lt(t_, state_below_h_c.shape[0]),
            state_below_h_c[t_],
            T.dot(output_to_input_func(h_),
                  self.tparams[_p(self.prefix, 'W_h')]) +
            self.tparams[_p(self.prefix, 'b_h')])

        preact_h = preact_h + h_c_

        h = T.tanh(preact_h)
        h = z * h_ + (1 - z) * h

        # None adds a dimension to the mask (N,) -> (N, 1)
        # Where the mask value is 1, use the value of the current
        # hidden state, otherwise use the one from the previous
        # state when the mask value is 0.
        # This ensures that values generated for absent elements
        # marked with <PAD> will not be used
        h = ifelse(T.lt(t_, state_below.shape[0]),
                   mask[t_][:, None] * h + (1. - mask[t_])[:, None] * h_,
                   h)

        return h

    state_below = (T.dot(state_below, self.tparams[_p(self.prefix, 'W')]) +
                   self.tparams[_p(self.prefix, 'b')])

    # Transformation to calculate the candidate hidden state
    state_below_h_c = (T.dot(state_below,
                             self.tparams[_p(self.prefix, 'W_h')]) +
                       self.tparams[_p(self.prefix, 'b_h')])

    # Since _step has a single output, scan returns the stacked hidden
    # states directly as one T x N x dim_proj tensor (not a list)
    rval, updates = theano.scan(_step,
                                sequences=[T.arange(nsteps)],
                                outputs_info=[h0],
                                non_sequences=[mask, state_below,
                                               state_below_h_c],
                                name=_p(self.prefix, '_layers'),
                                n_steps=nsteps)

    # Save the final state to be used as the next initial hidden state
    if restore_final_to_initial_hidden:
        self.h_final = rval[-1]

    # Returns the hidden states (t elements of N x dim_proj)
    return rval
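# Illustrative sketch (not part of the original module): running the GRU in
# the partial-input (decoding) setting. The layer key 'gru_1' and
# make_next_input (a one-argument function mapping the previous hidden state
# to the next input embedding) are hypothetical; emb and mask are the primer
# embeddings and mask as used elsewhere in this code.
hidden_states = self.layers['gru_1'].gru_layer(
    emb,
    mask=mask,
    n_steps=20,
    output_to_input_func=make_next_input)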