def create_model(self):
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim

    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0), name='h_to_o')

    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()

    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"

    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #    h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"

    # only values of hidden units of the last timeframe are used for
    # the classification
    indices = T.sum(mask, axis=0) - 1
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)

    probs = out
    return probs
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                           weights_init=initialization.Uniform(width=0.01))
    # RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()

    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    h = recurrent_block.apply(m)
    a = linear.apply(h)
    y_hat = softmax.apply(a, extra_ndim=1)
    # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims,
                    activations):
    # Construct the model
    x = tensor.lmatrix('features')
    y = tensor.lvector('targets')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')
    hidden = MLP(activations=activations + [None],
                 dims=[ngram_order * embedding_dim] + hidden_dims + [vocab_size])

    embeddings = lookup.apply(x)
    embeddings = embeddings.flatten(ndim=2)  # Concatenate embeddings
    activations = hidden.apply(embeddings)
    cost = Softmax().categorical_cross_entropy(y, activations)

    # Initialize parameters
    lookup.weights_init = IsotropicGaussian(0.001)
    hidden.weights_init = IsotropicGaussian(0.01)
    hidden.biases_init = Constant(0.001)
    lookup.initialize()
    hidden.initialize()

    return cost
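# A minimal usage sketch for the n-gram model above (not part of the original
# snippet): the vocabulary size, n-gram order, hidden width and Tanh activation
# are hypothetical choices, and the training-algorithm calls follow the same
# GradientDescent/Scale pattern used elsewhere in this collection.
ngram_cost = construct_model(vocab_size=10000, embedding_dim=50, ngram_order=4,
                             hidden_dims=[256], activations=[Tanh()])
ngram_cost.name = 'cost'
ngram_cg = ComputationGraph(ngram_cost)
ngram_algorithm = GradientDescent(cost=ngram_cost,
                                  parameters=ngram_cg.parameters,
                                  step_rule=Scale(learning_rate=0.1))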
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') #embed.weights_init = IsotropicGaussian(0.01) embed.weights_init = Constant( init_embedding_table(filename='embeddings/vocab_embeddings.txt')) # one directional LSTM encoding q_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='q_lstm_in') q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm') c_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='c_lstm_in') c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm') bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins] q_tmp = q_lstm_ins.apply(embed.apply(question)) c_tmp = c_lstm_ins.apply(embed.apply(context)) q_hidden, _ = q_lstm.apply(q_tmp, mask=question_mask.astype( theano.config.floatX)) # lq, bs, dim c_hidden, _ = c_lstm.apply(c_tmp, mask=context_mask.astype( theano.config.floatX)) # lc, bs, dim # Attention mechanism Bilinear question attention_question = Linear(input_dim=config.pre_lstm_size, output_dim=config.pre_lstm_size, name='att_question') bricks += [attention_question] att_weights_question = q_hidden[ None, :, :, :] * attention_question.apply( c_hidden.reshape( (c_hidden.shape[0] * c_hidden.shape[1], c_hidden.shape[2]))).reshape( (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))[:, None, :, :] # --> lc,lq,bs,dim att_weights_question = att_weights_question.sum( axis=3) # sum over axis 3 -> dimensions --> lc,lq,bs att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,bs,lq att_weights_question = att_weights_question.reshape( (att_weights_question.shape[0] * att_weights_question.shape[1], att_weights_question.shape[2])) # --> lc*bs,lq att_weights_question = tensor.nnet.softmax( att_weights_question ) # softmax over axis 1 -> length of question # --> lc*bs,lq att_weights_question = att_weights_question.reshape( (c_hidden.shape[0], q_hidden.shape[1], q_hidden.shape[0])) # --> lc,bs,lq att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,lq,bs attended_question = tensor.sum( q_hidden[None, :, :, :] * att_weights_question[:, :, :, None], axis=1) # sum over axis 1 -> length of question --> lc,bs,dim attended_question.name = 'attended_question' # Match LSTM cqembed = tensor.concatenate([c_hidden, attended_question], axis=2) mlstms, mhidden_list = make_bidir_lstm_stack( cqembed, 2 * config.pre_lstm_size, context_mask.astype(theano.config.floatX), config.match_lstm_size, config.match_skip_connections, 'match') bricks = bricks + mlstms if config.match_skip_connections: menc_dim = 2 * sum(config.match_lstm_size) menc = tensor.concatenate(mhidden_list, axis=2) else: menc_dim = 2 * config.match_lstm_size[-1] menc = 
tensor.concatenate(mhidden_list[-2:], axis=2) menc.name = 'menc' # Attention mechanism MLP start attention_mlp_start = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_start') attention_clinear_start = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], name='attm_start') # Wym bricks += [attention_mlp_start, attention_clinear_start] layer1_start = Tanh(name='layer1_start') layer1_start = layer1_start.apply( attention_clinear_start.apply( menc.reshape( (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape( (menc.shape[0], menc.shape[1], config.attention_mlp_hidden[0]))) att_weights_start = attention_mlp_start.apply( layer1_start.reshape( (layer1_start.shape[0] * layer1_start.shape[1], layer1_start.shape[2]))) att_weights_start = att_weights_start.reshape( (layer1_start.shape[0], layer1_start.shape[1])) att_weights_start = tensor.nnet.softmax(att_weights_start.T).T attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0) attended.name = 'attended' # Attention mechanism MLP end attention_mlp_end = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_end') attention_qlinear_end = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], name='atts_end') #Wum attention_clinear_end = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attm_end') # Wym bricks += [ attention_mlp_end, attention_qlinear_end, attention_clinear_end ] layer1_end = Tanh(name='layer1_end') layer1_end = layer1_end.apply( attention_clinear_end.apply( menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2] ))).reshape((menc.shape[0], menc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_end.apply(attended)[None, :, :]) att_weights_end = attention_mlp_end.apply( layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1], layer1_end.shape[2]))) att_weights_end = att_weights_end.reshape( (layer1_end.shape[0], layer1_end.shape[1])) att_weights_end = tensor.nnet.softmax(att_weights_end.T).T att_weights_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_weights_start) att_weights_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_weights_end) # add attention from left and right att_weights = att_weights_start * att_weights_end #att_weights = tensor.minimum(att_weights_start, att_weights_end) att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]), dtype=theano.config.floatX) att_target = tensor.set_subtensor( att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1) att_target = att_target.dimshuffle(1, 0) #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)), # tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = 
VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, mhidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' att_weights_start.name = 'att_weights_start' att_weights_end.name = 'att_weights_end' att_weights.name = 'att_weights' att_target.name = 'att_target' self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] self.analyse_vars = [ cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target ] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def create_model(self, symbols_num = 500): # Hyperparameters # The dimension of the hidden state of the GRUs in each direction. hidden_states = self.args.encoder_hidden_dims # Dimension of the word-embedding space embedding_dims = self.args.source_embeddings_dim ################### # Declaration of the Theano variables that come from the data stream ################### # The context document. context_bt = tt.lmatrix('context') # Context document mask used to distinguish real symbols from the sequence and padding symbols that are at the end context_mask_bt = tt.matrix('context_mask') # The question question_bt = tt.lmatrix('question') question_mask_bt = tt.matrix('question_mask') # The correct answer y = tt.lmatrix('answer') y = y[:,0] # originally answers are in a 2d matrix, here we convert it to a vector # The candidates among which the answer is selected candidates_bi = tt.lmatrix("candidates") candidates_bi_mask = tt.matrix("candidates_mask") ################### # Network's components ################### # Lookup table with randomly initialized word embeddings lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2)) # bidirectional encoder that translates context context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states) # bidirectional encoder for question question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states) # Initialize the components (where not done upon creation) lookup.initialize() ################### # Wiring the components together # # Where present, the 3 letters at the end of the variable name identify its dimensions: # b ... position of the example within the batch # t ... position of the word within the document/question # f ... features of the embedding vector ################### ### Read the context document # Map token indices to word embeddings context_embedding_tbf = lookup.apply(context_bt.T) # Read the embedded context document using the bidirectional GRU and produce the contextual embedding of each word memory_encoded_btf = context_encoder.apply(context_embedding_tbf, context_mask_bt.T).dimshuffle(1,0,2) memory_encoded_btf.name = "memory_encoded_btf" ### Correspondingly, read the query x_embedded_tbf = lookup.apply(question_bt.T) x_encoded_btf = question_encoder.apply(x_embedded_tbf, question_mask_bt.T).dimshuffle(1,0,2) # The query encoding is a concatenation of the final states of the forward and backward GRU encoder x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states] x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2] query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1) # Compute the attention on each word in the context as a dot product of its contextual embedding and the query mem_attention_presoft_bt = tt.batched_dot(query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1)) # TODO is this pre-masking necessary? 
mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,context_mask_bt) # Normalize the attention using softmax mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_presoft_masked_bt,context_mask_bt) if self.args.weighted_att: # compute weighted attention over original word vectors att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2)) # compare desired response to all candidate responses # select relevant candidate answer words candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1) # convert it to output symbol probabilities y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi) y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask) else: # Sum the attention of each candidate word across the whole context document, # this is the key innovation of the model # TODO: Get rid of sentence-by-sentence processing? # TODO: Rewrite into matrix notation instead of scans? def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs): word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0] return sentence_attention_probs[word_ixs_in_sentence].sum() def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t): result, updates = theano.scan( fn=sum_prob_of_word, sequences=[candidate_indices_i], non_sequences=[sentence_ixs_t, sentence_attention_probs_t]) return result def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt, sentence_attention_probs_bt): result, updates = theano.scan( fn=sum_probs_single_sentence, sequences=[candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt], non_sequences=None) return result # Sum the attention of each candidate word across the whole context document y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt) y_hat.name = "y_hat" # We use the convention that ground truth is always at index 0, so the following are the target answers y = y.zeros_like() # We use Cross Entropy as the training objective cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) cost.name = "cost" predicted_response_index = tt.argmax(y_hat,axis=1) accuracy = tt.eq(y,predicted_response_index).mean() accuracy.name = "accuracy" return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([ cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2) clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism Bilinear attention_clinear_1 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_1') bricks += [attention_clinear_1] att_start = qenc[None, :, :] * attention_clinear_1.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_start = att_start.sum(axis=2) att_start = tensor.nnet.softmax(att_start.T).T attention_clinear_2 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_2') bricks += [attention_clinear_2] att_end = qenc[None, :, :] * attention_clinear_2.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_end = att_end.sum(axis=2) att_end = tensor.nnet.softmax(att_end.T).T att_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_start) att_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_end) # add attention from left and right att_weights = att_start * att_end att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) att_weights = att_weights / (att_weights.sum(axis=0) 
+ 0.00001) #cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() cost = (((att_weights - att_target)**2) * context_mask).sum() / context_mask.sum() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' att_start.name = 'att_start' att_end.name = 'att_end' att_weights.name = 'att_weights' att_target.name = 'att_target' self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] self.analyse_vars = [ cost, self.predictions, att_start, att_end, att_weights, att_target ] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
class NeuralLM:
    def __init__(self, x, y, vocab_size, hidden_size, num_layers,
                 pretrained_embeds=None):
        """
        Implements a neural language model using an LSTM.
        Word y_n+1 ~ Softmax(U * h_n)

        :param x A minibatch: each row is an instance (a sequence), with batch_size rows
        :param y x shifted by 1, which are the target words to predict for the
            language modeling objective based on the hidden LSTM state
        :param vocab_size The number of types in the training data
        :param hidden_size The dimensionality of the word embeddings and the LSTM hidden state
        :param pretrained_embeds Pretrained embeddings for initialization as an ND array
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Initialize the word embedding table. If we have pretrained embeddings, we use those
        self.word_embedding_lookup = LookupTable(length=vocab_size,
                                                 dim=hidden_size,
                                                 name="word_embeddings")
        if pretrained_embeds is None:
            initialize(self.word_embedding_lookup, 0.8)
        else:
            assert (pretrained_embeds.shape[0] == vocab_size
                    and pretrained_embeds.shape[1] == hidden_size)
            self.word_embedding_lookup.weights_init = Constant(pretrained_embeds)
            self.word_embedding_lookup.biases_init = Constant(0)
            self.word_embedding_lookup.initialize()

        self.word_embeddings = self.word_embedding_lookup.W

        # Store the cost in a private attribute so it does not clash with the
        # `cost` property below.
        self.y_hat, self._cost, self.cells = self.nn_fprop(x, y, num_layers)

    def lstm_layer(self, h, n):
        """
        Performs the LSTM update for a batch of word sequences

        :param h The word embeddings for this update
        :param n The number of layers of the LSTM
        """
        # Maps the word embedding to a dimensionality to be used in the LSTM
        linear = Linear(input_dim=self.hidden_size,
                        output_dim=self.hidden_size * 4,
                        name='linear_lstm' + str(n))
        initialize(linear, sqrt(6.0 / (5 * self.hidden_size)))
        lstm = LSTM(dim=self.hidden_size, name='lstm' + str(n))
        initialize(lstm, 0.08)
        return lstm.apply(linear.apply(h))

    def softmax_layer(self, h, y):
        """
        Perform Softmax over the hidden state in order to predict the next word
        in the sequence and compute the loss.

        :param h The hidden state sequence
        :param y The target words
        """
        hidden_to_output = Linear(name='hidden_to_output',
                                  input_dim=self.hidden_size,
                                  output_dim=self.vocab_size)
        initialize(hidden_to_output,
                   sqrt(6.0 / (self.hidden_size + self.vocab_size)))
        linear_output = hidden_to_output.apply(h)
        linear_output.name = 'linear_output'
        softmax = NDimensionalSoftmax(name="lm_softmax")
        y_hat = softmax.log_probabilities(linear_output, extra_ndim=1)
        y_hat.name = 'y_hat'
        cost = softmax.categorical_cross_entropy(y, linear_output,
                                                 extra_ndim=1).mean()
        cost.name = 'cost'
        return y_hat, cost

    def nn_fprop(self, x, y, num_layers):
        h = T.nnet.sigmoid(self.word_embedding_lookup.apply(x))  # constrain the word embeddings
        cells = []
        for i in range(num_layers):
            h, c = self.lstm_layer(h, i)
            cells.append(c)
        return self.softmax_layer(h, y) + (cells, )

    @property
    def cost(self):
        # Returning self.cost here would recurse; the value computed in
        # __init__ is kept in self._cost instead.
        return self._cost

    @property
    def embeddings(self):
        return self.word_embeddings
def _build_lookup(self, name, word_num, dim=1, *args, **kwargs):
    lookup = LookupTable(length=word_num, dim=dim, name=name)
    lookup.weights_init = Constant(1. / word_num**0.25)
    lookup.initialize()
    return lookup
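# Usage sketch for the helper above (not from the original code): this would
# live inside another method of the same class; the source name and vocabulary
# size are hypothetical, and the lmatrix-of-indices input follows the same
# pattern as the other snippets in this collection.
word_ids = tensor.lmatrix('word_ids')                    # (batch, sequence) indices
freq_lookup = self._build_lookup('word_freq', word_num=50000, dim=1)
word_scores = freq_lookup.apply(word_ids)                # (batch, sequence, 1) scores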
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()
    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Compute the weight matrix for every word in the context and then compute
    # the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream, algorithm=algorithm,
                    extensions=extensions)
    main.run()

    pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
def __init__(self, config, vocab_size): context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) context_bag = to_bag(context, vocab_size) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') #embed.weights_init = Constant(embeddings_initial_value) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Build the encoder bricks transition = GatedRecurrent(activation=Tanh(), dim=config.generator_lstm_size, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, attended_dim=cenc_dim, match_dim=config.generator_lstm_size, name="attention") readout = Readout(readout_dim=vocab_size, source_names=[ transition.apply.states[0], attention.take_glimpses.outputs[0] ], emitter=MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter'), feedback_brick=LookupFeedback( vocab_size, config.feedback_size), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator") cost = generator.cost(answer, answer_mask.astype(theano.config.floatX), attended=cenc, attended_mask=context_mask.astype( theano.config.floatX), name="cost") self.predictions = generator.generate( n_steps=7, batch_size=config.batch_size, attended=cenc, attended_mask=context_mask.astype(theano.config.floatX), iterate=True)[1] # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # initialize new stuff manually (change!) generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0) generator.push_allocation_config() generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def create_model(self):
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim

    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0), name='h_to_o')

    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()

    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"

    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #    h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"

    # only values of hidden units of the last timeframe are used for
    # the classification
    indices = T.sum(mask, axis=0) - 1
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)

    probs = 1 - out
    probs.name = "probability"

    y = y.dimshuffle(0, 'x')
    # Create the if-else cost function
    pos_ex = (y * probs) / p
    neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
    reward = pos_ex + neg_ex
    cost = reward  # the class-weighted reward is used directly as the cost
    cost.name = "cost"
    return cost
x = tensor.imatrix('features')
y = tensor.ivector('targets')
v = dataset.get_vocab_size()

input_to_hidden = LookupTable(name='input_to_hidden', length=v,
                              dim=hidden_size)
h = tensor.mean(input_to_hidden.apply(x), axis=1)

hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                          output_dim=v)
y_hat = Softmax().apply(hidden_to_output.apply(h))

input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (abs(W1)).sum() + 0.005 * (abs(W2)).sum()
cost.name = 'cost'

algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))
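# The snippet above stops after building the training algorithm. A minimal way
# to drive it, mirroring the DataStream/MainLoop pattern used elsewhere in this
# collection; `dataset` is assumed to be a Fuel dataset, and the batch size and
# epoch count are hypothetical.
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(dataset.num_examples, 256))
main_loop = MainLoop(data_stream=data_stream,
                     algorithm=algorithm,
                     extensions=[TrainingDataMonitoring([cost], after_epoch=True),
                                 FinishAfter(after_n_epochs=5),
                                 Printing()])
main_loop.run()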
def main(model_path, recurrent_type): dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, _make_target, add_sources=('target',)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) features = tensor.lmatrix('features') features_mask = tensor.matrix('features_mask') target = tensor.lmatrix('target') target_mask = tensor.matrix('target_mask') dim = 100 lookup = LookupTable(len(all_chars), dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) if recurrent_type == 'lstm': rnn = LSTM(dim / 4, Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) elif recurrent_type == 'simple': rnn = SimpleRecurrent(dim, Tanh()) rnn = Bidirectional(rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) else: raise ValueError('Not known RNN type') rnn.initialize() lookup.initialize() y_hat = rnn.apply(lookup.apply(features), mask=features_mask) print len(all_chars) linear = Linear(2 * dim, len(all_chars), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) linear.initialize() y_hat = linear.apply(y_hat) seq_lenght = y_hat.shape[0] batch_size = y_hat.shape[1] y_hat = Softmax().apply(y_hat.reshape((seq_lenght * batch_size, -1))).reshape(y_hat.shape) cost = CategoricalCrossEntropy().apply( target.flatten(), y_hat.reshape((-1, len(all_chars)))) * seq_lenght * batch_size cost.name = 'cost' cost_per_character = cost / features_mask.sum() cost_per_character.name = 'cost_per_character' cg = ComputationGraph([cost, cost_per_character]) model = Model(cost) algorithm = GradientDescent(step_rule=Adam(), cost=cost, params=cg.parameters) train_monitor = TrainingDataMonitoring( [cost, cost_per_character], prefix='train', after_batch=True) extensions = [train_monitor, Printing(every_n_batches=40), Dump(model_path, every_n_batches=200), #Checkpoint('rnn.pkl', every_n_batches=200) ] main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=data_stream, extensions=extensions) main_loop.run()
def main(config): vocab_src, _ = text_to_dict([config['train_src'], config['dev_src'], config['test_src']]) vocab_tgt, cabvo = text_to_dict([config['train_tgt'], config['dev_tgt']]) # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0], [1, 4, 8, 4, 8, 4, 8],] source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1],] target_sentence.tag.test_value = [[0,1,1,5], [2,0,1,0],] target_sentence_mask.tag.test_value = [[0,1,1,0], [1,1,1,0],] logger.info('Building RNN encoder-decoder') ### Building Encoder embedder = LookupTable( length=len(vocab_src), dim=config['embed_src'], weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='embedder') transformer = Linear( config['embed_src'], config['hidden_src']*4, weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='transformer') lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src']) encoder = Bidirectional( LSTM( dim=config['hidden_src'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit)), name='encoderBiLSTM' ) encoder.prototype.weights_init = Orthogonal() ### Building Decoder lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']) transition = LSTM2GO( attended_dim=config['hidden_tgt'], dim=config['hidden_tgt'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit), name='decoderLSTM') attention = SequenceContentAttention( state_names=transition.apply.states, # default activation is Tanh state_dims=[config['hidden_tgt']], attended_dim=config['hidden_src']*2, match_dim=config['hidden_tgt'], name="attention") readout = Readout( source_names=['states', 'feedback', attention.take_glimpses.outputs[0]], readout_dim=len(vocab_tgt), emitter = SoftmaxEmitter( name='emitter'), feedback_brick = LookupFeedback( num_outputs=len(vocab_tgt), feedback_dim=config['embed_tgt'], name='feedback'), post_merge=InitializableFeedforwardSequence([ Bias(dim=config['hidden_tgt'], name='softmax_bias').apply, Linear(input_dim=config['hidden_tgt'], output_dim=config['embed_tgt'], use_bias=False, name='softmax0').apply, Linear(input_dim=config['embed_tgt'], name='softmax1').apply]), merged_dim=config['hidden_tgt']) decoder = SequenceGenerator( readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator", fork=Fork( [name for name in transition.apply.sequences if name != 'mask'], prototype=Linear()), add_contexts=True) decoder.transition.weights_init = Orthogonal() #printchildren(encoder, 1) # Initialize model logger.info('Initializing model') embedder.initialize() transformer.initialize() encoder.initialize() decoder.initialize() # Apply model embedded = embedder.apply(source_sentence) tansformed = transformer.apply(embedded) encoded = encoder.apply(tansformed)[0] generated = decoder.generate( n_steps=2*source_sentence.shape[1], batch_size=source_sentence.shape[0], attended = encoded.dimshuffle(1,0,2), attended_mask=tensor.ones(source_sentence.shape).T ) print 'Generated: ', generated # generator_generate_outputs #samples = generated[1] # For GRU samples = generated[2] # For LSTM samples.name = 'samples' 
#samples_cost = generated[4] # For GRU samples_cost = generated[5] # For LSTM samples_cost = 'sampling_cost' cost = decoder.cost( mask = target_sentence_mask.T, outputs = target_sentence.T, attended = encoded.dimshuffle(1,0,2), attended_mask = source_sentence_mask.T) cost.name = 'target_cost' cost.tag.aggregation_scheme = TakeLast(cost) model = Model(cost) logger.info('Creating computational graph') cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) ######## # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) printchildren(embedder, 1) printchildren(transformer, 1) printchildren(encoder, 1) printchildren(decoder, 1) # Print parameter names # enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) # enc_dec_param_dict = merge(Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict))) ########## # Training data train_stream = get_train_stream(config, [config['train_src'],], [config['train_tgt'],], vocab_src, vocab_tgt) dev_stream = get_dev_stream( [config['dev_src'],], [config['dev_tgt'],], vocab_src, vocab_tgt) test_stream = get_test_stream([config['test_src'],], vocab_src) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), ProgressBar(), TrainingDataMonitoring([cost], prefix="tra", after_batch=True), DataStreamMonitoring(variables=[cost], data_stream=dev_stream, prefix="dev", after_batch=True), Sampler( model=Model(samples), data_stream=dev_stream, vocab=cabvo, saveto=config['saveto']+'dev', every_n_batches=config['save_freq']), Sampler( model=Model(samples), data_stream=test_stream, vocab=cabvo, saveto=config['saveto']+'test', after_n_batches=1, on_resumption=True, before_training=True), Plotter(saveto=config['saveto'], after_batch=True), Printing(after_batch=True), Checkpoint( path=config['saveto'], parameters = cg.parameters, save_main_loop=False, every_n_batches=config['save_freq'])] if BOKEH_AVAILABLE: Plot('Training cost', channels=[['target_cost']], after_batch=True) if config['reload']: extensions.append(Load(path=config['saveto'], load_iteration_state=False, load_log=False)) else: with open(config['saveto']+'.txt', 'w') as f: pass # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=model, algorithm=algorithm, data_stream=train_stream, extensions=extensions) main_loop.run()
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') y = tensor.imatrix('targets') x_int = x.astype(dtype='int32').T train_dataset = IMDB() idx_sort = numpy.argsort( [len(s) for s in train_dataset.indexables[ train_dataset.sources.index('features')]] ) n_voc = len(train_dataset.dict.keys()) for idx in xrange(len(train_dataset.sources)): train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort] n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=4 * n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = LSTM( dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=1, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T) rnn_out = rnn.apply(embedding) rnn_out_mean_pooled = rnn_out[0][-1] probs = Sigmoid().apply( score_layer.apply(rnn_out_mean_pooled)) cost = - (y * tensor.log(probs) + (1 - y) * tensor.log(1 - probs) ).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5) ).mean() misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule( components=[StepClipping(threshold=10.), Adam() ] ) ) n_train = int(numpy.floor(.8 * train_dataset.num_examples)) n_valid = int(numpy.floor(.1 * train_dataset.num_examples)) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(n_train), batch_size=10, ) ), mask_sources=('features',) ) valid_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(n_train, n_train + n_valid), batch_size=10, ) ), mask_sources=('features',) ) test_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(n_train + n_valid, train_dataset.num_examples), batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(DataStreamMonitoring( [cost, misclassification], test_data_stream, prefix='test')) extensions.append(DataStreamMonitoring( [cost, misclassification], valid_data_stream, prefix='valid')) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification', 'valid_cost', 'valid_misclassification']], titles=['Costs'])) extensions.append(PlotManager('IMDB classification example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def _embed(self, sample_num, dim, name, *args, **kwargs):
    embed = LookupTable(sample_num, dim, name=name)
    embed.weights_init = IsotropicGaussian(std=1 / numpy.sqrt(dim))
    embed.initialize()
    return embed
def main(mode, save_path, num_batches, from_dump): if mode == "train": # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Data processing pipeline data_stream = DataStreamMapping( mapping=lambda data: tuple(array.T for array in data), data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=lambda data: len(data[0]) <= 100, data_stream=OneBillionWord( "training", [99], char2code, level="character", preprocess=str.lower).get_default_stream()))))) # Build the model chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") encoder = Bidirectional(GatedRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) encoder.initialize() fork = Fork([ name for name in encoder.prototype.apply.sequences if name != 'mask' ], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.fork_dims = {name: dimension for name in fork.fork_names} fork.initialize() lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) lookup.initialize() transition = Transition(activation=Tanh(), dim=dimension, attended_dim=2 * dimension, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback( readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() bricks = [encoder, fork, lookup, generator] # Give an idea of what's going on params = Selector(bricks).get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Build the cost computation graph batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) energies = unpack(VariableFilter(application=readout.readout, name="output")(cg.variables), singleton=True) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy(activations.mean(), "mean_activation") # Define the training algorithm. 
algorithm = GradientDescent(cost=cost, step_rule=CompositeRule([ GradientClipping(10.0), SteepestDescent(0.01) ])) observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) main_loop = MainLoop( model=bricks, data_stream=data_stream, algorithm=algorithm, extensions=([LoadFromDump(from_dump)] if from_dump else []) + [ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), TrainingDataMonitoring( observables, prefix="average", every_n_batches=10), FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", lambda log: math.isnan( log.current_row.total_gradient_norm)), Plot(os.path.basename(save_path), [["average_" + cost.name], ["average_" + cost_per_character.name]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": with open(save_path, "rb") as source: encoder, fork, lookup, generator = dill.load(source) logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) sample_function = ComputationGraph(generated).get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=lambda tuple_: -tuple_[0]) for _, message in messages: print(message)
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt')) # one directional LSTM encoding q_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='q_lstm_in') q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm') c_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='c_lstm_in') c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm') bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins] q_tmp = q_lstm_ins.apply(embed.apply(question)) c_tmp = c_lstm_ins.apply(embed.apply(context)) q_hidden, _ = q_lstm.apply(q_tmp, mask=question_mask.astype( theano.config.floatX)) # lq, bs, dim c_hidden, _ = c_lstm.apply(c_tmp, mask=context_mask.astype( theano.config.floatX)) # lc, bs, dim # Attention mechanism Bilinear question attention_question = Linear(input_dim=config.pre_lstm_size, output_dim=config.pre_lstm_size, name='att_question') bricks += [attention_question] att_weights_question = q_hidden[ None, :, :, :] * attention_question.apply( c_hidden.reshape( (c_hidden.shape[0] * c_hidden.shape[1], c_hidden.shape[2]))).reshape( (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))[:, None, :, :] # --> lc,lq,bs,dim att_weights_question = att_weights_question.sum( axis=3) # sum over axis 3 -> dimensions --> lc,lq,bs att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,bs,lq att_weights_question = att_weights_question.reshape( (att_weights_question.shape[0] * att_weights_question.shape[1], att_weights_question.shape[2])) # --> lc*bs,lq att_weights_question = tensor.nnet.softmax( att_weights_question ) # softmax over axis 1 -> length of question # --> lc*bs,lq att_weights_question = att_weights_question.reshape( (c_hidden.shape[0], q_hidden.shape[1], q_hidden.shape[0])) # --> lc,bs,lq att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,lq,bs question_context_attention = att_weights_question.dimshuffle(2, 1, 0) question_context_attention.name = "question_context_attention" self.analyse_vars = [question_context_attention] attended_question = tensor.sum( q_hidden[None, :, :, :] * att_weights_question[:, :, :, None], axis=1) # sum over axis 1 -> length of question --> lc,bs,dim attended_question.name = 'attended_question' # Match LSTM cqembed = tensor.concatenate([c_hidden, attended_question], axis=2) mlstms, mhidden_list = make_bidir_lstm_stack( cqembed, 2 * config.pre_lstm_size, context_mask.astype(theano.config.floatX), config.match_lstm_size, config.match_skip_connections, 'match') bricks = bricks + mlstms if config.match_skip_connections: 
menc_dim = 2 * sum(config.match_lstm_size) menc = tensor.concatenate(mhidden_list, axis=2) else: menc_dim = 2 * config.match_lstm_size[-1] menc = tensor.concatenate(mhidden_list[-2:], axis=2) menc.name = 'menc' #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_b1'], BIAS) add_role(tparams['ptr_b2'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, menc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, ans_indices, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0 probs += off # probs_printed = theano.printing.Print('this is probs')(probs) cost = -tensor.log(probs) cost *= ans_indices_mask cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, mhidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' # self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
source_path = 'dataset/normalized_syllables_rhythm_notes.json-seqlen-30.hdf5'
train_dataset = T_H5PYDataset(source_path, which_sets=('train',))

hidden_layer_dim = 1000

x = tensor.lmatrix('syllables')
y = tensor.lmatrix('durations')

lookup_input = LookupTable(name='lookup_input',
                           length=train_dataset.syllables_vocab_size() + 1,
                           dim=hidden_layer_dim,
                           weights_init=initialization.Uniform(width=0.01),
                           biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(name='linear_input',
                      input_dim=hidden_layer_dim,
                      output_dim=hidden_layer_dim,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') y = tensor.imatrix('targets') x_int = x.astype(dtype='int32').T - 2 train_dataset = IMDB() idx_sort = numpy.argsort( [len(s) for s in train_dataset.indexables[ train_dataset.sources.index('features')]] ) n_voc = len(train_dataset.dict.keys()) for idx in xrange(len(train_dataset.sources)): train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort] n_h = 10 linear_embedding = LookupTable( length=n_voc, dim=4 * n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = Bidirectional(LSTM( dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) )) rnn.initialize() score_layer = Linear( input_dim=2*n_h, output_dim=1, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T) rnn_out = rnn.apply(embedding) rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0) probs = Sigmoid().apply( score_layer.apply(rnn_out_mean_pooled)) cost = - (y * tensor.log(probs) + (1 - y) * tensor.log(1 - probs) ).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5) ).mean() misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule( components=[StepClipping(threshold=10.), Adam() ] ) ) n_train = int(numpy.floor(.8 * train_dataset.num_examples)) n_valid = int(numpy.floor(.1 * train_dataset.num_examples)) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(100), batch_size=10, ) ), mask_sources=('features',) ) valid_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(100, 110), batch_size=10, ) ), mask_sources=('features',) ) test_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=range(110, 120), batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(DataStreamMonitoring( [cost, misclassification], test_data_stream, prefix='test')) extensions.append(DataStreamMonitoring( [cost, misclassification], valid_data_stream, prefix='valid')) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification', 'valid_cost', 'valid_misclassification']], titles=['Costs'])) extensions.append(PlotManager('IMDB classification example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
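# --- Illustrative sketch with toy data, not the model above: the snippet
# mean-pools rnn_out[0] over time without consulting the mask, so padded steps
# leak into the average. A mask-aware mean pool over (time, batch, hidden) states:
import numpy as np

n_steps, batch, hid = 5, 2, 3
rng = np.random.RandomState(1)
hidden = rng.randn(n_steps, batch, hid)        # stand-in for the RNN hidden states
mask = np.array([[1, 1, 1, 0, 0],              # valid steps per example (batch, time)
                 [1, 1, 1, 1, 1]], dtype='float64').T   # -> (time, batch)

masked = hidden * mask[:, :, None]                        # zero out padded steps
pooled = masked.sum(axis=0) / mask.sum(axis=0)[:, None]   # (batch, hidden)
print(pooled.shape)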
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = SimpleRecurrent( dim=n_h, activation=Tanh(), weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=Adam() ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) def sampling_step(g_noise, states, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states = rnn.apply(inputs=embedding_step, states=states, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_samples [_, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
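# --- Illustrative sketch with toy data: sampling_step above relies on the
# Gumbel-max trick, i.e. argmax(log p + g) with g ~ Gumbel(0, 1) is an exact
# sample from Categorical(p). A quick numpy check of that identity:
import numpy as np

rng = np.random.RandomState(18032015)
probs = np.array([0.1, 0.6, 0.3])
u = rng.uniform(size=(50000, probs.size))
gumbel_noise = -np.log(-np.log(u))                       # Gumbel(0, 1) noise
samples = (np.log(probs)[None, :] + gumbel_noise).argmax(axis=-1)
print(np.bincount(samples) / float(samples.size))        # approximately [0.1, 0.6, 0.3]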
train_dataset = MyDataset(source_path, which_sets=('train',))

hidden_layer_dim = 1000

x = tensor.lmatrix('x')
y = tensor.lmatrix('y')

lookup_input = LookupTable(
    name='lookup_input',
    length=charset_size + 1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()
def create_model(self, symbols_num = 500): hidden_states = self.args.encoder_hidden_dims embedding_dims = self.args.source_embeddings_dim # dimensions of sequence embeddings that are created bz bidir net, so the dimensionality is two times dim of a single net thought_dim = hidden_states * 2 #query_dims = self.args.recurrent_stack_depth * self.args.encoder_hidden_dims # batch X input symbols context = tt.lmatrix('context') context_mask = tt.matrix('context_mask') context_mask = decorate(context_mask, "context_mask",level=1) # batch X output symbols x = tt.lmatrix('question') x_mask = tt.matrix('question_mask') # answer ix for each example in the batch y = tt.lmatrix('answer') # candidate answer words for each example, batch X candidate words (10 per each example) candidates_bi = tt.lmatrix("candidates") candidates_bi_mask = tt.matrix("candidates_mask") # TODO y can contain long sequences, here we use just the first symbol of each answer (that is possibly longer) # this have to be adjusted when response can be a sequence and not only a symbol y = decorate(y, "output") y = y[:,0] ################### # create model parts ################### lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2)) context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states) question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states) # inits lookup.initialize() #rnn.initialize() ################### # wire the model together ################### context = decorate(context, "CONTEXT",1) context_embedding_tbf = lookup.apply(context.T) #memory_encoded_btf = rnn.apply(context_embedding_tbf[:,0,:])[1] # use cells memory_encoded_btf = context_encoder.apply(context_embedding_tbf.T,context_mask).dimshuffle(1,0,2) memory_encoded_btf.name = "memory_encoded_btf" memory_encoded_btf = decorate(memory_encoded_btf,"MEM ENC") # batch X features x = decorate(x,"X") x_embedded_btf = lookup.apply(x.T) x_embedded_btf = decorate(x_embedded_btf,"QUESTION EMB") x_encoded_btf = question_encoder.apply(x_embedded_btf.T, x_mask).dimshuffle(1,0,2) x_last = x_encoded_btf[-1] # extract forward rnn that is the first in bidir encoder x_encoded_btf = decorate(x_encoded_btf,"QUESTION ENC") x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states] x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2] query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1) # bidirectional representation of question is used as the search key search_key = query_representation_bf #search_key = x_last #search_key = W_um.apply(x_encoded) search_key = decorate(search_key,"SEARCH KEY") mem_attention_pre = tt.batched_dot(search_key, memory_encoded_btf.dimshuffle(0,2,1)) mem_attention_pre = decorate(mem_attention_pre,"ATT presoftmax") # use masking on attention, this might be unnecessary but we do it just to be sure mem_attention_pre_masked_bt = tt.mul(mem_attention_pre,context_mask) mem_attention_pre_masked_bt = decorate(mem_attention_pre_masked_bt,"ATT presoftmax masked") #mem_attention_bt = Softmax(name="memory_query_softmax").apply(mem_attention_pre_masked_bt) mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_pre_masked_bt,context_mask) mem_attention_bt = decorate(mem_attention_bt,"ATT",level=2) # compute weighted attention over original word vectors att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2)) #use mask to remove the 
probability mass from the unmasked candidates #word_probs_bi = word_probs_bi * candidates_bi_mask # compare desired response to all candidate responses # select relevant candidate answer words candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1) # convert it to output symbol probabilities y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi) y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask) y_hat.name = "y_hat" y_hat = decorate(y_hat,"y_hat",level=2) # the correct answer is always the first among the candidates, so we can use zeros as index of ground truth y = y.zeros_like() # cost associated with prediction error cost_prediction = CategoricalCrossEntropy().apply(y.flatten(), y_hat) cost_prediction.name = "cost_prediction" cost = cost_prediction attention_cost_weight = None cost_attention = None cost.name = "cost" predicted_response_index = tt.argmax(y_hat,axis=1) accuracy = tt.eq(y,predicted_response_index).mean() accuracy.name = "accuracy" return cost, accuracy, mem_attention_bt, y_hat, attention_cost_weight, cost_prediction, cost_attention, context, candidates_bi, candidates_bi_mask, y, context_mask, x, x_mask
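# --- Illustrative sketch with a hypothetical helper and toy data:
# SoftmaxWithMask is project-specific; the usual way to softmax over only the
# unmasked positions is to exponentiate, zero out the padding and renormalise.
import numpy as np

def masked_softmax(scores, mask):
    # scores, mask: (batch, length); mask is 1 on valid positions, 0 on padding
    scores = scores - scores.max(axis=1, keepdims=True)   # numerical stability
    e = np.exp(scores) * mask                             # padded positions get zero weight
    return e / (e.sum(axis=1, keepdims=True) + 1e-8)

scores = np.array([[2.0, 1.0, -1.0, 0.0]])
mask = np.array([[1.0, 1.0, 1.0, 0.0]])
print(masked_softmax(scores, mask))                       # last position gets no attention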
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') better = tensor.imatrix('better') better_mask = tensor.imatrix('better_mask') worse = tensor.imatrix('worse') worse_mask = tensor.imatrix('worse_mask') b_left = tensor.imatrix('b_left') b_left_mask = tensor.imatrix('b_left_mask') b_right = tensor.imatrix('b_right') b_right_mask = tensor.imatrix('b_right_mask') w_left = tensor.imatrix('w_left') w_left_mask = tensor.imatrix('w_left_mask') w_right = tensor.imatrix('w_right') w_right_mask = tensor.imatrix('w_right_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) better = better.dimshuffle(1, 0) better_mask = better_mask.dimshuffle(1, 0) worse = worse.dimshuffle(1, 0) worse_mask = worse_mask.dimshuffle(1, 0) b_left = b_left.dimshuffle(1, 0) b_left_mask = b_left_mask.dimshuffle(1, 0) b_right = b_right.dimshuffle(1, 0) b_right_mask = b_right_mask.dimshuffle(1, 0) w_left = w_left.dimshuffle(1, 0) w_left_mask = w_left_mask.dimshuffle(1, 0) w_right = w_right.dimshuffle(1, 0) w_right_mask = w_right_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # candidate encoders candidates_hidden_list = [] candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0') candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0') candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0') candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins] #computing better encoding better_embed = embed.apply(better) better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed) better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed) better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX)) better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1]) better_hidden_list = [better_fwd_hidden, better_bwd_hidden] better_enc_dim = 2*sum(config.ctx_lstm_size) better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_enc.name = 'better_enc' candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden] #computing worse encoding worse_embed = embed.apply(worse) 
worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed) worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed) worse_fwd_hidden, _ = candidate_fwd_lstm.apply(worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX)) worse_bwd_hidden, _ = candidate_bwd_lstm.apply(worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1]) worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden] worse_enc_dim = 2*sum(config.ctx_lstm_size) worse_enc = tensor.concatenate([h[-1,:,:] for h in worse_hidden_list], axis=1) worse_enc.name = 'worse_enc' candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden] #left encoders left_context_hidden_list = [] left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_fwd_lstm_in_0_0') left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_fwd_lstm_0') left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_bwd_lstm_in_0_0') left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm, left_context_fwd_lstm_ins, left_context_bwd_lstm_ins] #right encoders right_context_hidden_list = [] right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_fwd_lstm_in_0_0') right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_fwd_lstm_0') right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_bwd_lstm_in_0_0') right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm, right_context_fwd_lstm_ins, right_context_bwd_lstm_ins] #left half encodings better_left_embed = embed.apply(b_left) better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed) better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed) better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX)) better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1]) better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden] better_left_enc_dim = 2*sum(config.ctx_lstm_size) better_left_enc = tensor.concatenate([h[-1,:,:] for h in better_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_left_enc.name = 'better_left_enc' left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden] worse_left_embed = embed.apply(w_left) worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed) worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed) worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX)) worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1]) worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden] worse_left_enc_dim = 2*sum(config.ctx_lstm_size) worse_left_enc = tensor.concatenate([h[-1,:,:] 
for h in worse_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_left_enc.name = 'worse_left_enc' left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden] #right half encoding better_right_embed = embed.apply(b_right) better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed) better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed) better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX)) better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1]) better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden] better_right_enc_dim = 2*sum(config.ctx_lstm_size) better_right_enc = tensor.concatenate([h[-1,:,:] for h in better_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_right_enc.name = 'better_right_enc' right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden] worse_right_embed = embed.apply(w_right) worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed) worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed) worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX)) worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1]) worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden] worse_right_enc_dim = 2*sum(config.ctx_lstm_size) worse_right_enc = tensor.concatenate([h[-1,:,:] for h in worse_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_right_enc.name = 'worse_right_enc' right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden] # F1 prediction MLP prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1], activations=config.prediction_mlp_activations[1:] + [Identity()], name='prediction_mlp') prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, name='preq') prediction_cand_linear = Linear(input_dim=worse_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='precand') prediction_left_half_linear = Linear(input_dim=better_left_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preleft') prediction_right_half_linear = Linear(input_dim=better_right_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preright') bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear, prediction_left_half_linear, prediction_right_half_linear] better_layer1 = Tanh('tan1').apply(tensor.concatenate([prediction_cand_linear.apply(better_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(better_left_enc), prediction_right_half_linear.apply(better_right_enc)],axis=1)) better_layer1.name = 'better_layer1' worse_layer1 = Tanh('tan2').apply(tensor.concatenate([prediction_cand_linear.apply(worse_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(worse_left_enc), prediction_right_half_linear.apply(worse_right_enc)],axis=1)) worse_layer1.name = 'worse_layer1' better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1)) #batch_size worse_pred_weights = 
Tanh('rec2').apply(prediction_mlp.apply(worse_layer1)) #batch_size # numpy.set_printoptions(edgeitems=500) # better_pred_weights = theano.printing.Print('better')(better_pred_weights) # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights) # #cost : max(0,- score-better + score-worse + margin) margin = config.margin conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(theano.config.floatX) self.predictions = conditions cost = (-better_pred_weights + worse_pred_weights + margin) * conditions cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
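# --- Illustrative sketch with toy scores: the cost above is a margin ranking
# (hinge) loss, max(0, margin - score_better + score_worse) averaged over the
# batch; the `conditions` indicator keeps only the pairs that violate the margin.
import numpy as np

margin = 0.5                                     # hypothetical value of config.margin
better = np.array([2.0, 0.1, 1.0])               # scores of the preferred candidates
worse = np.array([1.0, 0.4, 0.9])                # scores of the dispreferred candidates
violated = (better < worse + margin).astype('float64')
cost = ((-better + worse + margin) * violated).mean()
print(cost)   # identical to np.maximum(0.0, margin - better + worse).mean()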
VOCAB_DIM = dataset.vocabulary_size
print "vocab size:", VOCAB_DIM
EMBEDDING_DIM = 100

Xs = tensor.imatrix("context")
y = tensor.ivector('center')

w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)

hidden = tensor.mean(w1.apply(Xs), axis=1)
y_hat = Softmax().apply(w2.apply(hidden))

w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
w1.biases_init = w2.biases_init = Constant(0)
w1.initialize()
w2.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum()
cost.name = "loss"

#
# the actual training of the model
#
main = MainLoop(
    data_stream=DataStream.default_stream(dataset,
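# --- Illustrative sketch with toy shapes: the model above is CBOW-style; the
# context words are embedded, averaged, and pushed through a linear + softmax
# layer to predict the centre word. All names and sizes below are hypothetical.
import numpy as np

VOCAB, EMB, BATCH, WINDOW = 10, 4, 2, 3
rng = np.random.RandomState(2)
W1 = rng.randn(VOCAB, EMB) * 0.01                # lookup table, one row per word
W2 = rng.randn(EMB, VOCAB) * 0.01                # output projection
Xs = rng.randint(0, VOCAB, size=(BATCH, WINDOW)) # context word ids

hidden = W1[Xs].mean(axis=1)                     # (batch, EMB): mean of context embeddings
logits = hidden.dot(W2)
y_hat = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(y_hat.shape)                               # (2, 10): distribution over the centre word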
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') context_bag = tensor.eq(context[:, :, None], tensor.arange(vocab_size)).sum(axis=1).clip( 0, 1) bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') # embed.weights_init = Constant(embeddings_initial_value) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' #embed size: 200, lstm_size = 256 #qenc: length * batch_size * (2*lstm_size) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate( [ cembed, tensor.extra_ops.repeat( qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2 ) #length * batch_size * (embed+2*lstm_size) this is what goes into encoder clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' #cenc: length * batch_size * (2*lstm_size) #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() # for p in tparams.values(): # add_role(p, WEIGHT) # self.theano_params.append(p) #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, cenc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), 
tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, ans_indices, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0 probs += off # probs_printed = theano.printing.Print('this is probs')(probs) cost = -tensor.log(probs) cost *= ans_indices_mask cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
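# --- Illustrative sketch with toy data: context_bag above turns each context
# row into a 0/1 bag-of-words vector over the vocabulary by comparing every
# token id against arange(vocab_size), summing over positions and clipping at 1.
import numpy as np

vocab_size = 6
context = np.array([[1, 3, 3, 0],                # (batch, length) token ids
                    [2, 2, 5, 5]])
context_bag = (context[:, :, None] == np.arange(vocab_size)).sum(axis=1).clip(0, 1)
print(context_bag)
# [[1 1 0 1 0 0]
#  [0 0 1 0 0 1]]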
[attention.take_glimpses.outputs[0]], emitter=emitter, name="readout") generator = SequenceGenerator(readout=readout, attention=attention, transition=transition, name="generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.001) generator.push_initialization_config() lookup.weights_init = IsotropicGaussian(0.01) lookup.biases_init = Constant(0.001) lookup.initialize() #generator.transition.weights_init = initialization.Identity(0.98) #generator.transition.biases_init = IsotropicGaussian(0.01,0.9) generator.transition.push_initialization_config() generator.initialize() cost_matrix = generator.cost_matrix(x, x_mask, attended=embed, attended_mask=context_mask) cost = cost_matrix.sum(axis=0).mean() cost.name = "nll" cg = ComputationGraph(cost) model = Model(cost)
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP fwd attention_mlp_fwd = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_fwd') attention_qlinear_fwd = Linear( input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_fwd') attention_clinear_fwd = Linear( input_dim=cenc_dim / 2, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_fwd') bricks += [ attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd ] layer1_fwd = Tanh(name='tanh_fwd') layer1_fwd = layer1_fwd.apply( attention_clinear_fwd.apply(cenc[:, :, :cenc_dim / 2].reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2] / 2))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_fwd.apply(qenc)[None, :, :]) att_weights_fwd = attention_mlp_fwd.apply( layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1], layer1_fwd.shape[2]))) att_weights_fwd = att_weights_fwd.reshape( (layer1_fwd.shape[0], layer1_fwd.shape[1])) att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T) att_weights_fwd.name = 'att_weights_fwd' attended_fwd = tensor.sum(cenc[:, :, :cenc_dim / 2] * att_weights_fwd.T[:, :, None], axis=0) attended_fwd.name = 'attended_fwd' # Attention mechanism MLP bwd attention_mlp_bwd = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_bwd') attention_qlinear_bwd = Linear( input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_bwd') attention_clinear_bwd = Linear( input_dim=cenc_dim / 2, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_bwd') bricks += [ attention_mlp_bwd, attention_qlinear_bwd, 
attention_clinear_bwd ] layer1_bwd = Tanh(name='tanh_bwd') layer1_bwd = layer1_bwd.apply( attention_clinear_bwd.apply(cenc[:, :, cenc_dim / 2:].reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2] / 2))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_bwd.apply(qenc)[None, :, :]) att_weights_bwd = attention_mlp_bwd.apply( layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1], layer1_bwd.shape[2]))) att_weights_bwd = att_weights_bwd.reshape( (layer1_bwd.shape[0], layer1_bwd.shape[1])) att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T) att_weights_bwd.name = 'att_weights_bwd' attended_bwd = tensor.sum(cenc[:, :, cenc_dim / 2:] * att_weights_bwd.T[:, :, None], axis=0) attended_bwd.name = 'attended_bwd' ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc], axis=1) ctx_question.name = 'ctx_question' answer_bag = to_bag(answer, vocab_size) answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0) relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX) def createSequences(j, index, c_enc, c_enc_dim, c_context, c_window_size): sequence = tensor.concatenate([ c_context[j:j + index, :], tensor.zeros((c_window_size - index, c_context.shape[1])) ], axis=0) enc = tensor.concatenate([ c_enc[j + index - 1, :, :], c_enc[j, :, :-1], tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1)) ], axis=1) return enc, sequence def createTargetValues(j, index, c_context, c_vocab_size): sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size) sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0) selected_items = sequence_bag.sum(axis=1, dtype=theano.config.floatX) tp = (sequence_bag * answer_bag).sum(axis=1, dtype=theano.config.floatX) precision = tp / (selected_items + 0.00001) recall = tp / (relevant_items + 0.00001) #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0) #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0) macroF1 = (2 * (precision * recall)) / (precision + recall + 0.00001) #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0) return macroF1 window_size = 3 senc = [] sequences = [] pred_targets = [] for i in range(1, window_size + 1): (all_enc, all_sequence), _ = theano.scan( fn=createSequences, sequences=tensor.arange(cenc.shape[0] - i + 1), non_sequences=[i, cenc, cenc_dim, context, window_size]) (all_macroF1), _ = theano.scan( fn=createTargetValues, sequences=tensor.arange(cenc.shape[0] - i + 1), non_sequences=[i, context, vocab_size]) senc.append(all_enc) sequences.append(all_sequence) pred_targets.append(all_macroF1) senc = tensor.concatenate(senc, axis=0) sequences = tensor.concatenate(sequences, axis=0) pred_targets = tensor.concatenate(pred_targets, axis=0) # F1 prediction Bilinear prediction_linear = Linear(input_dim=2 * cenc_dim, output_dim=cenc_dim + qenc_dim, name='pred_linear') bricks += [prediction_linear] pred_weights = ctx_question[None, :, :] * prediction_linear.apply( senc.reshape( (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape( (senc.shape[0], senc.shape[1], senc.shape[2])) pred_weights = pred_weights.sum(axis=2) pred_weights = tensor.nnet.sigmoid(pred_weights.T).T pred_weights.name = 'pred_weights' pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001) pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001) #numpy.set_printoptions(edgeitems=500) #pred_targets = theano.printing.Print('pred_targets')(pred_targets) #pred_weights = theano.printing.Print('pred_weights')(pred_weights) cost = 
tensor.nnet.binary_crossentropy(pred_weights, pred_targets).mean() self.predictions = sequences[pred_weights.argmax(axis=0), :, tensor.arange(sequences.shape[2])].T # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
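# --- Illustrative sketch with toy bags: createTargetValues above scores each
# candidate window by the F1 overlap between its bag-of-words and the answer's
# bag-of-words, with a small epsilon guarding against division by zero.
import numpy as np

eps = 1e-5
answer_bag = np.array([0, 1, 1, 0, 1, 0], dtype='float64')     # gold answer tokens
sequence_bag = np.array([0, 1, 0, 0, 1, 1], dtype='float64')   # tokens inside the window

tp = (sequence_bag * answer_bag).sum()           # tokens shared by window and answer
precision = tp / (sequence_bag.sum() + eps)
recall = tp / (answer_bag.sum() + eps)
macro_f1 = 2 * precision * recall / (precision + recall + eps)
print(precision, recall, macro_f1)               # ~0.667, ~0.667, ~0.667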
def main(): x = T.imatrix('features') m = T.matrix('features_mask') y = T.imatrix('targets') #x_int = x.astype(dtype='int32').T x_int = x.T train_dataset = IMDB('train') n_voc = len(train_dataset.dict.keys()) n_h = 2 lookup = LookupTable( length=n_voc+2, dim = n_h*4, weights_init = Uniform(std=0.01), biases_init = Constant(0.) ) lookup.initialize() #rnn = SimpleRecurrent( #dim = n_h, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) rnn = LSTM( dim = n_h, activation=Tanh(), weights_init = Uniform(std=0.01), biases_init = Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim = n_h, output_dim = 1, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) score_layer.initialize() embedding = lookup.apply(x_int) * T.shape_padright(m.T) #embedding = lookup.apply(x_int) + m.T.mean()*0 #embedding = lookup.apply(x_int) + m.T.mean()*0 rnn_states = rnn.apply(embedding, mask=m.T) #rnn_states, rnn_cells = rnn.apply(embedding) rnn_out_mean_pooled = rnn_states[-1] #rnn_out_mean_pooled = rnn_states.mean() probs = Sigmoid().apply( score_layer.apply(rnn_out_mean_pooled)) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), Adam(), #AdaDelta(), ]) ) # ======== test_dataset = IMDB('test') batch_size = 64 n_train = train_dataset.num_examples train_stream = DataStream( dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) train_padded = Padding( data_stream=train_stream, mask_sources=('features',) #mask_sources=[] ) test_stream = DataStream( dataset=test_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) test_padded = Padding( data_stream=test_stream, mask_sources=('features',) #mask_sources=[] ) #import ipdb #ipdb.set_trace() #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_padded, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) main_loop = MainLoop( model=model, data_stream=train_padded, algorithm=algorithm, extensions=extensions) main_loop.run()
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply( attention_clinear.apply( cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2] ))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply( layer1.reshape( (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))) att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights = tensor.nnet.sigmoid(att_weights.T).T att_weights.name = 'att_weights' att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() self.predictions = tensor.gt(att_weights, 0.1) * context # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # 
Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
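# --- Illustrative sketch with toy, batch-major data (the model above works
# time-major): att_target marks every context position whose token occurs
# somewhere in the answer, and the cost is a mask-weighted binary cross-entropy
# between the predicted attention weights and that indicator.
import numpy as np

context = np.array([[4, 7, 2, 7, 0]])            # (batch, length) token ids
answer = np.array([[7, 9]])                      # (batch, answer length)
att_target = (context[:, :, None] == answer[:, None, :]).sum(axis=2).clip(0, 1)
print(att_target)                                # [[0 1 0 1 0]]

att_weights = np.array([[0.1, 0.8, 0.2, 0.6, 0.3]])
mask = np.array([[1.0, 1.0, 1.0, 1.0, 0.0]])     # last position is padding
bce = -(att_target * np.log(att_weights) + (1 - att_target) * np.log(1 - att_weights))
cost = (bce * mask).sum() / mask.sum()
print(cost)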
hidden_layer_size = 200

layer1 = LookupTable(name='layer1',
                     length=train_dataset.words_bag_size,
                     dim=hidden_layer_size,
                     weights_init=Uniform(mean=0, std=0.01),
                     biases_init=Constant(0))
act1_mean = tensor.mean(layer1.apply(x), axis=1)

layer2 = Linear(name='layer2',
                input_dim=layer1.output_dim,
                output_dim=train_dataset.words_bag_size,
                weights_init=Uniform(mean=0, std=0.01),
                biases_init=Constant(0))
act2_softmax = Softmax().apply(layer2.apply(act1_mean))

layer1.initialize()
layer2.initialize()

missclass = MisclassificationRate().apply(y, act2_softmax)
cost = CategoricalCrossEntropy().apply(y, act2_softmax)

cg = ComputationGraph([cost])
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.00001 * (W1**2).sum() + 0.00005 * (W2**2).sum()
cost.name = 'cost'

from blocks.algorithms import GradientDescent, Scale
def __init__(self, config, vocab_size): unsorted = tensor.imatrix('unsorted') unsorted_mask = tensor.imatrix('unsorted_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] unsorted = unsorted.dimshuffle(1, 0) unsorted_mask = unsorted_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed unsorted sequence embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) #make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name=''): unsorted_embed = embed.apply(unsorted) unsorted_lstms, unsorted_hidden_list = make_bidir_lstm_stack( unsorted_embed, config.embed_size, unsorted_mask.astype(theano.config.floatX), config.lstm_size, config.match_skip_connections, 'u') #lu,bs,lstm_dim bricks = bricks + unsorted_lstms unsorted_enc_dim = 2 * sum(config.lstm_size) unsorted_enc = tensor.concatenate( unsorted_hidden_list, axis=2) #concatenate fwd & bwd lstm hidden states unsorted_enc.name = 'unsorted_enc' #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_b1'], BIAS) add_role(tparams['ptr_b2'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() #n_steps = length , n_samples = batch_size n_steps = answer.shape[0] n_samples = answer.shape[1] preds, generations = ptr_network( tparams, unsorted_embed, unsorted_mask.astype(theano.config.floatX), answer, answer_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, unsorted_enc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, answer, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 probs += off # probs_printed = theano.printing.Print('probs')(probs) cost = -tensor.log(probs) cost *= answer_mask cost = cost.sum(axis=0) / answer_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, unsorted_hidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
) generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.0) generator.push_initialization_config() generator.transition.biases_init = IsotropicGaussian(0.01, 1) generator.transition.push_initialization_config() generator.initialize() lookup.weights_init = IsotropicGaussian(0.001) lookup.biases_init = Constant(0.0) lookup.initialize() # states = {} states = [state for state in generator.transition.apply.outputs if state != "step"] # ipdb.set_trace() states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states} cost_matrix = generator.cost_matrix(x, attended=context, **states) cost = cost_matrix.mean() + 0.0 * start_flag cost.name = "nll" cg = ComputationGraph(cost)