def __init__(self, input, n_hidden=500, grad_clip=100., only_return_final=True):
    self.input = input
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=initialize_parameters()[1])
    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=initialize_parameters()[1],
                                  nonlinearity=lasagne.nonlinearities.tanh)
    self.output = layers.LSTMLayer(self.input, n_hidden,
                                   ingate=gate_parameters,
                                   forgetgate=gate_parameters,
                                   cell=cell_parameters,
                                   outgate=gate_parameters,
                                   grad_clipping=grad_clip,
                                   only_return_final=only_return_final)
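# A minimal, hedged usage sketch for the constructor above. The enclosing class
# and initialize_parameters() are not shown in this file, so the class name
# (LSTMBlock) and a Constant(0.) bias are stand-ins; the feature size of 40 is
# illustrative. Note that reusing one Gate instance for ingate/forgetgate/outgate
# does not tie their weights: the Gate only holds initializers here, and
# LSTMLayer creates a fresh parameter from them for every gate.
#
# import theano.tensor as T
# import lasagne
# from lasagne import layers
#
# x = T.tensor3('x')                                   # (batch, seq_len, features)
# l_in = layers.InputLayer(shape=(None, None, 40), input_var=x)
# block = LSTMBlock(l_in, n_hidden=500)                # hypothetical wrapper class
# h_last = layers.get_output(block.output)             # symbolic (batch, 500) output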
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers,
             dropout, batch_size):
    l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    l_embed = L.EmbeddingLayer(l_input, vocabulary_size, hidden_size,
                               W=init.Uniform(1.0))
    l_lstms = []
    for i in range(num_layers):
        l_lstm = L.LSTMLayer(
            l_embed if i == 0 else l_lstms[-1], hidden_size,
            ingate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal()),
            forgetgate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(),
                              b=init.Constant(1.0)),
            cell=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(),
                        W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal()))
        l_lstms.append(l_lstm)
    l_drop = L.DropoutLayer(l_lstms[-1], dropout)
    l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
    l_out = L.ReshapeLayer(l_out,
                           (l_out.output_shape[0] * l_out.output_shape[1],
                            l_out.output_shape[2]))
    l_out = L.NonlinearityLayer(l_out,
                                nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
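# Hedged usage sketch for ptb_lstm: the softmax output is flattened to
# (batch_size * seq_len, vocabulary_size), so the next-word targets are flattened
# the same way before the cross-entropy. Hyper-parameter values are illustrative,
# not taken from this file.
import theano
import theano.tensor as T
import lasagne
from lasagne import layers as L

input_var = T.imatrix('input')        # (batch_size, seq_len) word ids
target_var = T.ivector('target')      # flattened next-word ids, length batch*seq

net = ptb_lstm(input_var, vocabulary_size=10000, hidden_size=200, seq_len=20,
               num_layers=2, dropout=0.5, batch_size=32)
probs = L.get_output(net)
loss = lasagne.objectives.categorical_crossentropy(probs, target_var).mean()
params = L.get_all_params(net, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)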
def layer_LSTM(l_hid, hiddensize, nonlinearity, backwards=False,
               grad_clipping=50, name=""):
    '''A custom LSTM layer that seems to converge faster.'''
    ingate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                     W_hid=lasagne.init.Orthogonal(1.0))
    forgetgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                         W_hid=lasagne.init.Orthogonal(1.0))
    outgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                      W_hid=lasagne.init.Orthogonal(1.0))
    cell = ll.Gate(W_cell=None,
                   W_in=lasagne.init.Orthogonal(1.0),
                   W_hid=lasagne.init.Orthogonal(1.0),
                   nonlinearity=nonlinearity)
    # The final nonlinearity should be tanh, otherwise the layer does not
    # converge (why?). By default peepholes=True.
    fwd = ll.LSTMLayer(l_hid, num_units=hiddensize, backwards=backwards,
                       ingate=ingate, forgetgate=forgetgate,
                       outgate=outgate, cell=cell,
                       grad_clipping=grad_clipping,
                       nonlinearity=lasagne.nonlinearities.tanh,
                       name=name)
    return fwd
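# Hedged usage sketch for layer_LSTM: a forward and a backward copy concatenated
# into a simple BiLSTM encoder. `ll` is assumed to alias lasagne.layers, as in
# the function above; the input size of 64 is illustrative.
import theano.tensor as T
import lasagne
from lasagne import layers as ll

x = T.tensor3('x')
l_in = ll.InputLayer(shape=(None, None, 64), input_var=x)
l_fwd = layer_LSTM(l_in, hiddensize=128,
                   nonlinearity=lasagne.nonlinearities.tanh, name='lstm_fwd')
l_bck = layer_LSTM(l_in, hiddensize=128,
                   nonlinearity=lasagne.nonlinearities.tanh,
                   backwards=True, name='lstm_bck')
l_bi = ll.ConcatLayer([l_fwd, l_bck], axis=2)   # (batch, seq_len, 256)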
def integrate_captions(input_var=T.imatrix()):
    '''
    :param input_var: matrix of word indices, shape = (nb_caption, seq_length),
        where nb_caption is the number of captions used per image
    '''
    ###############################
    # Build Network Configuration #
    ###############################
    print('... Integrating captions into the model')

    # Input of the network : shape = (nb_caption, seq_length)
    network = layers.InputLayer(shape=(None, None), input_var=input_var)

    # Embedding layer : shape = (nb_caption, seq_length, 400)
    vocab_length = get_vocab_length()
    network = layers.EmbeddingLayer(network, vocab_length, output_size=400)

    # LSTM layer : shape = (nb_caption, 500)
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=lasagne.init.Constant(0.))
    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=lasagne.init.Constant(0.),
                                  nonlinearity=nonlinearities.tanh)
    network = layers.LSTMLayer(network, num_units=500,
                               ingate=gate_parameters,
                               forgetgate=gate_parameters,
                               cell=cell_parameters,
                               outgate=gate_parameters,
                               grad_clipping=100.,
                               only_return_final=True)

    # Dense layer : shape = (nb_caption, 500)
    network = layers.DenseLayer(network, num_units=500)

    # Reshape layer : shape = (nb_caption, 500, 1, 1)
    network = layers.ReshapeLayer(network, (-1, 500, 1, 1))

    return network
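# Hedged usage sketch: compiling the caption branch above into a standalone
# feature extractor. It assumes get_vocab_length() from this module is available
# and that word ids fit the vocabulary; the (nb_caption, 500, 1, 1) output is
# shaped so it can later be merged with image feature maps (not shown here).
import numpy as np
import theano
import theano.tensor as T
from lasagne import layers

captions = T.imatrix('captions')               # (nb_caption, seq_length) word ids
net = integrate_captions(captions)
feats = layers.get_output(net, deterministic=True)
extract_fn = theano.function([captions], feats)

dummy = np.zeros((2, 15), dtype='int32')       # two captions of length 15
print(extract_fn(dummy).shape)                 # expected: (2, 500, 1, 1)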
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout,
             l2, mode, batch_norm, rnn_num_units, **kwargs):

    print("==> not used params in DMN class:", kwargs.keys())
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    example = np.random.uniform(size=(self.batch_size, 1, 128, 858),
                                low=0.0, high=1.0).astype(np.float32)  #########
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))  #########

    network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
    print(layers.get_output(network).eval({self.input_var: example}).shape)

    # CONV-RELU-POOL 1
    network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7),
                                 stride=1, nonlinearity=rectify)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    if self.batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 2
    network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5),
                                 stride=1, nonlinearity=rectify)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    if self.batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 3
    network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3),
                                 stride=1, nonlinearity=rectify)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    if self.batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 4
    network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3),
                                 stride=1, nonlinearity=rectify)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
    print(layers.get_output(network).eval({self.input_var: example}).shape)
    if self.batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    self.params = layers.get_all_params(network, trainable=True)

    output = layers.get_output(network)
    num_channels = 32
    filter_W = 54
    filter_H = 8
    # NOTE: these constants are shapes of the last pool layer; they could be symbolic,
    # but explicit values are better for optimizations

    channels = []
    for channel_index in range(num_channels):
        channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))

    rnn_network_outputs = []
    W_in_to_updategate = None
    W_hid_to_updategate = None
    b_updategate = None
    W_in_to_resetgate = None
    W_hid_to_resetgate = None
    b_resetgate = None
    W_in_to_hidden_update = None
    W_hid_to_hidden_update = None
    b_hidden_update = None

    W_in_to_updategate1 = None
    W_hid_to_updategate1 = None
    b_updategate1 = None
    W_in_to_resetgate1 = None
    W_hid_to_resetgate1 = None
    b_resetgate1 = None
    W_in_to_hidden_update1 = None
    W_hid_to_hidden_update1 = None
    b_hidden_update1 = None

    for channel_index in range(num_channels):
        rnn_input_var = channels[channel_index]

        # InputLayer
        network = layers.InputLayer(shape=(None, filter_W, filter_H),
                                    input_var=rnn_input_var)

        if channel_index == 0:
            # GRULayer
            network = layers.GRULayer(incoming=network, num_units=self.num_units,
                                      only_return_final=False)
            W_in_to_updategate = network.W_in_to_updategate
            W_hid_to_updategate = network.W_hid_to_updategate
            b_updategate = network.b_updategate
            W_in_to_resetgate = network.W_in_to_resetgate
            W_hid_to_resetgate = network.W_hid_to_resetgate
            b_resetgate = network.b_resetgate
            W_in_to_hidden_update = network.W_in_to_hidden_update
            W_hid_to_hidden_update = network.W_hid_to_hidden_update
            b_hidden_update = network.b_hidden_update

            # BatchNormalization Layer
            if self.batch_norm:
                network = layers.BatchNormLayer(incoming=network)

            # GRULayer
            network = layers.GRULayer(incoming=network, num_units=self.num_units,
                                      only_return_final=True)
            W_in_to_updategate1 = network.W_in_to_updategate
            W_hid_to_updategate1 = network.W_hid_to_updategate
            b_updategate1 = network.b_updategate
            W_in_to_resetgate1 = network.W_in_to_resetgate
            W_hid_to_resetgate1 = network.W_hid_to_resetgate
            b_resetgate1 = network.b_resetgate
            W_in_to_hidden_update1 = network.W_in_to_hidden_update
            W_hid_to_hidden_update1 = network.W_hid_to_hidden_update
            b_hidden_update1 = network.b_hidden_update

            # add params
            self.params += layers.get_all_params(network, trainable=True)

        else:
            # GRULayer, but shared
            network = layers.GRULayer(
                incoming=network, num_units=self.num_units, only_return_final=False,
                resetgate=layers.Gate(W_in=W_in_to_resetgate,
                                      W_hid=W_hid_to_resetgate, b=b_resetgate),
                updategate=layers.Gate(W_in=W_in_to_updategate,
                                       W_hid=W_hid_to_updategate, b=b_updategate),
                hidden_update=layers.Gate(W_in=W_in_to_hidden_update,
                                          W_hid=W_hid_to_hidden_update,
                                          b=b_hidden_update))

            # BatchNormalization Layer
            if self.batch_norm:
                network = layers.BatchNormLayer(incoming=network)

            # GRULayer, but shared
            network = layers.GRULayer(
                incoming=network, num_units=self.num_units, only_return_final=True,
                resetgate=layers.Gate(W_in=W_in_to_resetgate1,
                                      W_hid=W_hid_to_resetgate1, b=b_resetgate1),
                updategate=layers.Gate(W_in=W_in_to_updategate1,
                                       W_hid=W_hid_to_updategate1, b=b_updategate1),
                hidden_update=layers.Gate(W_in=W_in_to_hidden_update1,
                                          W_hid=W_hid_to_hidden_update1,
                                          b=b_hidden_update1))

        rnn_network_outputs.append(layers.get_output(network))

    all_output_var = T.concatenate(rnn_network_outputs, axis=1)
    print(all_output_var.eval({self.input_var: example}).shape)

    # InputLayer
    network = layers.InputLayer(shape=(None, self.num_units * num_channels),
                                input_var=all_output_var)

    # Dropout Layer
    if self.dropout > 0:
        network = layers.dropout(network, self.dropout)

    # BatchNormalization Layer
    if self.batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # Last layer: classification
    network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
    print(layers.get_output(network).eval({self.input_var: example}).shape)

    self.params += layers.get_all_params(network, trainable=True)
    self.prediction = layers.get_output(network)

    # print("==> param shapes", [x.eval().shape for x in self.params])

    self.loss_ce = lasagne.objectives.categorical_crossentropy(
        self.prediction, self.answer_var).mean()
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(
            self.params, lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    # updates = lasagne.updates.adadelta(self.loss, self.params)
    updates = lasagne.updates.momentum(self.loss, self.params,
                                       learning_rate=0.003)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
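# Hedged usage sketch, assuming the constructor above belongs to a class named
# DMN (as its debug print suggests) and that real spectrogram batches and file
# lists are prepared elsewhere; empty lists and zero arrays are stand-ins here.
# Note that construction itself runs the debug shape evaluations above, which
# can take a while.
import numpy as np

model = DMN(train_list_raw=[], test_list_raw=[], png_folder='png/',
            batch_size=16, dropout=0.5, l2=1e-5, mode='train',
            batch_norm=True, rnn_num_units=64)

batch_x = np.zeros((16, 1, 128, 858), dtype=np.float32)   # spectrogram batch
batch_y = np.zeros((16,), dtype=np.int32)                 # class ids in [0, 176)
pred, loss = model.train_fn(batch_x, batch_y)             # one gradient step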
def clone(src_net, dst_net, mask_input):
    """
    Clones a lasagne neural network, keeping weights tied.

    For all layers of src_net in turn, starting at the first:
      1. creates a copy of the layer,
      2. reuses the original objects for weights and
      3. appends the new layer to dst_net.

    InputLayers are ignored. Recurrent layers (LSTMLayer) are passed mask_input.
    """
    logger.info("Net to be cloned:")
    for l in layers.get_all_layers(src_net):
        logger.info(" - {} ({}):".format(l.name, l))

    logger.info("Starting to clone..")
    for l in layers.get_all_layers(src_net):
        logger.info("src_net[...]: {} ({}):".format(l.name, l))
        if type(l) == layers.InputLayer:
            logger.info(' - skipping')
            continue
        if type(l) == layers.DenseLayer:
            dst_net = layers.DenseLayer(
                dst_net,
                num_units=l.num_units,
                W=l.W,
                b=l.b,
                nonlinearity=l.nonlinearity,
                name=l.name + '2',
            )
        elif type(l) == layers.EmbeddingLayer:
            dst_net = layers.EmbeddingLayer(
                dst_net,
                l.input_size,
                l.output_size,
                W=l.W,
                name=l.name + '2',
            )
        elif type(l) == layers.LSTMLayer:
            dst_net = layers.LSTMLayer(
                dst_net,
                l.num_units,
                ingate=layers.Gate(
                    W_in=l.W_in_to_ingate,
                    W_hid=l.W_hid_to_ingate,
                    W_cell=l.W_cell_to_ingate,
                    b=l.b_ingate,
                    nonlinearity=l.nonlinearity_ingate
                ),
                forgetgate=layers.Gate(
                    W_in=l.W_in_to_forgetgate,
                    W_hid=l.W_hid_to_forgetgate,
                    W_cell=l.W_cell_to_forgetgate,
                    b=l.b_forgetgate,
                    nonlinearity=l.nonlinearity_forgetgate
                ),
                cell=layers.Gate(
                    W_in=l.W_in_to_cell,
                    W_hid=l.W_hid_to_cell,
                    W_cell=None,
                    b=l.b_cell,
                    nonlinearity=l.nonlinearity_cell
                ),
                outgate=layers.Gate(
                    W_in=l.W_in_to_outgate,
                    W_hid=l.W_hid_to_outgate,
                    W_cell=l.W_cell_to_outgate,
                    b=l.b_outgate,
                    nonlinearity=l.nonlinearity_outgate
                ),
                nonlinearity=l.nonlinearity,
                cell_init=l.cell_init,
                hid_init=l.hid_init,
                backwards=l.backwards,
                learn_init=l.learn_init,
                peepholes=l.peepholes,
                gradient_steps=l.gradient_steps,
                grad_clipping=l.grad_clipping,
                unroll_scan=l.unroll_scan,
                precompute_input=l.precompute_input,
                # mask_input=l.mask_input,  # AttributeError: 'LSTMLayer' object has no attribute 'mask_input'
                name=l.name + '2',
                mask_input=mask_input,
            )
        elif type(l) == layers.SliceLayer:
            dst_net = layers.SliceLayer(
                dst_net,
                indices=l.slice,
                axis=l.axis,
                name=l.name + '2',
            )
        else:
            raise ValueError("Unhandled layer: {}".format(l))
        new_layer = layers.get_all_layers(dst_net)[-1]
        logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name))

    logger.info("Result of cloning:")
    for l in layers.get_all_layers(dst_net):
        logger.info(" - {} ({}):".format(l.name, l))
    return dst_net
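# Hedged usage sketch for clone(): the source network below is a toy
# embedding -> LSTM -> slice -> dense stack (all layer names given so that the
# name + '2' scheme works). The clone starts from a fresh InputLayer and reuses
# the same shared weight variables, so updating one network updates both.
import theano.tensor as T
from lasagne import layers

src_in = layers.InputLayer((None, None), input_var=T.imatrix('src'), name='in')
src = layers.EmbeddingLayer(src_in, input_size=1000, output_size=32, name='emb')
src = layers.LSTMLayer(src, num_units=64, name='lstm')
src = layers.SliceLayer(src, indices=-1, axis=1, name='slice')
src = layers.DenseLayer(src, num_units=10, name='out')

dst_in = layers.InputLayer((None, None), input_var=T.imatrix('dst'), name='in2')
dst_mask = layers.InputLayer((None, None), input_var=T.matrix('dst_mask'))
dst = clone(src, dst_in, mask_input=dst_mask)
assert dst.W is src.W   # the output weights are the same shared variable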
def build_network(self, vocab_size, input_var, mask_var, docidx_var, docidx_mask,
                  skip_connect=True):

    l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM,
                               W=self.params['W_emb'])
    l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

    # NOTE: Moved initialization of forget gate biases to init_params
    # forget_gate_1 = L.Gate(b=lasagne.init.Constant(3))
    # forget_gate_2 = L.Gate(b=lasagne.init.Constant(3))

    # NOTE: The LSTM layer provided by Lasagne is slightly different from the one
    # used in DeepMind's paper: in the paper the cell-to-* weights are not diagonal.

    # the 1st lstm layer
    in_gate = L.Gate(W_in=self.params['W_lstm1_xi'], W_hid=self.params['W_lstm1_hi'],
                     W_cell=self.params['W_lstm1_ci'], b=self.params['b_lstm1_i'],
                     nonlinearity=lasagne.nonlinearities.sigmoid)
    forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'], W_hid=self.params['W_lstm1_hf'],
                         W_cell=self.params['W_lstm1_cf'], b=self.params['b_lstm1_f'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    out_gate = L.Gate(W_in=self.params['W_lstm1_xo'], W_hid=self.params['W_lstm1_ho'],
                      W_cell=self.params['W_lstm1_co'], b=self.params['b_lstm1_o'],
                      nonlinearity=lasagne.nonlinearities.sigmoid)
    cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'], W_hid=self.params['W_lstm1_hc'],
                       W_cell=None, b=self.params['b_lstm1_c'],
                       nonlinearity=lasagne.nonlinearities.tanh)
    l_fwd_1 = L.LSTMLayer(l_embed_noise, NUM_HIDDEN, ingate=in_gate,
                          forgetgate=forget_gate, cell=cell_gate, outgate=out_gate,
                          peepholes=True, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)

    # the 2nd lstm layer
    if skip_connect:
        # construct skip connection from the lookup table to the 2nd layer
        batch_size, seq_len, _ = input_var.shape
        # concatenate the last dimension of l_fwd_1 and embed
        l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
        l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
        to_next_layer = L.ReshapeLayer(
            L.concat([l_fwd_1_shp, l_embed_shp], axis=1),
            (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
    else:
        to_next_layer = l_fwd_1

    to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

    in_gate = L.Gate(W_in=self.params['W_lstm2_xi'], W_hid=self.params['W_lstm2_hi'],
                     W_cell=self.params['W_lstm2_ci'], b=self.params['b_lstm2_i'],
                     nonlinearity=lasagne.nonlinearities.sigmoid)
    forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'], W_hid=self.params['W_lstm2_hf'],
                         W_cell=self.params['W_lstm2_cf'], b=self.params['b_lstm2_f'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    out_gate = L.Gate(W_in=self.params['W_lstm2_xo'], W_hid=self.params['W_lstm2_ho'],
                      W_cell=self.params['W_lstm2_co'], b=self.params['b_lstm2_o'],
                      nonlinearity=lasagne.nonlinearities.sigmoid)
    cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'], W_hid=self.params['W_lstm2_hc'],
                       W_cell=None, b=self.params['b_lstm2_c'],
                       nonlinearity=lasagne.nonlinearities.tanh)
    l_fwd_2 = L.LSTMLayer(to_next_layer_noise, NUM_HIDDEN, ingate=in_gate,
                          forgetgate=forget_gate, cell=cell_gate, outgate=out_gate,
                          peepholes=True, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)

    # slice final states of both lstm layers
    l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
    l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)

    # g will be used to score the words based on their embeddings
    g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1),
                     num_units=EMBED_DIM, W=self.params['W_dense'],
                     b=self.params['b_dense'],
                     nonlinearity=lasagne.nonlinearities.tanh)

    ## get outputs
    # g_out = L.get_output(g)  # B x D
    # g_out_val = L.get_output(g, deterministic=True)  # B x D

    ## compute softmax probs
    # probs, _ = theano.scan(fn=lambda g, d, dm, W: T.nnet.softmax(T.dot(g, W[d, :].T) * dm),
    #                        outputs_info=None,
    #                        sequences=[g_out, docidx_var, docidx_mask],
    #                        non_sequences=self.params['W_emb'])
    # predicted_probs = probs.reshape(docidx_var.shape)  # B x N
    # probs_val, _ = theano.scan(fn=lambda g, d, dm, W: T.nnet.softmax(T.dot(g, W[d, :].T) * dm),
    #                            outputs_info=None,
    #                            sequences=[g_out_val, docidx_var, docidx_mask],
    #                            non_sequences=self.params['W_emb'])
    # predicted_probs_val = probs_val.reshape(docidx_var.shape)  # B x N
    # return predicted_probs, predicted_probs_val

    # W is shared with the lookup table
    l_out = L.DenseLayer(g, num_units=vocab_size, W=self.params['W_emb'].T,
                         nonlinearity=lasagne.nonlinearities.softmax, b=None)
    return l_out
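# Hedged, self-contained illustration of the weight-tying used above: the final
# DenseLayer takes the transposed embedding matrix as W, so the word-scoring
# weights and the lookup table are a single shared parameter. Sizes are
# illustrative, not the constants used in this file.
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne import layers as L

W_emb = theano.shared(lasagne.utils.floatX(
    np.random.uniform(-0.1, 0.1, (5000, 128))), name='W_emb')

l_idx = L.InputLayer((None, None), input_var=T.imatrix('idx'))
l_emb = L.EmbeddingLayer(l_idx, input_size=5000, output_size=128, W=W_emb)
l_last = L.SliceLayer(l_emb, -1, 1)            # embedding of the last token
l_scores = L.DenseLayer(l_last, num_units=5000, W=W_emb.T, b=None,
                        nonlinearity=lasagne.nonlinearities.softmax)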
def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

    batch_size = self.mask_context_var.shape[0]
    context_len = self.mask_context_var.shape[1]
    question_len = self.question_var.shape[1]
    context_word_len = self.context_char_var.shape[2]
    question_word_len = self.question_char_var.shape[2]

    self.batch_size = batch_size
    self.context_len = context_len

    ''' Inputs and word embeddings '''

    l_context_char = LL.InputLayer(shape=(None, None, None),
                                   input_var=self.context_char_var)
    l_question_char = LL.InputLayer(shape=(None, None, None),
                                    input_var=self.question_char_var)

    l_c_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_context_var)
    l_q_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_question_var)

    l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_context_char_var)
    l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_question_char_var)

    l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.context_var)
    l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.question_var)

    if self.train_unk:
        l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_context_unk_var)
        l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_question_unk_var)

        l_c_emb = TrainUnkLayer(l_c_emb, l_c_unk_mask, output_size=self.emb_size,
                                W=self.word_embeddings[0])
        l_q_emb = TrainUnkLayer(l_q_emb, l_q_unk_mask, output_size=self.emb_size,
                                W=l_c_emb.W)

    if self.negative:
        l_c_emb = TrainNAWLayer(l_c_emb, l_c_mask, output_size=self.emb_size)

    ''' Char-embeddings '''

    # (batch_size x context_len x context_word_len x emb_char_size)
    l_c_char_emb = LL.EmbeddingLayer(l_context_char, input_size=self.alphabet_size,
                                     output_size=self.emb_char_size)
    l_q_char_emb = LL.EmbeddingLayer(l_question_char, input_size=self.alphabet_size,
                                     output_size=self.emb_char_size,
                                     W=l_c_char_emb.W)

    # here I multiply the character embeddings by their masks,
    # because I want to pad them with constant zeros
    l_c_char_mask = ForgetSizeLayer(LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
    l_q_char_mask = ForgetSizeLayer(LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

    l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask], T.mul)
    l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask], T.mul)

    # convolutions
    l_c_char_emb = LL.dimshuffle(
        LL.reshape(l_c_char_emb, (batch_size * context_len, context_word_len,
                                  self.emb_char_size)),
        (0, 2, 1))
    l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   pad=self.conv)
    # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)
    l_c_char_emb = LL.ExpressionLayer(l_c_char_conv, lambda X: X.max(2),
                                      output_shape='auto')
    l_c_char_emb = LL.reshape(l_c_char_emb,
                              (batch_size, context_len, self.num_emb_char_filters))

    l_q_char_emb = LL.dimshuffle(
        LL.reshape(l_q_char_emb, (batch_size * question_len, question_word_len,
                                  self.emb_char_size)),
        (0, 2, 1))
    l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   W=l_c_char_conv.W, b=l_c_char_conv.b,
                                   pad=self.conv)
    # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)
    l_q_char_emb = LL.ExpressionLayer(l_q_char_conv, lambda X: X.max(2),
                                      output_shape='auto')
    l_q_char_emb = LL.reshape(l_q_char_emb,
                              (batch_size, question_len, self.num_emb_char_filters))

    ''' Concatenating both embeddings '''

    l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
    l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

    # originally I had dropout here

    ''' Highway layer allowing for interaction between embeddings '''

    l_c_P = LL.reshape(l_c_emb, (batch_size * context_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_c_P = LL.DenseLayer(l_c_P, num_units=self.rec_size, b=None, nonlinearity=None)
    l_c_high = HighwayLayer(l_c_P)
    l_c_emb = LL.reshape(l_c_high, (batch_size, context_len, self.rec_size))

    l_q_P = LL.reshape(l_q_emb, (batch_size * question_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_q_P = LL.DenseLayer(l_q_P, num_units=self.rec_size, W=l_c_P.W, b=None,
                          nonlinearity=None)
    l_q_high = HighwayLayer(l_q_P, W1=l_c_high.W1, b1=l_c_high.b1,
                            W2=l_c_high.W2, b2=l_c_high.b2)
    l_q_emb = LL.reshape(l_q_high, (batch_size, question_len, self.rec_size))

    ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

    # batch_size x context_len
    l_weighted_feat = WeightedFeatureLayer([l_c_emb, l_q_emb, l_c_mask, l_q_mask])
    l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

    # batch_size x context_len
    l_bin_feat = LL.InputLayer(shape=(None, None), input_var=self.bin_feat_var)
    l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))

    ''' Dropout at the embeddings '''

    if emb_dropout:
        print('Using dropout after wiq calculation.')
        l_c_emb = LL.dropout(l_c_emb)
        l_q_emb = LL.dropout(l_q_emb)

    ''' Here we concatenate wiq features to embeddings '''

    # both features are concatenated to the embeddings
    # for the question we fix the features to 1
    l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
    l_q_emb = LL.pad(l_q_emb, width=[(0, 2)], val=L.utils.floatX(1), batch_ndim=2)

    ''' Context and question encoding using the same BiLSTM for both '''

    # output shape is (batch_size x context_len x rec_size)
    l_c_enc_forw = LL.LSTMLayer(l_c_emb, num_units=self.rec_size,
                                grad_clipping=100, mask_input=l_c_mask)
    l_c_enc_back = LL.LSTMLayer(l_c_emb, num_units=self.rec_size,
                                grad_clipping=100, mask_input=l_c_mask,
                                backwards=True)

    # output shape is (batch_size x question_len x rec_size)
    l_q_enc_forw = LL.LSTMLayer(
        l_q_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_q_mask,
        ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                       W_hid=l_c_enc_forw.W_hid_to_ingate,
                       W_cell=l_c_enc_forw.W_cell_to_ingate,
                       b=l_c_enc_forw.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                           W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                           W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                           b=l_c_enc_forw.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                        W_hid=l_c_enc_forw.W_hid_to_outgate,
                        W_cell=l_c_enc_forw.W_cell_to_outgate,
                        b=l_c_enc_forw.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                     W_hid=l_c_enc_forw.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_forw.b_cell,
                     nonlinearity=L.nonlinearities.tanh))
    l_q_enc_back = LL.LSTMLayer(
        l_q_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_q_mask,
        backwards=True,
        ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                       W_hid=l_c_enc_back.W_hid_to_ingate,
                       W_cell=l_c_enc_back.W_cell_to_ingate,
                       b=l_c_enc_back.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                           W_hid=l_c_enc_back.W_hid_to_forgetgate,
                           W_cell=l_c_enc_back.W_cell_to_forgetgate,
                           b=l_c_enc_back.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                        W_hid=l_c_enc_back.W_hid_to_outgate,
                        W_cell=l_c_enc_back.W_cell_to_outgate,
                        b=l_c_enc_back.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                     W_hid=l_c_enc_back.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_back.b_cell,
                     nonlinearity=L.nonlinearities.tanh))

    # batch_size x context_len x 2*rec_size
    l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
    # batch_size x question_len x 2*rec_size
    l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

    def proj_init():
        return np.vstack([
            np.eye(self.rec_size, dtype=theano.config.floatX),
            np.eye(self.rec_size, dtype=theano.config.floatX)
        ])

    # this is H from the paper, shape: (batch_size * context_len x rec_size)
    l_c_proj = LL.reshape(l_c_enc, (batch_size * context_len, 2 * self.rec_size))
    l_c_proj = LL.DenseLayer(l_c_proj, num_units=self.rec_size, W=proj_init(),
                             b=None, nonlinearity=L.nonlinearities.tanh)

    # this is Z from the paper, shape: (batch_size * question_len x rec_size)
    l_q_proj = LL.reshape(l_q_enc, (batch_size * question_len, 2 * self.rec_size))
    l_q_proj = LL.DenseLayer(l_q_proj, num_units=self.rec_size, W=proj_init(),
                             b=None, nonlinearity=L.nonlinearities.tanh)

    ''' Additional, weighted question encoding (alphas from paper) '''

    # batch_size * question_len x 1
    l_alpha = LL.DenseLayer(l_q_proj, num_units=1, b=None, nonlinearity=None)

    # batch_size x question_len
    l_alpha = MaskedSoftmaxLayer(
        LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

    # batch_size x rec_size
    l_z_hat = BatchedDotLayer([
        LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
        l_alpha
    ])

    return l_c_proj, l_z_hat
def l1lstm_l2d(input_dim, output_dim, n_hidden,
               nonlinearity=lasagne.nonlinearities.tanh,
               layer_type=layers.LSTMLayer, learning_rate=1e-4,
               wl2=0., wl1=0., r_reg_coeff=0., grad_clipping=0.,
               bidirectional=False, loss_type='MSE', skip_connection=False,
               **kwargs):
    # Number of steps used before computing the gradient (-1 means full BPTT)
    gradient_steps = kwargs.pop('gradient_steps', -1)

    target = T.tensor3()
    target.name = 'target'

    # Input Layer
    l_in = layers.InputLayer((None, None, input_dim))
    input_layer = l_in
    if bidirectional:
        input_layer_b = l_in

    if skip_connection:
        # Input to output connection
        l_in_to_out = lasagne.layers.DenseLayer(
            lasagne.layers.ReshapeLayer(l_in, (-1, input_dim)),
            output_dim, nonlinearity=None, name='in_to_out')

    b_size, seqlen, _ = l_in.input_var.shape

    lstm_layers = (layers.LSTMLayer, c_layers.MILSTMLayer,
                   c_layers.BatchNormLSTMLayer)
    gru_layers = (layers.GRULayer, c_layers.MIGRULayer)

    if layer_type in lstm_layers:
        print('Using {0}'.format(layer_type))
        name = 'lstm'
        l_r_f = layer_type(incoming=input_layer, num_units=n_hidden,
                           nonlinearity=nonlinearity,
                           gradient_steps=gradient_steps,
                           name=name, **kwargs)
        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(incoming=input_layer_b, num_units=n_hidden,
                               nonlinearity=nonlinearity,
                               gradient_steps=gradient_steps,
                               name=name + '_b', backwards=True, **kwargs)
    elif layer_type is layers.GRULayer:
        print('Using {0}'.format(layer_type))
        name = 'gru'
        l_r_f = layer_type(incoming=input_layer, num_units=n_hidden,
                           hidden_update=layers.Gate(nonlinearity=nonlinearity),
                           gradient_steps=gradient_steps,
                           name=name, **kwargs)
        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(incoming=input_layer_b, num_units=n_hidden,
                               hidden_update=layers.Gate(nonlinearity=nonlinearity),
                               gradient_steps=gradient_steps,
                               name=name + '_b', backwards=True, **kwargs)
    elif layer_type is c_layers.MIGRULayer:
        print('Using {0}'.format(layer_type))
        name = 'gru'
        l_r_f = layer_type(incoming=input_layer, num_units=n_hidden,
                           hidden_update=c_layers.MIGate(nonlinearity=nonlinearity),
                           gradient_steps=gradient_steps,
                           name=name, **kwargs)
        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(incoming=input_layer_b, num_units=n_hidden,
                               hidden_update=c_layers.MIGate(nonlinearity=nonlinearity),
                               gradient_steps=gradient_steps,
                               name=name + '_b', backwards=True, **kwargs)
    else:
        # fail fast instead of continuing with an undefined recurrent layer
        raise ValueError('Invalid layer_type {0}'.format(layer_type))

    l_concat = l_r_f
    out_shape = n_hidden
    if bidirectional:
        print('Concatenating forward and backward recurrent layers')
        l_concat = layers.ConcatLayer((l_concat, l_r_b), axis=-1)
        out_shape = out_shape + n_hidden

    l_re = layers.ReshapeLayer(l_concat, (-1, out_shape), name='reshape')

    if loss_type == 'MSE':
        # only the MSE readout is implemented here
        print('Using MSE')
        l_d = layers.DenseLayer(l_re, output_dim, nonlinearity=None, name='dense')

    if skip_connection:
        # Combine input-to-output and hidden-to-output layers
        l_output = lasagne.layers.ElemwiseSumLayer([l_in_to_out, l_d])
    else:
        l_output = l_d

    if kwargs.get('only_return_final', False):
        out_shape = (b_size, 1, output_dim)
    else:
        out_shape = (b_size, seqlen, output_dim)
    l_out = layers.ReshapeLayer(l_output, out_shape)

    deterministic_out = layers.get_output(l_out, deterministic=True)
    deterministic_out.name = 'deterministic out'
    stochastic_out = layers.get_output(l_out)
    stochastic_out.name = 'stochastic out'

    params = layers.get_all_params(l_out, trainable=True)

    if layer_type in lstm_layers:
        # Get regularizable parameters of the LSTM
        reg_params_norm = [
            l_r_f.W_in_to_cell, l_r_f.W_in_to_forgetgate,
            l_r_f.W_in_to_ingate, l_r_f.W_in_to_outgate
        ]
        reg_params_rec = [
            l_r_f.W_hid_to_cell, l_r_f.W_hid_to_forgetgate,
            l_r_f.W_hid_to_ingate, l_r_f.W_hid_to_outgate
        ]
        if bidirectional:
            reg_params_norm += [
                l_r_b.W_in_to_cell, l_r_b.W_in_to_forgetgate,
                l_r_b.W_in_to_ingate, l_r_b.W_in_to_outgate
            ]
            reg_params_rec += [
                l_r_b.W_hid_to_cell, l_r_b.W_hid_to_forgetgate,
                l_r_b.W_hid_to_ingate, l_r_b.W_hid_to_outgate
            ]
    elif layer_type in gru_layers:
        # Get regularizable parameters of the GRU
        reg_params_norm = [
            l_r_f.W_in_to_updategate, l_r_f.W_in_to_resetgate,
            l_r_f.W_in_to_hidden_update
        ]
        reg_params_rec = [
            l_r_f.W_hid_to_updategate, l_r_f.W_hid_to_resetgate,
            l_r_f.W_hid_to_hidden_update
        ]
        if bidirectional:
            reg_params_norm += [
                l_r_b.W_in_to_updategate, l_r_b.W_in_to_resetgate,
                l_r_b.W_in_to_hidden_update
            ]
            reg_params_rec += [
                l_r_b.W_hid_to_updategate, l_r_b.W_hid_to_resetgate,
                l_r_b.W_hid_to_hidden_update
            ]

    if wl2 > 0:
        print('Using L2 norm regularization')
        weight_reg = wl2 * (sum([T.mean(p ** 2) for p in reg_params_norm]) +
                            T.mean(l_d.W ** 2))
        if skip_connection:
            weight_reg += wl2 * T.mean(l_in_to_out.W ** 2)
    else:
        weight_reg = 0.

    if wl1 > 0:
        print('Using L1 norm regularization')
        # L1 penalty on the input-to-hidden and output weights (absolute values)
        weight_reg += wl1 * (sum([T.mean(abs(p)) for p in reg_params_norm]) +
                             T.mean(abs(l_d.W)))
        if skip_connection:
            weight_reg += wl1 * T.mean(abs(l_in_to_out.W))

    if r_reg_coeff > 0:
        print('Using hid-to-hid eigenvalue regularization')
        weight_reg += r_reg_coeff * sum(
            [T.mean((T.nlinalg.eigh(p)[0] - 1.) ** 2) for p in reg_params_rec])

    stochastic_loss = (
        lasagne.objectives.squared_error(stochastic_out, target).mean() +
        weight_reg)
    stochastic_loss.name = 'stochastic MSE (regularized)'

    deterministic_loss = T.mean(
        lasagne.objectives.squared_error(deterministic_out, target))
    deterministic_loss.name = 'MSE'

    updates = lasagne.updates.rmsprop(stochastic_loss, params,
                                      learning_rate=learning_rate)

    train_loss = [stochastic_loss]
    valid_loss = [deterministic_loss]

    return dict(l_in=l_in, l_out=l_out, train_loss=train_loss,
                valid_loss=valid_loss, target=target, updates=updates,
                predictions=deterministic_out, gradient_steps=gradient_steps,
                model_type='RNN')
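# Hedged usage sketch for l1lstm_l2d: build the model dict, then compile a
# training function from the pieces it returns. Shapes, hyper-parameters and the
# random data below are illustrative only.
import numpy as np
import theano

model = l1lstm_l2d(input_dim=10, output_dim=3, n_hidden=32, learning_rate=1e-3)

train_fn = theano.function([model['l_in'].input_var, model['target']],
                           model['train_loss'],
                           updates=model['updates'])

x = np.random.randn(4, 25, 10).astype('float32')   # (batch, time, input_dim)
y = np.random.randn(4, 25, 3).astype('float32')    # (batch, time, output_dim)
loss, = train_fn(x, y)                             # one RMSProp step on the MSE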