Example #1
    def __init__(self,
                 input,
                 n_hidden=500,
                 grad_clip=100.,
                 only_return_final=True):

        self.input = input

        gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                      W_hid=lasagne.init.Orthogonal(),
                                      b=initialize_parameters()[1])

        cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                      W_hid=lasagne.init.Orthogonal(),
                                      W_cell=None,
                                      b=initialize_parameters()[1],
                                      nonlinearity=lasagne.nonlinearities.tanh)

        self.output = layers.LSTMLayer(self.input,
                                       n_hidden,
                                       ingate=gate_parameters,
                                       forgetgate=gate_parameters,
                                       cell=cell_parameters,
                                       outgate=gate_parameters,
                                       grad_clipping=grad_clip,
                                       only_return_final=only_return_final)
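
# --- Note on the reused Gate object above (added; not part of the original) ---
# As far as I can tell from Lasagne's parameter handling, a Gate only stores
# initializer specs, so passing the same `gate_parameters` instance to ingate,
# forgetgate and outgate creates three independently initialized weight sets
# rather than tied weights. A minimal sketch to check this (names are mine):
import lasagne
from lasagne import layers

l_in = layers.InputLayer((None, None, 10))   # (batch, time, features), sizes assumed
gate = layers.Gate(W_in=lasagne.init.Orthogonal(),
                   W_hid=lasagne.init.Orthogonal())
lstm = layers.LSTMLayer(l_in, 8, ingate=gate, forgetgate=gate, outgate=gate)
# Each gate received its own shared variable despite the reused Gate object.
assert lstm.W_in_to_ingate is not lstm.W_in_to_forgetgate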
Example #2
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers,
             dropout, batch_size):
    l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    l_embed = L.EmbeddingLayer(l_input,
                               vocabulary_size,
                               hidden_size,
                               W=init.Uniform(1.0))
    l_lstms = []
    for i in range(num_layers):
        l_lstm = L.LSTMLayer(l_embed if i == 0 else l_lstms[-1],
                             hidden_size,
                             ingate=L.Gate(W_in=init.GlorotUniform(),
                                           W_hid=init.Orthogonal()),
                             forgetgate=L.Gate(W_in=init.GlorotUniform(),
                                               W_hid=init.Orthogonal(),
                                               b=init.Constant(1.0)),
                             cell=L.Gate(
                                 W_in=init.GlorotUniform(),
                                 W_hid=init.Orthogonal(),
                                 W_cell=None,
                                 nonlinearity=lasagne.nonlinearities.tanh),
                             outgate=L.Gate(W_in=init.GlorotUniform(),
                                            W_hid=init.Orthogonal()))
        l_lstms.append(l_lstm)
    l_drop = L.DropoutLayer(l_lstms[-1], dropout)
    l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
    l_out = L.ReshapeLayer(
        l_out,
        (l_out.output_shape[0] * l_out.output_shape[1], l_out.output_shape[2]))
    l_out = L.NonlinearityLayer(l_out,
                                nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
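
# --- Usage sketch (added; not part of the original example) ---
# How the returned softmax layer might be wired to a loss and a training
# function. All sizes, names and the Adam optimizer below are assumptions.
import theano
import theano.tensor as T
import lasagne
from lasagne import layers as L

input_var = T.imatrix('input')      # (batch_size, seq_len) token ids
target_var = T.ivector('target')    # flattened next-word ids, length batch_size * seq_len
l_out = ptb_lstm(input_var, vocabulary_size=10000, hidden_size=200,
                 seq_len=20, num_layers=2, dropout=0.5, batch_size=32)

prediction = L.get_output(l_out)    # (batch_size * seq_len, vocabulary_size)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
params = L.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)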
Example #3
def layer_LSTM(l_hid,
               hiddensize,
               nonlinearity,
               backwards=False,
               grad_clipping=50,
               name=""):
    '''
    A custom LSTM layer that seems to converge faster.
    '''
    ingate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                     W_hid=lasagne.init.Orthogonal(1.0))
    forgetgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                         W_hid=lasagne.init.Orthogonal(1.0))
    outgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                      W_hid=lasagne.init.Orthogonal(1.0))
    cell = ll.Gate(W_cell=None,
                   W_in=lasagne.init.Orthogonal(1.0),
                   W_hid=lasagne.init.Orthogonal(1.0),
                   nonlinearity=nonlinearity)
    # The final nonlinearity should be tanh, otherwise it doesn't converge (why?)
    # by default peepholes=True
    fwd = ll.LSTMLayer(l_hid,
                       num_units=hiddensize,
                       backwards=backwards,
                       ingate=ingate,
                       forgetgate=forgetgate,
                       outgate=outgate,
                       cell=cell,
                       grad_clipping=grad_clipping,
                       nonlinearity=lasagne.nonlinearities.tanh,
                       name=name)

    return fwd
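
# --- Usage sketch (added; not part of the original example) ---
# One way layer_LSTM might be used: call it twice to get a bidirectional block
# and concatenate along the feature axis. Input sizes below are assumptions.
import lasagne
import lasagne.layers as ll

l_in = ll.InputLayer((None, None, 40))   # (batch, time, features)
l_fwd = layer_LSTM(l_in, hiddensize=128,
                   nonlinearity=lasagne.nonlinearities.tanh, name="lstm_fwd")
l_bck = layer_LSTM(l_in, hiddensize=128,
                   nonlinearity=lasagne.nonlinearities.tanh,
                   backwards=True, name="lstm_bck")
l_bi = ll.ConcatLayer([l_fwd, l_bck], axis=2)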
Example #4
def integrate_captions(input_var=T.imatrix()):
    '''
    :param input_var: matrix of caption word indices,
                      shape = (nb_caption, seq_length)
    '''

    ###############################
    # Build Network Configuration #
    ###############################

    print('... Integrating captions to the model')

    # Input of the network : shape = (nb_caption, seq_length)
    network = layers.InputLayer(shape=(None, None), input_var=input_var)

    # Embedding layer : shape = (nb_caption, seq_length, 400)
    vocab_length = get_vocab_length()
    network = layers.EmbeddingLayer(network, vocab_length, output_size=400)

    # LSTM layer : shape = (nb_caption, 500)
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=lasagne.init.Constant(0.))

    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=lasagne.init.Constant(0.),
                                  nonlinearity=nonlinearities.tanh)

    network = layers.LSTMLayer(network,
                               num_units=500,
                               ingate=gate_parameters,
                               forgetgate=gate_parameters,
                               cell=cell_parameters,
                               outgate=gate_parameters,
                               grad_clipping=100.,
                               only_return_final=True)

    # Dense Layer : shape = (nb_caption, 500)
    network = layers.DenseLayer(network, num_units=500)

    # Reshape layer : shape = (nb_caption, 500, 1, 1)
    network = layers.ReshapeLayer(network, (-1, 500, 1, 1))

    return network
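
# --- Usage sketch (added; not part of the original example) ---
# Compiling a feature extractor from the caption branch. This assumes the
# module's helpers (e.g. get_vocab_length) are importable; the dummy data and
# sizes are made up, and the word ids must stay below the vocabulary size.
import numpy as np
import theano
import theano.tensor as T
from lasagne import layers

captions = T.imatrix('captions')            # (nb_caption, seq_length) word ids
network = integrate_captions(captions)
features = layers.get_output(network, deterministic=True)
extract_fn = theano.function([captions], features)

dummy = np.random.randint(0, 100, size=(3, 15)).astype('int32')
print(extract_fn(dummy).shape)              # expected: (3, 500, 1, 1)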
Example #5
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        
        print ("==> not used params in DMN class:", kwargs.keys())
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units
        
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')
        
        print ("==> building network")
        # Random dummy batch, used only for the shape-checking prints below
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32)
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))
       
        network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        
        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        self.params = layers.get_all_params(network, trainable=True)
        
        output = layers.get_output(network)
        num_channels = 32
        filter_W = 54
        filter_H = 8
        
        # NOTE: these constants are the shape of the last pool layer; they could be
        # kept symbolic, but explicit values are better for optimization.
        
        channels = []
        for channel_index in range(num_channels):
            channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))
        
        rnn_network_outputs = []
        W_in_to_updategate = None
        W_hid_to_updategate = None
        b_updategate = None
        W_in_to_resetgate = None
        W_hid_to_resetgate = None
        b_resetgate = None
        W_in_to_hidden_update = None
        W_hid_to_hidden_update = None
        b_hidden_update = None
        
        W_in_to_updategate1 = None
        W_hid_to_updategate1 = None
        b_updategate1 = None
        W_in_to_resetgate1 = None
        W_hid_to_resetgate1 = None
        b_resetgate1 = None
        W_in_to_hidden_update1 = None
        W_hid_to_hidden_update1 = None
        b_hidden_update1 = None
        
        for channel_index in range(num_channels):
            rnn_input_var = channels[channel_index]
            
            # InputLayer       
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            if (channel_index == 0):
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False)
                W_in_to_updategate = network.W_in_to_updategate
                W_hid_to_updategate = network.W_hid_to_updategate
                b_updategate = network.b_updategate
                W_in_to_resetgate = network.W_in_to_resetgate
                W_hid_to_resetgate = network.W_hid_to_resetgate
                b_resetgate = network.b_resetgate
                W_in_to_hidden_update = network.W_in_to_hidden_update
                W_hid_to_hidden_update = network.W_hid_to_hidden_update
                b_hidden_update = network.b_hidden_update
                
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
                W_in_to_updategate1 = network.W_in_to_updategate
                W_hid_to_updategate1 = network.W_hid_to_updategate
                b_updategate1 = network.b_updategate
                W_in_to_resetgate1 = network.W_in_to_resetgate
                W_hid_to_resetgate1 = network.W_hid_to_resetgate
                b_resetgate1 = network.b_resetgate
                W_in_to_hidden_update1 = network.W_in_to_hidden_update
                W_hid_to_hidden_update1 = network.W_hid_to_hidden_update
                b_hidden_update1 = network.b_hidden_update
                        
                # add params 
                self.params += layers.get_all_params(network, trainable=True)

            else:
                # GRULayer with weights shared from channel 0 (tanh specified so the
                # hidden update matches the default Gate nonlinearity used there)
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate),
                            updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update,
                                                      nonlinearity=lasagne.nonlinearities.tanh))
                            
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                    
                # Second GRULayer, with weights shared from channel 0
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate1, W_hid=W_hid_to_resetgate1, b=b_resetgate1),
                            updategate=layers.Gate(W_in=W_in_to_updategate1, W_hid=W_hid_to_updategate1, b=b_updategate1),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update1, W_hid=W_hid_to_hidden_update1, b=b_hidden_update1,
                                                      nonlinearity=lasagne.nonlinearities.tanh))
                
            
            rnn_network_outputs.append(layers.get_output(network))
        
        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print (all_output_var.eval({self.input_var:example}).shape)
        
        # InputLayer
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)
        
        # Dropout Layer
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        
        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
    
        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
    
        #print "==> param shapes", [x.eval().shape for x in self.params]
        
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, 
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2
        
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
        
        if self.mode == 'train':
            print ("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print ("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])
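
# --- Weight-sharing sketch (added; not part of the original example) ---
# The per-channel GRUs above are tied by passing the first channel's shared
# variables back into Gate. A stripped-down version of that pattern; the
# shapes and layer names here are assumptions.
import lasagne
from lasagne import layers

l_in_a = layers.InputLayer((None, 20, 16))
l_in_b = layers.InputLayer((None, 20, 16))

gru_a = layers.GRULayer(l_in_a, num_units=32)
# Reusing gru_a's shared variables makes gru_b the same RNN applied to another input.
gru_b = layers.GRULayer(
    l_in_b, num_units=32,
    resetgate=layers.Gate(W_in=gru_a.W_in_to_resetgate,
                          W_hid=gru_a.W_hid_to_resetgate,
                          b=gru_a.b_resetgate),
    updategate=layers.Gate(W_in=gru_a.W_in_to_updategate,
                           W_hid=gru_a.W_hid_to_updategate,
                           b=gru_a.b_updategate),
    hidden_update=layers.Gate(W_in=gru_a.W_in_to_hidden_update,
                              W_hid=gru_a.W_hid_to_hidden_update,
                              b=gru_a.b_hidden_update,
                              nonlinearity=lasagne.nonlinearities.tanh))
assert gru_b.W_in_to_resetgate is gru_a.W_in_to_resetgate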
Example #6
def clone(src_net, dst_net, mask_input):
    """
    Clones a lasagne neural network, keeping weights tied.

    For all layers of src_net in turn, starting at the first:
     1. creates a copy of the layer,
     2. reuses the original objects for weights and
     3. appends the new layer to dst_net.

    InputLayers are ignored.
    Recurrent layers (LSTMLayer) are passed mask_input.
    """
    logger.info("Net to be cloned:")
    for l in layers.get_all_layers(src_net):
        logger.info(" - {} ({}):".format(l.name, l))

    logger.info("Starting to clone..")
    for l in layers.get_all_layers(src_net):
        logger.info("src_net[...]: {} ({}):".format(l.name, l))
        if type(l) == layers.InputLayer:
            logger.info(' - skipping')
            continue
        if type(l) == layers.DenseLayer:
            dst_net = layers.DenseLayer(
                dst_net,
                num_units=l.num_units,
                W=l.W,
                b=l.b,
                nonlinearity=l.nonlinearity,
                name=l.name+'2',
            )
        elif type(l) == layers.EmbeddingLayer:
            dst_net = layers.EmbeddingLayer(
                dst_net,
                l.input_size,
                l.output_size,
                W=l.W,
                name=l.name+'2',
            )
        elif type(l) == layers.LSTMLayer:
            dst_net = layers.LSTMLayer(
                dst_net,
                l.num_units,
                ingate=layers.Gate(
                    W_in=l.W_in_to_ingate,
                    W_hid=l.W_hid_to_ingate,
                    W_cell=l.W_cell_to_ingate,
                    b=l.b_ingate,
                    nonlinearity=l.nonlinearity_ingate
                ),
                forgetgate=layers.Gate(
                    W_in=l.W_in_to_forgetgate,
                    W_hid=l.W_hid_to_forgetgate,
                    W_cell=l.W_cell_to_forgetgate,
                    b=l.b_forgetgate,
                    nonlinearity=l.nonlinearity_forgetgate
                ),
                cell=layers.Gate(
                    W_in=l.W_in_to_cell,
                    W_hid=l.W_hid_to_cell,
                    W_cell=None,
                    b=l.b_cell,
                    nonlinearity=l.nonlinearity_cell
                ),
                outgate=layers.Gate(
                    W_in=l.W_in_to_outgate,
                    W_hid=l.W_hid_to_outgate,
                    W_cell=l.W_cell_to_outgate,
                    b=l.b_outgate,
                    nonlinearity=l.nonlinearity_outgate
                ),
                nonlinearity=l.nonlinearity,
                cell_init=l.cell_init,
                hid_init=l.hid_init,
                backwards=l.backwards,
                learn_init=l.learn_init,
                peepholes=l.peepholes,
                gradient_steps=l.gradient_steps,
                grad_clipping=l.grad_clipping,
                unroll_scan=l.unroll_scan,
                precompute_input=l.precompute_input,
                # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input'
                name=l.name+'2',
                mask_input=mask_input,
            )
        elif type(l) == layers.SliceLayer:
            dst_net = layers.SliceLayer(
                dst_net,
                indices=l.slice,
                axis=l.axis,
                name=l.name+'2',
            )
        else:
            raise ValueError("Unhandled layer: {}".format(l))
        new_layer = layers.get_all_layers(dst_net)[-1]
        logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name))

    logger.info("Result of cloning:")
    for l in layers.get_all_layers(dst_net):
        logger.info(" - {} ({}):".format(l.name, l))

    return dst_net
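
# --- Usage sketch (added; not part of the original example) ---
# Building a small source network and cloning it onto a second input with a
# mask. Layer sizes and names are assumptions; every layer is given a name,
# since clone() derives the new layer names from them.
import theano.tensor as T
from lasagne import layers

l_in = layers.InputLayer((None, None), input_var=T.imatrix('x1'), name='in')
src = layers.EmbeddingLayer(l_in, input_size=1000, output_size=64, name='emb')
src = layers.LSTMLayer(src, 128, name='lstm')
src = layers.SliceLayer(src, indices=-1, axis=1, name='slice')
src = layers.DenseLayer(src, num_units=10, name='dense')

l_in2 = layers.InputLayer((None, None), input_var=T.imatrix('x2'), name='in2')
l_mask2 = layers.InputLayer((None, None), input_var=T.matrix('mask2'), name='mask2')
dst = clone(src, l_in2, mask_input=l_mask2)   # weights stay tied to src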
Example #7
    def build_network(self,
                      vocab_size,
                      input_var,
                      mask_var,
                      docidx_var,
                      docidx_mask,
                      skip_connect=True):

        l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)

        l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)

        l_embed = L.EmbeddingLayer(l_in,
                                   input_size=vocab_size,
                                   output_size=EMBED_DIM,
                                   W=self.params['W_emb'])

        l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

        # NOTE: Moved initialization of forget gate biases to init_params
        #forget_gate_1 = L.Gate(b=lasagne.init.Constant(3))
        #forget_gate_2 = L.Gate(b=lasagne.init.Constant(3))

        # NOTE: LSTM layer provided by Lasagne is slightly different from that used in DeepMind's paper.
        # In the paper the cell-to-* weights are not diagonal.
        # the 1st lstm layer
        in_gate = L.Gate(W_in=self.params['W_lstm1_xi'],
                         W_hid=self.params['W_lstm1_hi'],
                         W_cell=self.params['W_lstm1_ci'],
                         b=self.params['b_lstm1_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'],
                             W_hid=self.params['W_lstm1_hf'],
                             W_cell=self.params['W_lstm1_cf'],
                             b=self.params['b_lstm1_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm1_xo'],
                          W_hid=self.params['W_lstm1_ho'],
                          W_cell=self.params['W_lstm1_co'],
                          b=self.params['b_lstm1_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'],
                           W_hid=self.params['W_lstm1_hc'],
                           W_cell=None,
                           b=self.params['b_lstm1_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_1 = L.LSTMLayer(l_embed_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # the 2nd lstm layer
        if skip_connect:
            # construct skip connection from the lookup table to the 2nd layer
            batch_size, seq_len, _ = input_var.shape
            # concatenate the last dimension of l_fwd_1 and embed
            l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
            l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
            to_next_layer = L.ReshapeLayer(
                L.concat([l_fwd_1_shp, l_embed_shp], axis=1),
                (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
        else:
            to_next_layer = l_fwd_1

        to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

        in_gate = L.Gate(W_in=self.params['W_lstm2_xi'],
                         W_hid=self.params['W_lstm2_hi'],
                         W_cell=self.params['W_lstm2_ci'],
                         b=self.params['b_lstm2_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'],
                             W_hid=self.params['W_lstm2_hf'],
                             W_cell=self.params['W_lstm2_cf'],
                             b=self.params['b_lstm2_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm2_xo'],
                          W_hid=self.params['W_lstm2_ho'],
                          W_cell=self.params['W_lstm2_co'],
                          b=self.params['b_lstm2_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'],
                           W_hid=self.params['W_lstm2_hc'],
                           W_cell=None,
                           b=self.params['b_lstm2_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_2 = L.LSTMLayer(to_next_layer_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # slice final states of both lstm layers
        l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
        l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)

        # g will be used to score the words based on their embeddings
        g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1),
                         num_units=EMBED_DIM,
                         W=self.params['W_dense'],
                         b=self.params['b_dense'],
                         nonlinearity=lasagne.nonlinearities.tanh)

        ## get outputs
        #g_out = L.get_output(g) # B x D
        #g_out_val = L.get_output(g, deterministic=True) # B x D

        ## compute softmax probs
        #probs,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs = probs.reshape(docidx_var.shape) # B x N
        #probs_val,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out_val,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs_val = probs_val.reshape(docidx_var.shape) # B x N
        #return predicted_probs, predicted_probs_val

        # W is shared with the lookup table
        l_out = L.DenseLayer(g,
                             num_units=vocab_size,
                             W=self.params['W_emb'].T,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             b=None)
        return l_out
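
# --- Weight-tying sketch (added; not part of the original example) ---
# The classifier above reuses the embedding table as its softmax weights via
# W=self.params['W_emb'].T. A standalone version of the same trick; sizes and
# names are made up, and it assumes a Lasagne version that accepts Theano
# expressions as layer parameters (which the snippet above already relies on).
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

VOCAB, EMBED = 1000, 128
W_emb = theano.shared(lasagne.init.Normal(0.01)((VOCAB, EMBED)), name='W_emb')

l_in = L.InputLayer((None, None), input_var=T.imatrix('tokens'))
l_emb = L.EmbeddingLayer(l_in, input_size=VOCAB, output_size=EMBED, W=W_emb)
l_last = L.SliceLayer(l_emb, indices=-1, axis=1)      # embedding of the last token
# The output layer shares the (transposed) embedding matrix, adding no new softmax weights.
l_out = L.DenseLayer(l_last, num_units=VOCAB, W=W_emb.T, b=None,
                     nonlinearity=lasagne.nonlinearities.softmax)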
Example #8
    def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

        batch_size = self.mask_context_var.shape[0]
        context_len = self.mask_context_var.shape[1]
        question_len = self.question_var.shape[1]
        context_word_len = self.context_char_var.shape[2]
        question_word_len = self.question_char_var.shape[2]

        self.batch_size = batch_size
        self.context_len = context_len
        ''' Inputs and word embeddings'''

        l_context_char = LL.InputLayer(shape=(None, None, None),
                                       input_var=self.context_char_var)
        l_question_char = LL.InputLayer(shape=(None, None, None),
                                        input_var=self.question_char_var)

        l_c_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_context_var)
        l_q_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_question_var)

        l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_context_char_var)
        l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_question_char_var)

        l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.context_var)
        l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.question_var)

        if self.train_unk:
            l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_context_unk_var)
            l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_question_unk_var)

            l_c_emb = TrainUnkLayer(l_c_emb,
                                    l_c_unk_mask,
                                    output_size=self.emb_size,
                                    W=self.word_embeddings[0])

            l_q_emb = TrainUnkLayer(l_q_emb,
                                    l_q_unk_mask,
                                    output_size=self.emb_size,
                                    W=l_c_emb.W)

        if self.negative:
            l_c_emb = TrainNAWLayer(l_c_emb,
                                    l_c_mask,
                                    output_size=self.emb_size)
        ''' Char-embeddings '''

        # (batch_size x context_len x context_word_len x emb_char_size)
        l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size)

        l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size,
                                         W=l_c_char_emb.W)

        # here I do multiplication of character embeddings with masks,
        # because I want to pad them with constant zeros

        l_c_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
        l_q_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

        l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask],
                                             T.mul)
        l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask],
                                             T.mul)

        # convolutions

        l_c_char_emb = LL.dimshuffle(
            LL.reshape(l_c_char_emb, (batch_size * context_len,
                                      context_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       pad=self.conv)
        # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)

        l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_c_char_emb = LL.reshape(
            l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

        l_q_char_emb = LL.dimshuffle(
            LL.reshape(l_q_char_emb, (batch_size * question_len,
                                      question_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       W=l_c_char_conv.W,
                                       b=l_c_char_conv.b,
                                       pad=self.conv)
        # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)

        l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_q_char_emb = LL.reshape(
            l_q_char_emb,
            (batch_size, question_len, self.num_emb_char_filters))
        ''' Concatenating both embeddings '''

        l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
        l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

        # originally I had dropout here
        ''' Highway layer allowing for interaction between embeddings '''

        l_c_P = LL.reshape(l_c_emb,
                           (batch_size * context_len,
                            self.emb_size + self.num_emb_char_filters))
        l_c_P = LL.DenseLayer(l_c_P,
                              num_units=self.rec_size,
                              b=None,
                              nonlinearity=None)

        l_c_high = HighwayLayer(l_c_P)
        l_c_emb = LL.reshape(l_c_high,
                             (batch_size, context_len, self.rec_size))

        l_q_P = LL.reshape(l_q_emb,
                           (batch_size * question_len,
                            self.emb_size + self.num_emb_char_filters))
        l_q_P = LL.DenseLayer(l_q_P,
                              num_units=self.rec_size,
                              W=l_c_P.W,
                              b=None,
                              nonlinearity=None)

        l_q_high = HighwayLayer(l_q_P,
                                W1=l_c_high.W1,
                                b1=l_c_high.b1,
                                W2=l_c_high.W2,
                                b2=l_c_high.b2)
        l_q_emb = LL.reshape(l_q_high,
                             (batch_size, question_len, self.rec_size))
        ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

        l_weighted_feat = WeightedFeatureLayer(
            [l_c_emb, l_q_emb, l_c_mask, l_q_mask])  # batch_size x context_len
        l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

        # batch_size x context_len
        l_bin_feat = LL.InputLayer(shape=(None, None),
                                   input_var=self.bin_feat_var)
        l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))
        ''' Dropout at the embeddings '''

        if emb_dropout:
            print('Using dropout after wiq calculation.')
            l_c_emb = LL.dropout(l_c_emb)
            l_q_emb = LL.dropout(l_q_emb)
        ''' Here we concatenate wiq features to embeddings'''

        # both features are concatenated to the embeddings
        # for the question we fix the features to 1
        l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
        l_q_emb = LL.pad(l_q_emb,
                         width=[(0, 2)],
                         val=L.utils.floatX(1),
                         batch_ndim=2)
        ''' Context and question encoding using the same BiLSTM for both '''

        # output shape is (batch_size x context_len x rec_size)
        l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask)

        l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask,
                                    backwards=True)

        # output shape is (batch_size x question_len x rec_size)
        l_q_enc_forw = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                           W_hid=l_c_enc_forw.W_hid_to_ingate,
                           W_cell=l_c_enc_forw.W_cell_to_ingate,
                           b=l_c_enc_forw.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                               W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                               W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                               b=l_c_enc_forw.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                            W_hid=l_c_enc_forw.W_hid_to_outgate,
                            W_cell=l_c_enc_forw.W_cell_to_outgate,
                            b=l_c_enc_forw.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                         W_hid=l_c_enc_forw.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_forw.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        l_q_enc_back = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            backwards=True,
            ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                           W_hid=l_c_enc_back.W_hid_to_ingate,
                           W_cell=l_c_enc_back.W_cell_to_ingate,
                           b=l_c_enc_back.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                               W_hid=l_c_enc_back.W_hid_to_forgetgate,
                               W_cell=l_c_enc_back.W_cell_to_forgetgate,
                               b=l_c_enc_back.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                            W_hid=l_c_enc_back.W_hid_to_outgate,
                            W_cell=l_c_enc_back.W_cell_to_outgate,
                            b=l_c_enc_back.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                         W_hid=l_c_enc_back.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_back.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        # batch_size x context_len  x 2*rec_size
        l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
        # batch_size x question_len x 2*rec_size
        l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

        def proj_init():
            return np.vstack([
                np.eye(self.rec_size, dtype=theano.config.floatX),
                np.eye(self.rec_size, dtype=theano.config.floatX)
            ])

        # this is H from the paper, shape: (batch_size * context_len x
        # rec_size)
        l_c_proj = LL.reshape(l_c_enc,
                              (batch_size * context_len, 2 * self.rec_size))
        l_c_proj = LL.DenseLayer(l_c_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)

        # this is Z from the paper, shape: (batch_size * question_len x
        # rec_size)
        l_q_proj = LL.reshape(l_q_enc,
                              (batch_size * question_len, 2 * self.rec_size))
        l_q_proj = LL.DenseLayer(l_q_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)
        ''' Additional, weighted question encoding (alphas from paper) '''

        l_alpha = LL.DenseLayer(
            l_q_proj,  # batch_size * question_len x 1
            num_units=1,
            b=None,
            nonlinearity=None)

        # batch_size x question_len
        l_alpha = MaskedSoftmaxLayer(
            LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

        # batch_size x rec_size
        l_z_hat = BatchedDotLayer([
            LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
            l_alpha
        ])

        return l_c_proj, l_z_hat
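
# --- Char-CNN embedding sketch (added; not part of the original example) ---
# The character-embedding block above (reshape, 1-D convolution over the
# characters of each word, max over time, reshape back) isolated with fixed,
# made-up sizes instead of the symbolic shapes used in the original.
import theano.tensor as T
import lasagne
import lasagne.layers as LL

BATCH, SEQ, WORD, ALPHABET, EMB_CHAR, N_FILT = 8, 20, 12, 50, 16, 32

l_chars = LL.InputLayer((BATCH, SEQ, WORD), input_var=T.itensor3('chars'))
l_emb = LL.EmbeddingLayer(l_chars, input_size=ALPHABET, output_size=EMB_CHAR)
# Fold the words into the batch axis and move the embedding to the channel axis.
l_emb = LL.dimshuffle(LL.reshape(l_emb, (BATCH * SEQ, WORD, EMB_CHAR)), (0, 2, 1))
l_conv = LL.Conv1DLayer(l_emb, num_filters=N_FILT, filter_size=5,
                        nonlinearity=lasagne.nonlinearities.tanh, pad='full')
# Max over time gives one vector per word; unfold back to (batch, seq, filters).
l_word = LL.ExpressionLayer(l_conv, lambda X: X.max(2), output_shape='auto')
l_word = LL.reshape(l_word, (BATCH, SEQ, N_FILT))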
Example #9
def l1lstm_l2d(input_dim,
               output_dim,
               n_hidden,
               nonlinearity=lasagne.nonlinearities.tanh,
               layer_type=layers.LSTMLayer,
               learning_rate=1e-4,
               wl2=0.,
               wl1=0,
               r_reg_coeff=0.,
               grad_clipping=0.,
               bidirectional=False,
               loss_type='MSE',
               skip_connection=False,
               **kwargs):

    # Specify the number of steps used before computing the gradient
    if 'gradient_steps' not in kwargs:
        gradient_steps = -1
    else:
        gradient_steps = kwargs.pop('gradient_steps')

    target = T.tensor3()
    target.name = 'target'

    # Input Layer
    l_in = layers.InputLayer((None, None, input_dim))
    input_layer = l_in
    if bidirectional:
        input_layer_b = l_in

    if skip_connection:
        # Input to output connection
        l_in_to_out = lasagne.layers.DenseLayer(lasagne.layers.ReshapeLayer(
            l_in, (-1, input_dim)),
                                                output_dim,
                                                nonlinearity=None,
                                                name='in_to_out')

    b_size, seqlen, _ = l_in.input_var.shape

    lstm_layers = (layers.LSTMLayer, c_layers.MILSTMLayer,
                   c_layers.BatchNormLSTMLayer)
    gru_layers = (layers.GRULayer, c_layers.MIGRULayer)
    if layer_type in lstm_layers:
        print('Using {0}'.format(layer_type))
        name = 'lstm'
        l_r_f = layer_type(incoming=input_layer,
                           num_units=n_hidden,
                           nonlinearity=nonlinearity,
                           gradient_steps=gradient_steps,
                           name=name,
                           **kwargs)
        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(incoming=input_layer_b,
                               num_units=n_hidden,
                               nonlinearity=nonlinearity,
                               gradient_steps=gradient_steps,
                               name=name + '_b',
                               backwards=True,
                               **kwargs)

    elif layer_type is layers.GRULayer:
        print('Using {0}'.format(layer_type))
        name = 'gru'
        l_r_f = layer_type(
            incoming=input_layer,
            num_units=n_hidden,
            hidden_update=layers.Gate(nonlinearity=nonlinearity),
            gradient_steps=gradient_steps,
            name=name,
            **kwargs)

        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(
                incoming=input_layer_b,
                num_units=n_hidden,
                hidden_update=layers.Gate(nonlinearity=nonlinearity),
                gradient_steps=gradient_steps,
                name=name + '_b',
                backwards=True,
                **kwargs)

    elif layer_type is c_layers.MIGRULayer:
        print('Using {0}'.format(layer_type))
        name = 'gru'
        l_r_f = layer_type(
            incoming=input_layer,
            num_units=n_hidden,
            hidden_update=c_layers.MIGate(nonlinearity=nonlinearity),
            gradient_steps=gradient_steps,
            name=name,
            **kwargs)
        if bidirectional:
            print('Using bidirectional network')
            l_r_b = layer_type(
                incoming=input_layer_b,
                num_units=n_hidden,
                hidden_update=c_layers.MIGate(nonlinearity=nonlinearity),
                gradient_steps=gradient_steps,
                name=name + '_b',
                backwards=True,
                **kwargs)

    else:
        raise ValueError('Invalid layer_type {0}'.format(layer_type))

    l_concat = l_r_f
    out_shape = n_hidden

    if bidirectional:
        print('Concatenating Forward and Backward recurrent layers')
        l_concat = layers.ConcatLayer((l_concat, l_r_b), axis=-1)
        out_shape = out_shape + n_hidden
    l_re = layers.ReshapeLayer(l_concat, (-1, out_shape), name='reshape')

    if loss_type == 'MSE':
        print('Using MSE')
    l_d = layers.DenseLayer(l_re, output_dim, nonlinearity=None, name='dense')

    if skip_connection:
        # Combine input_to_output and hidden to output layers
        l_output = lasagne.layers.ElemwiseSumLayer([l_in_to_out, l_d])

    else:
        l_output = l_d

    if kwargs.get('only_return_final', False):
        out_shape = (b_size, 1, output_dim)
    else:
        out_shape = (b_size, seqlen, output_dim)

    l_out = layers.ReshapeLayer(l_output, out_shape)

    deterministic_out = layers.get_output(l_out, deterministic=True)
    deterministic_out.name = 'deterministic out'
    stochastic_out = layers.get_output(l_out)
    stochastic_out.name = 'stochastic out'

    params = layers.get_all_params(l_out, trainable=True)

    if layer_type in lstm_layers:
        # Get regularizable parameters of the LSTM
        reg_params_norm = [
            l_r_f.W_in_to_cell, l_r_f.W_in_to_forgetgate, l_r_f.W_in_to_ingate,
            l_r_f.W_in_to_outgate
        ]
        reg_params_rec = [
            l_r_f.W_hid_to_cell, l_r_f.W_hid_to_forgetgate,
            l_r_f.W_hid_to_ingate, l_r_f.W_hid_to_outgate
        ]
        if bidirectional:
            reg_params_norm += [
                l_r_b.W_in_to_cell, l_r_b.W_in_to_forgetgate,
                l_r_b.W_in_to_ingate, l_r_b.W_in_to_outgate
            ]
            reg_params_rec += [
                l_r_b.W_hid_to_cell, l_r_b.W_hid_to_forgetgate,
                l_r_b.W_hid_to_ingate, l_r_b.W_hid_to_outgate
            ]
    elif layer_type in gru_layers:
        # Get regularizable parameters of the GRU
        reg_params_norm = [
            l_r_f.W_in_to_updategate, l_r_f.W_in_to_resetgate,
            l_r_f.W_in_to_hidden_update
        ]
        reg_params_rec = [
            l_r_f.W_hid_to_updategate, l_r_f.W_hid_to_resetgate,
            l_r_f.W_hid_to_hidden_update
        ]

        if bidirectional:
            reg_params_norm += [
                l_r_b.W_in_to_updategate, l_r_b.W_in_to_resetgate,
                l_r_b.W_in_to_hidden_update
            ]
            reg_params_rec += [
                l_r_b.W_hid_to_updategate, l_r_b.W_hid_to_resetgate,
                l_r_b.W_hid_to_hidden_update
            ]

    if wl2 > 0:
        print('Using L2 norm regularization')
        weight_reg = wl2 * (sum([T.mean(p**2)
                                 for p in reg_params_norm]) + T.mean(l_d.W**2))

        if skip_connection:
            weight_reg += wl2 * T.mean(l_in_to_out.W**2)

    else:
        weight_reg = 0.

    if wl1 > 0:
        print('Using L1 norm regularization')
        # L1 penalty: mean absolute value of the input-to-hidden and output weights
        weight_reg += wl1 * (sum([T.mean(abs(p))
                                  for p in reg_params_norm]) + T.mean(abs(l_d.W)))
        if skip_connection:
            weight_reg += wl1 * T.mean(abs(l_in_to_out.W))

    if r_reg_coeff > 0:
        print('Using hid to hid eigenvalue regularization')
        weight_reg += r_reg_coeff * sum(
            [T.mean((T.nlinalg.eigh(p)[0] - 1.)**2) for p in reg_params_rec])

    stochastic_loss = (
        lasagne.objectives.squared_error(stochastic_out, target).mean() +
        weight_reg)

    stochastic_loss.name = 'stochastic MSE (regularized)'

    deterministic_loss = T.mean(
        lasagne.objectives.squared_error(deterministic_out, target))
    deterministic_loss.name = 'MSE'

    updates = lasagne.updates.rmsprop(stochastic_loss,
                                      params,
                                      learning_rate=learning_rate)

    train_loss = [stochastic_loss]
    valid_loss = [deterministic_loss]

    return dict(l_in=l_in,
                l_out=l_out,
                train_loss=train_loss,
                valid_loss=valid_loss,
                target=target,
                updates=updates,
                predictions=deterministic_out,
                gradient_steps=gradient_steps,
                model_type='RNN')
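
# --- Usage sketch (added; not part of the original example) ---
# Compiling train and prediction functions from the dictionary returned by
# l1lstm_l2d. The sizes and hyperparameters below are assumptions.
import theano

model = l1lstm_l2d(input_dim=10, output_dim=3, n_hidden=64,
                   learning_rate=1e-3, wl2=1e-4)

train_fn = theano.function([model['l_in'].input_var, model['target']],
                           model['train_loss'], updates=model['updates'])
predict_fn = theano.function([model['l_in'].input_var], model['predictions'])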