Example #1
def build_rnn(x_sym, hid_init_sym, hid2_init_sym, seq_length, vocab_size,
              rnn_size):

    l_input = L.InputLayer(input_var=x_sym, shape=(None, seq_length))
    l_input_hid = L.InputLayer(input_var=hid_init_sym, shape=(None, rnn_size))
    l_input_hid2 = L.InputLayer(input_var=hid2_init_sym,
                                shape=(None, rnn_size))

    l_input = L.EmbeddingLayer(l_input,
                               input_size=vocab_size,
                               output_size=rnn_size)

    l_rnn = L.LSTMLayer(l_input, num_units=rnn_size,
                        hid_init=l_input_hid)  #, cell_init=l_init_cell)
    h = L.DropoutLayer(l_rnn, p=dropout_prob)
    l_rnn2 = L.LSTMLayer(h, num_units=rnn_size,
                         hid_init=l_input_hid2)  #, cell_init=l_init_cell2)
    h = L.DropoutLayer(l_rnn2, p=dropout_prob)

    # Before the decoder layer, we need to reshape the sequence into the batch dimension,
    # so that timesteps are decoded independently.
    l_shp = L.ReshapeLayer(h, (-1, rnn_size))

    pred = NCELayer(l_shp, num_units=vocab_size, Z=Z)
    pred = L.ReshapeLayer(pred, (-1, seq_length, vocab_size))
    return l_rnn, l_rnn2, pred
Example #2
    def build_network(self, vocab_size, input_var, mask_var, W_init):

        l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
        l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
        l_embed = L.EmbeddingLayer(l_in,
                                   input_size=vocab_size,
                                   output_size=EMBED_DIM,
                                   W=W_init)

        l_fwd_1 = L.LSTMLayer(l_embed,
                              NUM_HIDDEN,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)
        l_bkd_1 = L.LSTMLayer(l_embed,
                              NUM_HIDDEN,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True,
                              backwards=True)

        l_all_1 = L.concat([l_fwd_1, l_bkd_1], axis=2)

        l_fwd_2 = L.LSTMLayer(l_all_1,
                              NUM_HIDDEN,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)
        l_bkd_2 = L.LSTMLayer(l_all_1,
                              NUM_HIDDEN,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True,
                              backwards=True)

        l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
        l_bkd_1_slice = L.SliceLayer(l_bkd_1, 0, 1)
        y_1 = L.ElemwiseSumLayer([l_fwd_1_slice, l_bkd_1_slice])

        l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)
        l_bkd_2_slice = L.SliceLayer(l_bkd_2, 0, 1)
        y_2 = L.ElemwiseSumLayer([l_fwd_2_slice, l_bkd_2_slice])

        y = L.concat([y_1, y_2], axis=1)
        g = L.DenseLayer(y,
                         num_units=EMBED_DIM,
                         nonlinearity=lasagne.nonlinearities.tanh)
        l_out = L.DenseLayer(g,
                             num_units=vocab_size,
                             W=l_embed.W.T,
                             nonlinearity=lasagne.nonlinearities.softmax)

        return l_out
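
The snippet above only builds the graph. A minimal training sketch, assuming a single answer-word id per example and reusing the same input/mask variables (the target variable and learning rate are assumptions, not from the source):

import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

def compile_train_fn(l_out, input_var, mask_var, learning_rate=1e-3):
    # one target word id per example (assumption)
    target_var = T.ivector('targets')
    probs = L.get_output(l_out)                                   # (batch, vocab_size)
    loss = lasagne.objectives.categorical_crossentropy(probs, target_var).mean()
    params = L.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)
    return theano.function([input_var, mask_var, target_var], loss, updates=updates)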
Example #3
    def __init__(
        self,
        n_words,
        dim_emb,
        num_units,
        n_classes,
        w_emb=None,
        dropout=0.2,
        use_final=False,
        lr=0.001,
        pretrain=None,
    ):
        self.n_words = n_words
        self.dim_emb = dim_emb
        self.num_units = num_units
        self.n_classes = n_classes
        self.lr = lr

        if w_emb is None:
            w_emb = init.Normal()

        self.l_x = layers.InputLayer((None, None))
        self.l_m = layers.InputLayer((None, None))
        self.l_emb = layers.EmbeddingLayer(self.l_x, n_words, dim_emb, W=w_emb)
        self.l_ebd = self.l_emb

        if dropout:
            self.l_emb = layers.dropout(self.l_emb, dropout)

        if use_final:
            self.l_enc = layers.LSTMLayer(self.l_emb,
                                          num_units,
                                          mask_input=self.l_m,
                                          only_return_final=True,
                                          grad_clipping=10.0,
                                          gradient_steps=400)
            self.l_rnn = self.l_enc
        else:
            self.l_enc = layers.LSTMLayer(self.l_emb,
                                          num_units,
                                          mask_input=self.l_m,
                                          only_return_final=False,
                                          grad_clipping=10.0,
                                          gradient_steps=400)
            self.l_rnn = self.l_enc
            self.l_enc = MeanLayer(self.l_enc, self.l_m)

        if dropout:
            self.l_enc = layers.dropout(self.l_enc, dropout)

        self.l_y = layers.DenseLayer(self.l_enc,
                                     n_classes,
                                     nonlinearity=nonlinearities.softmax)

        if pretrain:
            self.load_pretrain(pretrain)
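
MeanLayer is referenced above but not defined in this snippet. A hedged sketch of a masked mean-over-time MergeLayer that is consistent with how it is used (the implementation in the source may differ):

import theano.tensor as T
from lasagne import layers

class MeanLayer(layers.MergeLayer):
    """Average LSTM outputs over time, ignoring padded positions."""
    def __init__(self, incoming, mask_input, **kwargs):
        super(MeanLayer, self).__init__([incoming, mask_input], **kwargs)

    def get_output_shape_for(self, input_shapes):
        # (batch, time, units) -> (batch, units)
        return (input_shapes[0][0], input_shapes[0][2])

    def get_output_for(self, inputs, **kwargs):
        h, mask = inputs              # h: (batch, time, units), mask: (batch, time)
        mask = mask.dimshuffle(0, 1, 'x')
        return (h * mask).sum(axis=1) / T.maximum(mask.sum(axis=1), 1.0)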
Example #4
def recurrent(input_var=None,
              num_units=512,
              batch_size=64,
              seq_length=1,
              grad_clip=100):
    recurrent = []

    theano_rng = RandomStreams(rng.randint(2**15))
    # we want noise to match tanh range of activation ([-1,1])
    noise = theano_rng.uniform(size=(batch_size, seq_length, num_units),
                               low=-1.0,
                               high=1.0)
    input_var = noise if input_var is None else input_var

    recurrent.append(
        ll.InputLayer(shape=(batch_size, seq_length, num_units),
                      input_var=input_var))

    recurrent.append(
        ll.LSTMLayer(recurrent[-1], num_units,
                     grad_clipping=grad_clip))  #tanh is default

    recurrent.append(ll.SliceLayer(recurrent[-1], -1, 1))

    recurrent.append(ll.ReshapeLayer(recurrent[-1], ([0], 1, [1])))

    for layer in recurrent:
        print(layer.output_shape)
    print("")

    return recurrent
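
A hedged usage sketch: when input_var is left as None, the only stochastic input is the RandomStreams noise, so a feature sampler can be compiled with no explicit inputs (the function name is illustrative):

import theano
import lasagne.layers as ll

def compile_sampler(recurrent_layers):
    # recurrent_layers is the list returned by recurrent() above
    feats = ll.get_output(recurrent_layers[-1])   # (batch_size, 1, num_units)
    return theano.function([], feats)             # noise is drawn inside the graph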
Example #5
    def __init__(self,
                 input,
                 n_hidden=500,
                 grad_clip=100.,
                 only_return_final=True):

        self.input = input

        gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                      W_hid=lasagne.init.Orthogonal(),
                                      b=initialize_parameters()[1])

        cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                      W_hid=lasagne.init.Orthogonal(),
                                      W_cell=None,
                                      b=initialize_parameters()[1],
                                      nonlinearity=lasagne.nonlinearities.tanh)

        self.output = layers.LSTMLayer(self.input,
                                       n_hidden,
                                       ingate=gate_parameters,
                                       forgetgate=gate_parameters,
                                       cell=cell_parameters,
                                       outgate=gate_parameters,
                                       grad_clipping=grad_clip,
                                       only_return_final=only_return_final)
Example #6
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers,
             dropout, batch_size):
    l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    l_embed = L.EmbeddingLayer(l_input,
                               vocabulary_size,
                               hidden_size,
                               W=init.Uniform(1.0))
    l_lstms = []
    for i in range(num_layers):
        l_lstm = L.LSTMLayer(l_embed if i == 0 else l_lstms[-1],
                             hidden_size,
                             ingate=L.Gate(W_in=init.GlorotUniform(),
                                           W_hid=init.Orthogonal()),
                             forgetgate=L.Gate(W_in=init.GlorotUniform(),
                                               W_hid=init.Orthogonal(),
                                               b=init.Constant(1.0)),
                             cell=L.Gate(
                                 W_in=init.GlorotUniform(),
                                 W_hid=init.Orthogonal(),
                                 W_cell=None,
                                 nonlinearity=lasagne.nonlinearities.tanh),
                             outgate=L.Gate(W_in=init.GlorotUniform(),
                                            W_hid=init.Orthogonal()))
        l_lstms.append(l_lstm)
    l_drop = L.DropoutLayer(l_lstms[-1], dropout)
    l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
    l_out = L.ReshapeLayer(
        l_out,
        (l_out.output_shape[0] * l_out.output_shape[1], l_out.output_shape[2]))
    l_out = L.NonlinearityLayer(l_out,
                                nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
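
Since l_out emits a flattened (batch_size * seq_len, vocabulary_size) softmax, the targets have to be flattened the same way. A hedged loss sketch (treating target_var as an imatrix of next-word ids is an assumption):

import lasagne
import lasagne.layers as L

def ptb_loss(l_out, target_var):
    probs = L.get_output(l_out)          # (batch_size * seq_len, vocabulary_size)
    targets = target_var.flatten()       # (batch_size * seq_len,)
    return lasagne.objectives.categorical_crossentropy(probs, targets).mean()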
Example #7
def layer_LSTM(l_hid,
               hiddensize,
               nonlinearity,
               backwards=False,
               grad_clipping=50,
               name=""):
    '''
    That's a custom LSTM layer that seems to converge faster.
    '''
    ingate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                     W_hid=lasagne.init.Orthogonal(1.0))
    forgetgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                         W_hid=lasagne.init.Orthogonal(1.0))
    outgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0),
                      W_hid=lasagne.init.Orthogonal(1.0))
    cell = ll.Gate(W_cell=None,
                   W_in=lasagne.init.Orthogonal(1.0),
                   W_hid=lasagne.init.Orthogonal(1.0),
                   nonlinearity=nonlinearity)
    # The final nonlinearity should be tanh, otherwise it doesn't converge (why?)
    # by default peepholes=True
    fwd = ll.LSTMLayer(l_hid,
                       num_units=hiddensize,
                       backwards=backwards,
                       ingate=ingate,
                       forgetgate=forgetgate,
                       outgate=outgate,
                       cell=cell,
                       grad_clipping=grad_clipping,
                       nonlinearity=lasagne.nonlinearities.tanh,
                       name=name)

    return fwd
Example #8
def makeRNN(xInputRNN, hiddenInitRNN, hidden2InitRNN, sequenceLen, vocabularySize, neuralNetworkSz):

	input_Layer = L.InputLayer(input_var = xInputRNN, shape = (None, sequenceLen))
	hidden_Layer = L.InputLayer(input_var = hiddenInitRNN, shape = (None, neuralNetworkSz))
	hidden_Layer2 = L.InputLayer(input_var = hidden2InitRNN, shape = (None, neuralNetworkSz))
	input_Layer = L.EmbeddingLayer(input_Layer, input_size = vocabularySize, output_size = neuralNetworkSz)

	RNN_Layer = L.LSTMLayer(input_Layer, num_units = neuralNetworkSz, hid_init = hidden_Layer)
	h = L.DropoutLayer(RNN_Layer, p = dropOutProbability)
	RNN_Layer2 = L.LSTMLayer(h, num_units = neuralNetworkSz, hid_init = hidden_Layer2)
	h = L.DropoutLayer(RNN_Layer2, p = dropOutProbability)

	layerShape = L.ReshapeLayer(h, (-1, neuralNetworkSz))
	
	predictions = NCE(layerShape, num_units = vocabularySize, Z = Z)
	predictions = L.ReshapeLayer(predictions, (-1, sequenceLen, vocabularySize))
	return RNN_Layer, RNN_Layer2, predictions
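
A hedged sketch of how the returned layers might be used to carry the LSTM hidden states across consecutive batches (truncated BPTT); it assumes the NCE output behaves like per-timestep probabilities:

import theano
import lasagne.layers as L

def compile_stateful_fn(xInputRNN, hiddenInitRNN, hidden2InitRNN,
                        RNN_Layer, RNN_Layer2, predictions):
    probs, h1, h2 = L.get_output([predictions, RNN_Layer, RNN_Layer2])
    # feed the last timestep of each LSTM back in as the next initial state
    return theano.function([xInputRNN, hiddenInitRNN, hidden2InitRNN],
                           [probs, h1[:, -1], h2[:, -1]])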
Example #9
    def __init__(self,
                 incoming,
                 num_units,
                 mask_input,
                 grad_clipping=0,
                 **kwargs):

        incomings = [incoming, mask_input]
        super(MeanLstmLayer, self).__init__(incomings)
        self.num_units = num_units
        self.lstm_layer = layers.LSTMLayer(incoming,
                                           num_units=self.num_units,
                                           mask_input=mask_input,
                                           grad_clipping=grad_clipping,
                                           **kwargs)
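
The class above is cut off after __init__. A hedged guess at the remaining methods, delegating to the wrapped LSTMLayer and exposing its parameters so they are picked up by get_all_params (the real implementation may differ):

import theano.tensor as T
from lasagne import layers

class MeanLstmLayer(layers.MergeLayer):
    def __init__(self, incoming, num_units, mask_input, grad_clipping=0, **kwargs):
        super(MeanLstmLayer, self).__init__([incoming, mask_input])
        self.num_units = num_units
        self.lstm_layer = layers.LSTMLayer(incoming, num_units=num_units,
                                           mask_input=mask_input,
                                           grad_clipping=grad_clipping, **kwargs)

    def get_params(self, **kwargs):
        # expose the inner LSTM's weights for training and regularization
        return self.lstm_layer.get_params(**kwargs)

    def get_output_shape_for(self, input_shapes):
        return (input_shapes[0][0], self.num_units)

    def get_output_for(self, inputs, **kwargs):
        x, mask = inputs
        # the wrapped LSTM expects its inputs in the same [input, mask] order
        h = self.lstm_layer.get_output_for([x, mask], **kwargs)
        mask = mask.dimshuffle(0, 1, 'x')
        return (h * mask).sum(axis=1) / T.maximum(mask.sum(axis=1), 1.0)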
Example #10
def integrate_captions(input_var=T.imatrix()):
    '''
            :param batch_size: number of images
            :param nb_caption: number of caption used per image
    '''

    ###############################
    # Build Network Configuration #
    ###############################

    print('... Integrating captions to the model')

    # Input of the network : shape = (nb_caption, seq_length)
    network = layers.InputLayer(shape=(None, None), input_var=input_var)

    # Embedding layer : shape = (nb_caption, seq_length, 400)
    vocab_length = get_vocab_length()
    network = layers.EmbeddingLayer(network, vocab_length, output_size=400)

    # LSTM layer : shape = (nb_caption, 500)
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=lasagne.init.Constant(0.))

    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=lasagne.init.Constant(0.),
                                  nonlinearity=nonlinearities.tanh)

    network = layers.LSTMLayer(network,
                               num_units=500,
                               ingate=gate_parameters,
                               forgetgate=gate_parameters,
                               cell=cell_parameters,
                               outgate=gate_parameters,
                               grad_clipping=100.,
                               only_return_final=True)

    # Dense Layer : shape = (nb_caption, 500)
    network = layers.DenseLayer(network, num_units=500)

    # Reshape layer : shape = (nb_caption, 500, 1, 1)
    network = layers.ReshapeLayer(network, (-1, 500, 1, 1))

    return network
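
A hedged usage sketch for the builder above: compile a deterministic feature extractor so the (nb_caption, 500, 1, 1) output can be fed to the rest of the model (the variable name is illustrative):

import theano
import theano.tensor as T
from lasagne import layers

def compile_caption_encoder():
    caption_var = T.imatrix('captions')                   # (nb_caption, seq_length)
    net = integrate_captions(caption_var)
    feats = layers.get_output(net, deterministic=True)    # (nb_caption, 500, 1, 1)
    return theano.function([caption_var], feats)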
Example #11
def clone(src_net, dst_net, mask_input):
    """
    Clones a lasagne neural network, keeping weights tied.

    For all layers of src_net in turn, starting at the first:
     1. creates a copy of the layer,
     2. reuses the original objects for weights and
     3. appends the new layer to dst_net.

    InputLayers are ignored.
    Recurrent layers (LSTMLayer) are passed mask_input.
    """
    logger.info("Net to be cloned:")
    for l in layers.get_all_layers(src_net):
        logger.info(" - {} ({}):".format(l.name, l))

    logger.info("Starting to clone..")
    for l in layers.get_all_layers(src_net):
        logger.info("src_net[...]: {} ({}):".format(l.name, l))
        if type(l) == layers.InputLayer:
            logger.info(' - skipping')
            continue
        if type(l) == layers.DenseLayer:
            dst_net = layers.DenseLayer(
                dst_net,
                num_units=l.num_units,
                W=l.W,
                b=l.b,
                nonlinearity=l.nonlinearity,
                name=l.name+'2',
            )
        elif type(l) == layers.EmbeddingLayer:
            dst_net = layers.EmbeddingLayer(
                dst_net,
                l.input_size,
                l.output_size,
                W=l.W,
                name=l.name+'2',
            )
        elif type(l) == layers.LSTMLayer:
            dst_net = layers.LSTMLayer(
                dst_net,
                l.num_units,
                ingate=layers.Gate(
                    W_in=l.W_in_to_ingate,
                    W_hid=l.W_hid_to_ingate,
                    W_cell=l.W_cell_to_ingate,
                    b=l.b_ingate,
                    nonlinearity=l.nonlinearity_ingate
                ),
                forgetgate=layers.Gate(
                    W_in=l.W_in_to_forgetgate,
                    W_hid=l.W_hid_to_forgetgate,
                    W_cell=l.W_cell_to_forgetgate,
                    b=l.b_forgetgate,
                    nonlinearity=l.nonlinearity_forgetgate
                ),
                cell=layers.Gate(
                    W_in=l.W_in_to_cell,
                    W_hid=l.W_hid_to_cell,
                    W_cell=None,
                    b=l.b_cell,
                    nonlinearity=l.nonlinearity_cell
                ),
                outgate=layers.Gate(
                    W_in=l.W_in_to_outgate,
                    W_hid=l.W_hid_to_outgate,
                    W_cell=l.W_cell_to_outgate,
                    b=l.b_outgate,
                    nonlinearity=l.nonlinearity_outgate
                ),
                nonlinearity=l.nonlinearity,
                cell_init=l.cell_init,
                hid_init=l.hid_init,
                backwards=l.backwards,
                learn_init=l.learn_init,
                peepholes=l.peepholes,
                gradient_steps=l.gradient_steps,
                grad_clipping=l.grad_clipping,
                unroll_scan=l.unroll_scan,
                precompute_input=l.precompute_input,
                # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input'
                name=l.name+'2',
                mask_input=mask_input,
            )
        elif type(l) == layers.SliceLayer:
            dst_net = layers.SliceLayer(
                dst_net,
                indices=l.slice,
                axis=l.axis,
                name=l.name+'2',
            )
        else:
            raise ValueError("Unhandled layer: {}".format(l))
        new_layer = layers.get_all_layers(dst_net)[-1]
        logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name))

    logger.info("Result of cloning:")
    for l in layers.get_all_layers(dst_net):
        logger.info(" - {} ({}):".format(l.name, l))

    return dst_net
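
A hedged usage sketch for clone(): start a second branch from fresh input and mask layers so that both branches share every weight (the variable names are illustrative):

import theano.tensor as T
from lasagne import layers

def clone_with_new_inputs(src_net):
    new_input = T.imatrix('tokens_2')
    new_mask = T.matrix('mask_2')
    l_in2 = layers.InputLayer((None, None), input_var=new_input, name='input_2')
    l_mask2 = layers.InputLayer((None, None), input_var=new_mask, name='mask_2')
    tied_net = clone(src_net, l_in2, mask_input=l_mask2)
    return tied_net, new_input, new_mask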
Example #12
input_sequence = T.matrix('token sequence', 'int32')
input_mask = T.neq(input_sequence, src_voc.PAD)

target_values = T.matrix('actual next token', 'int32')
target_mask = T.neq(target_values, dst_voc.PAD)

CODE_SIZE = 512

l_in = lasagne.layers.InputLayer(shape=(None, None), input_var=input_sequence)
l_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=input_mask)

#encoder
l_emb = L.EmbeddingLayer(l_in, src_voc.len, 128)

l_rnn = L.LSTMLayer(l_emb, 256, nonlinearity=T.tanh, mask_input=l_mask)
l_rnn = L.concat([l_emb, l_rnn], axis=-1)
l_encoded = l_rnn = L.LSTMLayer(l_rnn,
                                CODE_SIZE,
                                nonlinearity=T.tanh,
                                mask_input=l_mask)

l_trans = L.InputLayer((None, None), input_var=target_values[:, :-1])
l_trans_mask = L.InputLayer((None, None), input_var=target_mask[:, :-1])

from agentnet.agent.recurrence import Recurrence
from agentnet.memory import AttentionLayer, LSTMCell
from agentnet.resolver import ProbabilisticResolver, GreedyResolver


class AutoLSTMCell:
Example #13
    def buildModel(self):
        print(' -- Building...')
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        gx_init = sparse.csr_matrix('gx', dtype='float32')
        gy_init = T.ivector('gy')
        gz_init = T.vector('gz')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        x_to_label = layers.SparseLayer(x_input, self.y.shape[1],
                                        nonlinearity=lg.nonlinearities.softmax)
        x_to_emd = layers.SparseLayer(x_input, self.embedding_size)
        W = x_to_emd.W
        x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1)
        x_concat = layers.DenseLayer(x_concat, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        pred = lgl.get_output(x_concat)
        step_loss = lgo.categorical_crossentropy(pred, y_init).mean()
        hid_loss = lgl.get_output(x_to_label)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(x_to_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = lgl.get_all_params(x_concat)
        step_updates = lg.updates.sgd(step_loss, step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init], step_loss,
                                          updates=step_updates)
        self.test_fn = theano.function([x_init], pred)

        # supervised train
        gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                  input_var=gx_init)
        gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W)
        gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver,
                                   nonlinearity=lg.nonlinearities.softmax)
        gx_pred = lgl.get_output(gx_to_emd)
        g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum()
        sup_params = lgl.get_all_params(gx_to_emd)
        sup_updates = lg.updates.sgd(g_loss, sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        # handle lstm input
        cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init)
        cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None)
        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        sub_path_batch1 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch1)
        sub_path_batch2 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch2)
        sub_path_batch3 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch3)
        sub_path_batch4 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch4)
        sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size,
                                           W=W)
        sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd2 = layers.SparseLayer(sub_path_input2,
                                           self.embedding_size, W=W)
        sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size,
                                           W=W)
        sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size,
                                           W=W)
        sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2,
                                         sub_path_emd3, sub_path_emd4], axis=1)
        sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1,
                                                      self.embedding_size),
                                               input_var=sub_path_concat)

        # lstm layer
        lstm_layer = lgl.LSTMLayer(sub_path_concat_layer,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight

        # unsupervised train
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_updates = lg.updates.sgd(reweight_loss, lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([gx_init, gy_init, gz_init,
                                        sub_path_batch1, sub_path_batch2,
                                        sub_path_batch3, sub_path_batch4,
                                        mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss, sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([gx_init, gy_init, gz_init,
                                         sub_path_batch1, sub_path_batch2,
                                         sub_path_batch3, sub_path_batch4,
                                         mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')

        print(' -- Done!')
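
A hedged sketch of how the compiled step-train function above might be driven; the batch iterator and attribute names are assumptions, not part of the source:

def train_one_epoch(model, step_batches):
    """Run one pass of the step-train objective over a list of (x, y) batches."""
    total = 0.0
    for x_batch, y_batch in step_batches:       # sparse features, one-hot labels
        total += model.step_train(x_batch, y_batch)
    return total / max(len(step_batches), 1)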
Example #14
    def build_network(self,
                      vocab_size,
                      input_var,
                      mask_var,
                      docidx_var,
                      docidx_mask,
                      skip_connect=True):

        l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)

        l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)

        l_embed = L.EmbeddingLayer(l_in,
                                   input_size=vocab_size,
                                   output_size=EMBED_DIM,
                                   W=self.params['W_emb'])

        l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

        # NOTE: Moved initialization of forget gate biases to init_params
        #forget_gate_1 = L.Gate(b=lasagne.init.Constant(3))
        #forget_gate_2 = L.Gate(b=lasagne.init.Constant(3))

        # NOTE: LSTM layer provided by Lasagne is slightly different from that used in DeepMind's paper.
        # In the paper the cell-to-* weights are not diagonal.
        # the 1st lstm layer
        in_gate = L.Gate(W_in=self.params['W_lstm1_xi'],
                         W_hid=self.params['W_lstm1_hi'],
                         W_cell=self.params['W_lstm1_ci'],
                         b=self.params['b_lstm1_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'],
                             W_hid=self.params['W_lstm1_hf'],
                             W_cell=self.params['W_lstm1_cf'],
                             b=self.params['b_lstm1_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm1_xo'],
                          W_hid=self.params['W_lstm1_ho'],
                          W_cell=self.params['W_lstm1_co'],
                          b=self.params['b_lstm1_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'],
                           W_hid=self.params['W_lstm1_hc'],
                           W_cell=None,
                           b=self.params['b_lstm1_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_1 = L.LSTMLayer(l_embed_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # the 2nd lstm layer
        if skip_connect:
            # construct skip connection from the lookup table to the 2nd layer
            batch_size, seq_len, _ = input_var.shape
            # concatenate the last dimension of l_fwd_1 and embed
            l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
            l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
            to_next_layer = L.ReshapeLayer(
                L.concat([l_fwd_1_shp, l_embed_shp], axis=1),
                (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
        else:
            to_next_layer = l_fwd_1

        to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

        in_gate = L.Gate(W_in=self.params['W_lstm2_xi'],
                         W_hid=self.params['W_lstm2_hi'],
                         W_cell=self.params['W_lstm2_ci'],
                         b=self.params['b_lstm2_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'],
                             W_hid=self.params['W_lstm2_hf'],
                             W_cell=self.params['W_lstm2_cf'],
                             b=self.params['b_lstm2_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm2_xo'],
                          W_hid=self.params['W_lstm2_ho'],
                          W_cell=self.params['W_lstm2_co'],
                          b=self.params['b_lstm2_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'],
                           W_hid=self.params['W_lstm2_hc'],
                           W_cell=None,
                           b=self.params['b_lstm2_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_2 = L.LSTMLayer(to_next_layer_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # slice final states of both lstm layers
        l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
        l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)

        # g will be used to score the words based on their embeddings
        g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1),
                         num_units=EMBED_DIM,
                         W=self.params['W_dense'],
                         b=self.params['b_dense'],
                         nonlinearity=lasagne.nonlinearities.tanh)

        ## get outputs
        #g_out = L.get_output(g) # B x D
        #g_out_val = L.get_output(g, deterministic=True) # B x D

        ## compute softmax probs
        #probs,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs = probs.reshape(docidx_var.shape) # B x N
        #probs_val,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out_val,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs_val = probs_val.reshape(docidx_var.shape) # B x N
        #return predicted_probs, predicted_probs_val

        # W is shared with the lookup table
        l_out = L.DenseLayer(g,
                             num_units=vocab_size,
                             W=self.params['W_emb'].T,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             b=None)
        return l_out
Example #15
def build_model(vocab_size,
                doc_var,
                qry_var,
                doc_mask_var,
                qry_mask_var,
                W_init=lasagne.init.Normal()):

    l_doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var)

    l_doc_embed = L.EmbeddingLayer(l_doc_in, vocab_size, EMBED_DIM, W=W_init)
    l_qry_embed = L.EmbeddingLayer(l_qry_in,
                                   vocab_size,
                                   EMBED_DIM,
                                   W=l_doc_embed.W)

    l_doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var)
    l_qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var)

    l_doc_fwd = L.LSTMLayer(l_doc_embed,
                            NUM_HIDDEN,
                            grad_clipping=GRAD_CLIP,
                            mask_input=l_doc_mask,
                            gradient_steps=GRAD_STEPS,
                            precompute_input=True)
    l_doc_bkd = L.LSTMLayer(l_doc_embed,
                            NUM_HIDDEN,
                            grad_clipping=GRAD_CLIP,
                            mask_input=l_doc_mask,
                            gradient_steps=GRAD_STEPS,
                            precompute_input=True,
                            backwards=True)
    l_qry_fwd = L.LSTMLayer(l_qry_embed,
                            NUM_HIDDEN,
                            grad_clipping=GRAD_CLIP,
                            mask_input=l_qry_mask,
                            gradient_steps=GRAD_STEPS,
                            precompute_input=True)
    l_qry_bkd = L.LSTMLayer(l_qry_embed,
                            NUM_HIDDEN,
                            grad_clipping=GRAD_CLIP,
                            mask_input=l_qry_mask,
                            gradient_steps=GRAD_STEPS,
                            precompute_input=True,
                            backwards=True)

    l_doc_fwd_slice = L.SliceLayer(l_doc_fwd, -1, 1)
    l_doc_bkd_slice = L.SliceLayer(l_doc_bkd, 0, 1)
    l_qry_fwd_slice = L.SliceLayer(l_qry_fwd, -1, 1)
    l_qry_bkd_slice = L.SliceLayer(l_qry_bkd, 0, 1)

    r = L.DenseLayer(L.ElemwiseSumLayer([l_doc_fwd_slice, l_doc_bkd_slice]),
                     num_units=NUM_HIDDEN,
                     nonlinearity=lasagne.nonlinearities.tanh)
    u = L.DenseLayer(L.ElemwiseSumLayer([l_qry_fwd_slice, l_qry_bkd_slice]),
                     num_units=NUM_HIDDEN,
                     nonlinearity=lasagne.nonlinearities.tanh)

    g = L.DenseLayer(L.concat([r, u], axis=1),
                     num_units=EMBED_DIM,
                     W=lasagne.init.GlorotNormal(),
                     nonlinearity=lasagne.nonlinearities.tanh)

    l_out = L.DenseLayer(g,
                         num_units=vocab_size,
                         W=l_doc_embed.W.T,
                         nonlinearity=lasagne.nonlinearities.softmax,
                         b=None)

    return l_out
Example #16
    def get_char2word(self, ic, avg=False):
        suf = '_avg' if avg else ''
        ec = L.EmbeddingLayer(
            ic,
            self.args.vc,
            self.args.nc,
            name='ec' + suf,
            W=HeNormal() if not avg else Constant())  # (100, 24, 32, 16)
        ec.params[ec.W].remove('regularizable')

        if self.args.char_model == 'CNN':
            lds = L.dimshuffle(ec, (0, 3, 1, 2))  # (100, 16, 24, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds,
                    self.args.nf, (1, n),
                    untie_biases=True,
                    W=HeNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 24, 32-n+1)
                lpool = L.MaxPool2DLayer(
                    lconv, (1, self.args.max_len - n + 1))  # (100, 64, 24, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 24)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 24, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2)  # (100, 24, 64)
            return xc

        elif self.args.char_model == 'LSTM':
            ml = L.ExpressionLayer(
                ic, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_len))  # (2400, 32)

            gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None,
                                           nonlinearity=tanh)

            lstm_in = L.reshape(
                ec, (-1, self.args.max_len, self.args.nc))  # (2400, 32, 16)
            lstm_f = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                name='forward' + suf)  # (2400, 64)
            lstm_b = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                backwards=True,
                name='backward' + suf)  # (2400, 64)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (2400, 128)
            xc = L.reshape(xc,
                           (-1, self.args.sw, self.args.nw))  # (100, 24, 256)
            return xc
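
remove_reg() and set_zero() are used above but not shown. Hedged minimal versions, consistent with the ec.params[...] manipulation earlier in the snippet:

import numpy as np

def remove_reg(layer):
    # drop the 'regularizable' tag from every parameter of the layer
    for param, tags in layer.params.items():
        tags.discard('regularizable')

def set_zero(layer):
    # overwrite all parameter values with zeros (for the '_avg' variant)
    for param in layer.get_params():
        param.set_value(np.zeros_like(param.get_value()))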
Example #17
 def __init__(self, train_raw, test_raw, dim, mode, l2, l1,
              batch_norm, dropout, batch_size,
              ihm_C, los_C, ph_C, decomp_C,
              partition, nbins, **kwargs):
             
     print "==> not used params in network class:", kwargs.keys()
     self.train_raw = train_raw
     self.test_raw = test_raw
     
     self.dim = dim
     self.mode = mode
     self.l2 = l2
     self.l1 = l1
     self.batch_norm = batch_norm
     self.dropout = dropout
     self.batch_size = batch_size
     self.ihm_C = ihm_C
     self.los_C = los_C
     self.ph_C = ph_C
     self.decomp_C = decomp_C
     self.nbins = nbins
     
     if (partition == 'log'):
         self.get_bin = metrics.get_bin_log
         self.get_estimate = metrics.get_estimate_log
     else:
         assert self.nbins == 10
         self.get_bin = metrics.get_bin_custom
         self.get_estimate = metrics.get_estimate_custom
     
     self.train_batch_gen = self.get_batch_gen(self.train_raw)
     self.test_batch_gen = self.get_batch_gen(self.test_raw)    
     
     self.input_var = T.tensor3('X')
     self.input_lens = T.ivector('L')
     
     self.ihm_pos = T.ivector('ihm_pos')
     self.ihm_mask = T.ivector('ihm_mask')
     self.ihm_label = T.ivector('ihm_label')
     
     self.los_mask = T.imatrix('los_mask')
     self.los_label = T.matrix('los_label') # for regression
     #self.los_label = T.imatrix('los_label')
     
     self.ph_label = T.imatrix('ph_label')
     
     self.decomp_mask = T.imatrix('decomp_mask')
     self.decomp_label = T.imatrix('decomp_label')
     
     print "==> Building neural network"
     
     # common network
     network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]), 
                                 input_var=self.input_var)
     
     if (self.dropout > 0):
         network = layers.DropoutLayer(network, p=self.dropout)
     
     network = layers.LSTMLayer(incoming=network, num_units=dim,
                                only_return_final=False,
                                grad_clipping=10,
                                ingate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)),
                                forgetgate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)),
                                cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh,
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal()),
                                outgate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)))
     
     if (self.dropout > 0):
         network = layers.DropoutLayer(network, p=self.dropout)
     
     lstm_output = layers.get_output(network)
     self.params = layers.get_all_params(network, trainable=True)
     self.reg_params = layers.get_all_params(network, regularizable=True)
     
     # for each example in minibatch take the last output
     last_outputs = []
     for index in range(self.batch_size):
         last_outputs.append(lstm_output[index, self.input_lens[index]-1, :])
     last_outputs = T.stack(last_outputs)
     
     # take 48h outputs for fixed mortality task
     mid_outputs = []
     for index in range(self.batch_size):
         mid_outputs.append(lstm_output[index, self.ihm_pos[index], :])
     mid_outputs = T.stack(mid_outputs)
     
     
     # in-hospital mortality related network
     ihm_network = layers.InputLayer((None, dim), input_var=mid_outputs)
     ihm_network = layers.DenseLayer(incoming=ihm_network, num_units=2,
                                    nonlinearity=softmax)
     self.ihm_prediction = layers.get_output(ihm_network)
     self.ihm_det_prediction = layers.get_output(ihm_network, deterministic=True)
     self.params += layers.get_all_params(ihm_network, trainable=True)
     self.reg_params += layers.get_all_params(ihm_network, regularizable=True)
     self.ihm_loss = (self.ihm_mask * categorical_crossentropy(self.ihm_prediction, 
                                                       self.ihm_label)).mean()
     
     
     # length of stay related network
     # Regression
     los_network = layers.InputLayer((None, None, dim), input_var=lstm_output)
     los_network = layers.ReshapeLayer(los_network, (-1, dim))
     los_network = layers.DenseLayer(incoming=los_network, num_units=1,
                                     nonlinearity=rectify)
     los_network = layers.ReshapeLayer(los_network, (lstm_output.shape[0], -1))
     self.los_prediction = layers.get_output(los_network)
     self.los_det_prediction = layers.get_output(los_network, deterministic=True)
     self.params += layers.get_all_params(los_network, trainable=True)
     self.reg_params += layers.get_all_params(los_network, regularizable=True)
     self.los_loss = (self.los_mask * squared_error(self.los_prediction,
                                                   self.los_label)).mean(axis=1).mean(axis=0)
     
     
     # phenotype related network
     ph_network = layers.InputLayer((None, dim), input_var=last_outputs)
     ph_network = layers.DenseLayer(incoming=ph_network, num_units=25,
                                    nonlinearity=sigmoid)
     self.ph_prediction = layers.get_output(ph_network)
     self.ph_det_prediction = layers.get_output(ph_network, deterministic=True)
     self.params += layers.get_all_params(ph_network, trainable=True)
     self.reg_params += layers.get_all_params(ph_network, regularizable=True)
     self.ph_loss = nn_utils.multilabel_loss(self.ph_prediction, self.ph_label)
             
     
     # decompensation related network
     decomp_network = layers.InputLayer((None, None, dim), input_var=lstm_output)
     decomp_network = layers.ReshapeLayer(decomp_network, (-1, dim))
     decomp_network = layers.DenseLayer(incoming=decomp_network, num_units=2,
                                    nonlinearity=softmax)
     decomp_network = layers.ReshapeLayer(decomp_network, (lstm_output.shape[0], -1, 2))
     self.decomp_prediction = layers.get_output(decomp_network)[:, :, 1]
     self.decomp_det_prediction = layers.get_output(decomp_network, deterministic=True)[:, :, 1]
     self.params += layers.get_all_params(decomp_network, trainable=True)
     self.reg_params += layers.get_all_params(decomp_network, regularizable=True)
     self.decomp_loss = nn_utils.multilabel_loss_with_mask(self.decomp_prediction,
                                                       self.decomp_label,
                                                       self.decomp_mask)
     
     """
     data = next(self.train_batch_gen)
     print max(data[1])
     print lstm_output.eval({self.input_var:data[0]}).shape
     exit()
     """
     
     
     if self.l2 > 0: 
         self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
     else: 
         self.loss_l2 = T.constant(0)
     
     if self.l1 > 0: 
         self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
     else: 
         self.loss_l1 = T.constant(0)
     
     self.reg_loss = self.loss_l1 + self.loss_l2
     
     self.loss = (ihm_C * self.ihm_loss + los_C * self.los_loss + 
                  ph_C * self.ph_loss + decomp_C * self.decomp_loss + 
                  self.reg_loss)
           
     #updates = lasagne.updates.adadelta(self.loss, self.params,
     #                                    learning_rate=0.001)
     #updates = lasagne.updates.momentum(self.loss, self.params,
     #                                    learning_rate=0.00003)
     #updates = lasagne.updates.adam(self.loss, self.params)
     updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                    learning_rate=0.0001) # from DCGAN paper
     #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
     #                                             learning_rate=0.001,
     
     all_inputs = [self.input_var, self.input_lens,
                   self.ihm_pos, self.ihm_mask, self.ihm_label,
                   self.los_mask, self.los_label,
                   self.ph_label,
                   self.decomp_mask, self.decomp_label]
     
     train_outputs = [self.ihm_prediction, self.los_prediction,
                      self.ph_prediction, self.decomp_prediction,
                      self.loss,
                      self.ihm_loss, self.los_loss,
                      self.ph_loss, self.decomp_loss,
                      self.reg_loss]
                      
     test_outputs = [self.ihm_det_prediction, self.los_det_prediction,
                     self.ph_det_prediction, self.decomp_det_prediction,
                     self.loss,
                     self.ihm_loss, self.los_loss,
                     self.ph_loss, self.decomp_loss,
                     self.reg_loss]
     
     ## compiling theano functions
     if self.mode == 'train':
         print "==> compiling train_fn"
         self.train_fn = theano.function(inputs=all_inputs,
                                         outputs=train_outputs,
                                         updates=updates)
     
     print "==> compiling test_fn"
     self.test_fn = theano.function(inputs=all_inputs,
                                    outputs=test_outputs)
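
A hedged side note: the per-example Python loops above that pick last_outputs and mid_outputs can also be written with Theano advanced indexing, which keeps the graph independent of batch_size:

import theano.tensor as T

def gather_timesteps(lstm_output, positions):
    """lstm_output: (batch, time, dim); positions: ivector of time indices."""
    batch_range = T.arange(lstm_output.shape[0])
    return lstm_output[batch_range, positions]          # (batch, dim)

# e.g. last_outputs = gather_timesteps(lstm_output, input_lens - 1)
#      mid_outputs  = gather_timesteps(lstm_output, ihm_pos)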
Example #18
def build_model(vmap,
                nclasses=2,
                embedding_dim=50,
                nhidden=256,
                batchsize=None,
                invar=None,
                maskvar=None,
                bidirectional=True,
                pool=True,
                grad_clip=100,
                maxlen=MAXLEN):

    V = len(vmap)
    W = lasagne.init.Normal()

    # Input Layer
    # TODO: should be (batchsize, maxlen, vocab_size)
    l_in = layer.InputLayer((batchsize, maxlen, V), input_var=invar)
    l_mask = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    ASSUME = {l_in: (200, 140, 94), l_mask: (200, 140)}
    print('Input Layer')
    print('output:', get_output_shape(l_in, ASSUME))
    print('output(mask):', get_output_shape(l_mask, ASSUME))
    print()

    # Embedding Layer
    l_emb = layer.EmbeddingLayer(l_in,
                                 input_size=V,
                                 output_size=embedding_dim,
                                 W=W)
    print('Embedding Layer')
    print('output:', get_output_shape(l_emb, ASSUME))

    gate_params = layer.recurrent.Gate(W_in=lasagne.init.Orthogonal(),
                                       W_hid=lasagne.init.Orthogonal(),
                                       b=lasagne.init.Constant(0.))

    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh)

    l_fwd = layer.LSTMLayer(l_emb,
                            num_units=nhidden,
                            grad_clipping=grad_clip,
                            nonlinearity=lasagne.nonlinearities.tanh,
                            mask_input=l_mask,
                            ingate=gate_params,
                            forgetgate=gate_params,
                            cell=cell_params,
                            outgate=gate_params,
                            learn_init=True)

    print('Forward LSTM')
    print('output:', get_output_shape(l_fwd, ASSUME))

    l_concat = None
    if bidirectional:
        l_bwd = layer.LSTMLayer(l_emb,
                                num_units=nhidden,
                                grad_clipping=grad_clip,
                                nonlinearity=lasagne.nonlinearities.tanh,
                                mask_input=l_mask,
                                ingate=gate_params,
                                forgetgate=gate_params,
                                cell=cell_params,
                                outgate=gate_params,
                                learn_init=True,
                                backwards=True)
        print('Backward LSTM')
        print('output:', get_output_shape(l_bwd, ASSUME))

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        if pool:
            l_concat = layer.ElemwiseMergeLayer([l_fwd, l_bwd], tmean)
        else:
            l_concat = layer.ConcatLayer([l_fwd, l_bwd])
    else:
        l_concat = layer.ConcatLayer([l_fwd])
    print('Concat')
    print('output:', get_output_shape(l_concat, ASSUME))

    l_concat = layer.DropoutLayer(l_concat, p=0.5)

    l_lstm2 = layer.LSTMLayer(l_concat,
                              num_units=nhidden,
                              grad_clipping=grad_clip,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              mask_input=l_mask,
                              ingate=gate_params,
                              forgetgate=gate_params,
                              cell=cell_params,
                              outgate=gate_params,
                              learn_init=True,
                              only_return_final=True)

    print('LSTM #2')
    print('output:', get_output_shape(l_lstm2, ASSUME))

    l_lstm2 = layer.DropoutLayer(l_lstm2, p=0.6)

    network = layer.DenseLayer(l_lstm2,
                               num_units=nclasses,
                               nonlinearity=lasagne.nonlinearities.softmax)

    print('Dense Layer')
    print('output:', get_output_shape(network, ASSUME))

    return network
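
A hedged evaluation sketch for the classifier above: deterministic=True disables both dropout layers at test time (the label variable is an assumption):

import theano
import theano.tensor as T
import lasagne.layers as layer

def compile_eval_fn(network, invar, maskvar):
    yvar = T.ivector('y')
    probs = layer.get_output(network, deterministic=True)
    acc = T.mean(T.eq(T.argmax(probs, axis=1), yvar),
                 dtype=theano.config.floatX)
    return theano.function([invar, maskvar, yvar], acc)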
Example #19
    def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

        batch_size = self.mask_context_var.shape[0]
        context_len = self.mask_context_var.shape[1]
        question_len = self.question_var.shape[1]
        context_word_len = self.context_char_var.shape[2]
        question_word_len = self.question_char_var.shape[2]

        self.batch_size = batch_size
        self.context_len = context_len
        ''' Inputs and word embeddings'''

        l_context_char = LL.InputLayer(shape=(None, None, None),
                                       input_var=self.context_char_var)
        l_question_char = LL.InputLayer(shape=(None, None, None),
                                        input_var=self.question_char_var)

        l_c_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_context_var)
        l_q_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_question_var)

        l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_context_char_var)
        l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_question_char_var)

        l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.context_var)
        l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.question_var)

        if self.train_unk:
            l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_context_unk_var)
            l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_question_unk_var)

            l_c_emb = TrainUnkLayer(l_c_emb,
                                    l_c_unk_mask,
                                    output_size=self.emb_size,
                                    W=self.word_embeddings[0])

            l_q_emb = TrainUnkLayer(l_q_emb,
                                    l_q_unk_mask,
                                    output_size=self.emb_size,
                                    W=l_c_emb.W)

        if self.negative:
            l_c_emb = TrainNAWLayer(l_c_emb,
                                    l_c_mask,
                                    output_size=self.emb_size)
        ''' Char-embeddings '''

        # (batch_size x context_len x context_word_len x emb_char_size)
        l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size)

        l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size,
                                         W=l_c_char_emb.W)

        # multiply the character embeddings by their masks so that
        # padded positions are zeroed out (constant-zero padding)

        l_c_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
        l_q_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

        l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask],
                                             T.mul)
        l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask],
                                             T.mul)

        # convolutions

        l_c_char_emb = LL.dimshuffle(
            LL.reshape(l_c_char_emb, (batch_size * context_len,
                                      context_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       pad=self.conv)
        # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)

        l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_c_char_emb = LL.reshape(
            l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

        l_q_char_emb = LL.dimshuffle(
            LL.reshape(l_q_char_emb, (batch_size * question_len,
                                      question_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       W=l_c_char_conv.W,
                                       b=l_c_char_conv.b,
                                       pad=self.conv)
        # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)

        l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_q_char_emb = LL.reshape(
            l_q_char_emb,
            (batch_size, question_len, self.num_emb_char_filters))
        ''' Concatenating both embeddings '''

        l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
        l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

        # originally I had dropout here
        ''' Highway layer allowing for interaction between embeddings '''

        l_c_P = LL.reshape(l_c_emb,
                           (batch_size * context_len,
                            self.emb_size + self.num_emb_char_filters))
        l_c_P = LL.DenseLayer(l_c_P,
                              num_units=self.rec_size,
                              b=None,
                              nonlinearity=None)

        l_c_high = HighwayLayer(l_c_P)
        l_c_emb = LL.reshape(l_c_high,
                             (batch_size, context_len, self.rec_size))

        l_q_P = LL.reshape(l_q_emb,
                           (batch_size * question_len,
                            self.emb_size + self.num_emb_char_filters))
        l_q_P = LL.DenseLayer(l_q_P,
                              num_units=self.rec_size,
                              W=l_c_P.W,
                              b=None,
                              nonlinearity=None)

        l_q_high = HighwayLayer(l_q_P,
                                W1=l_c_high.W1,
                                b1=l_c_high.b1,
                                W2=l_c_high.W2,
                                b2=l_c_high.b2)
        l_q_emb = LL.reshape(l_q_high,
                             (batch_size, question_len, self.rec_size))
        ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

        l_weighted_feat = WeightedFeatureLayer(
            [l_c_emb, l_q_emb, l_c_mask, l_q_mask])  # batch_size x context_len
        l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

        # batch_size x context_len
        l_bin_feat = LL.InputLayer(shape=(None, None),
                                   input_var=self.bin_feat_var)
        l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))
        ''' Dropout at the embeddings '''

        if emb_dropout:
            print('Using dropout after wiq calculation.')
            l_c_emb = LL.dropout(l_c_emb)
            l_q_emb = LL.dropout(l_q_emb)
        ''' Here we concatenate wiq features to embeddings'''

        # both features are concatenated to the context embeddings;
        # for the question, the two feature channels are fixed to 1
        l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
        l_q_emb = LL.pad(l_q_emb,
                         width=[(0, 2)],
                         val=L.utils.floatX(1),
                         batch_ndim=2)
        ''' Context and question encoding using the same BiLSTM for both '''

        # output shape is (batch_size x context_len x rec_size)
        l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask)

        l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask,
                                    backwards=True)

        # output shape is (batch_size x question_len x rec_size)
        l_q_enc_forw = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                           W_hid=l_c_enc_forw.W_hid_to_ingate,
                           W_cell=l_c_enc_forw.W_cell_to_ingate,
                           b=l_c_enc_forw.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                               W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                               W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                               b=l_c_enc_forw.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                            W_hid=l_c_enc_forw.W_hid_to_outgate,
                            W_cell=l_c_enc_forw.W_cell_to_outgate,
                            b=l_c_enc_forw.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                         W_hid=l_c_enc_forw.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_forw.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        l_q_enc_back = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            backwards=True,
            ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                           W_hid=l_c_enc_back.W_hid_to_ingate,
                           W_cell=l_c_enc_back.W_cell_to_ingate,
                           b=l_c_enc_back.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                               W_hid=l_c_enc_back.W_hid_to_forgetgate,
                               W_cell=l_c_enc_back.W_cell_to_forgetgate,
                               b=l_c_enc_back.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                            W_hid=l_c_enc_back.W_hid_to_outgate,
                            W_cell=l_c_enc_back.W_cell_to_outgate,
                            b=l_c_enc_back.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                         W_hid=l_c_enc_back.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_back.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        # batch_size x context_len  x 2*rec_size
        l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
        # batch_size x question_len x 2*rec_size
        l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

        def proj_init():
            return np.vstack([
                np.eye(self.rec_size, dtype=theano.config.floatX),
                np.eye(self.rec_size, dtype=theano.config.floatX)
            ])

        # this is H from the paper, shape: (batch_size * context_len x
        # rec_size)
        l_c_proj = LL.reshape(l_c_enc,
                              (batch_size * context_len, 2 * self.rec_size))
        l_c_proj = LL.DenseLayer(l_c_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)

        # this is Z from the paper, shape: (batch_size * question_len x
        # rec_size)
        l_q_proj = LL.reshape(l_q_enc,
                              (batch_size * question_len, 2 * self.rec_size))
        l_q_proj = LL.DenseLayer(l_q_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)
        ''' Additional, weighted question encoding (alphas from paper) '''

        l_alpha = LL.DenseLayer(
            l_q_proj,  # batch_size * question_len x 1
            num_units=1,
            b=None,
            nonlinearity=None)

        # batch_size x question_len
        l_alpha = MaskedSoftmaxLayer(
            LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

        # batch_size x rec_size
        l_z_hat = BatchedDotLayer([
            LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
            l_alpha
        ])

        return l_c_proj, l_z_hat
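The pattern worth noting in this example is how the question encoders reuse every gate weight of the corresponding context encoders. A condensed, hedged sketch of that weight-sharing idiom with hypothetical layers (l_a is the layer whose parameters are reused):

import theano.tensor as T
import lasagne
import lasagne.layers as LL

x = T.tensor3('x')
l_in = LL.InputLayer(shape=(None, None, 50), input_var=x)
l_a = LL.LSTMLayer(l_in, num_units=64, grad_clipping=100)

# the second encoder is built from the first one's shared variables,
# so both layers update the same weights during training
l_b = LL.LSTMLayer(
    l_in, num_units=64, grad_clipping=100,
    ingate=LL.Gate(W_in=l_a.W_in_to_ingate, W_hid=l_a.W_hid_to_ingate,
                   W_cell=l_a.W_cell_to_ingate, b=l_a.b_ingate),
    forgetgate=LL.Gate(W_in=l_a.W_in_to_forgetgate, W_hid=l_a.W_hid_to_forgetgate,
                       W_cell=l_a.W_cell_to_forgetgate, b=l_a.b_forgetgate),
    outgate=LL.Gate(W_in=l_a.W_in_to_outgate, W_hid=l_a.W_hid_to_outgate,
                    W_cell=l_a.W_cell_to_outgate, b=l_a.b_outgate),
    cell=LL.Gate(W_in=l_a.W_in_to_cell, W_hid=l_a.W_hid_to_cell, W_cell=None,
                 b=l_a.b_cell, nonlinearity=lasagne.nonlinearities.tanh))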
Ejemplo n.º 20
0
def build_model(hyparams,
                vocab,
                nclasses=2,
                batchsize=None,
                invar=None,
                maskvar=None,
                maxlen=MAXLEN):

    embedding_dim = hyparams.embedding_dim
    nhidden = hyparams.nhidden
    bidirectional = hyparams.bidirectional
    pool = hyparams.pool
    grad_clip = hyparams.grad_clip
    init = hyparams.init

    net = OrderedDict()

    V = len(vocab)
    W = lasagne.init.Normal()

    gate_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        b=lasagne.init.Constant(0.)
    )
    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh
    )

    # define model
    net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar)
    net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W)
    net['fwd1'] = layer.LSTMLayer(
        net['emb'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True
    )
    if bidirectional:
        net['bwd1'] = layer.LSTMLayer(
            net['emb'],
            num_units=nhidden,
            grad_clipping=grad_clip,
            nonlinearity=lasagne.nonlinearities.tanh,
            mask_input=net['mask'],
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            learn_init=True,
            backwards=True
        )

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean)
    else:
        net['pool'] = layer.ConcatLayer([net['fwd1']])
    net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5)
    net['fwd2'] = layer.LSTMLayer(
        net['dropout1'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True,
        only_return_final=True
    )
    net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6)
    net['softmax'] = layer.DenseLayer(
        net['dropout2'],
        num_units=nclasses,
        nonlinearity=lasagne.nonlinearities.softmax
    )
    ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)}
    logstr = '========== MODEL ========== \n'
    logstr += 'vocab size: %d\n' % V
    logstr += 'embedding dim: %d\n' % embedding_dim
    logstr += 'nhidden: %d\n' % nhidden
    logstr += 'pooling: %s\n' % pool
    for lname, lyr in net.items():
        logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME)))
    logstr += '=========================== \n'
    print(logstr)
    return net
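A short, hedged usage sketch for the returned net dictionary: compiling a deterministic prediction function with dropout disabled. It assumes build_model was called with explicit invar and maskvar tensors; those names are placeholders here.

import theano
import theano.tensor as T
from lasagne.layers import get_output

# invar and maskvar are the tensors passed to build_model(...); placeholders here
test_probs = get_output(net['softmax'], deterministic=True)  # dropout off
predict_fn = theano.function([invar, maskvar], T.argmax(test_probs, axis=1))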
Ejemplo n.º 21
0
    def buildModel(self):
        print(' -- Building...')
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        g_init = T.imatrix('g')
        ind_init = T.ivector('ind')
        sub_path_init = T.imatrix('subPathsBatch')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init)
        ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init)
        pair_second = lgl.SliceLayer(g_input, indices=1, axis=1)
        pair_first = lgl.SliceLayer(g_input, indices=0, axis=1)
        pair_first_emd = lgl.EmbeddingLayer(pair_first,
                                            input_size=self.num_ver,
                                            output_size=self.embedding_size)
        emd_to_numver = layers.DenseLayer(
            pair_first_emd,
            self.num_ver,
            nonlinearity=lg.nonlinearities.softmax)
        index_emd = lgl.EmbeddingLayer(ind_input,
                                       input_size=self.num_ver,
                                       output_size=self.embedding_size,
                                       W=pair_first_emd.W)
        x_to_ydim = layers.SparseLayer(x_input,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        index_emd = layers.DenseLayer(index_emd,
                                      self.y.shape[1],
                                      nonlinearity=lg.nonlinearities.softmax)
        concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1)
        concat_two = layers.DenseLayer(concat_two,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        concat_two_output = lgl.get_output(concat_two)
        step_loss = lgo.categorical_crossentropy(concat_two_output,
                                                 y_init).mean()
        hid_loss = lgl.get_output(x_to_ydim)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(index_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = [
            index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b, concat_two.W,
            concat_two.b
        ]
        step_updates = lg.updates.sgd(step_loss,
                                      step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init, ind_init],
                                          step_loss,
                                          updates=step_updates,
                                          on_unused_input='ignore')
        self.test_fn = theano.function([x_init, ind_init],
                                       concat_two_output,
                                       on_unused_input='ignore')

        # supervised train
        fc_output = lgl.get_output(emd_to_numver)
        pair_second_output = lgl.get_output(pair_second)
        sup_loss = lgo.categorical_crossentropy(fc_output,
                                                pair_second_output).sum()
        sup_params = lgl.get_all_params(emd_to_numver, trainable=True)
        sup_updates = lg.updates.sgd(sup_loss,
                                     sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([g_init],
                                         sup_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        cross_entropy = lgo.categorical_crossentropy(fc_output,
                                                     pair_second_output)
        cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size),
                                  ndim=None)

        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=sub_path_init)
        sub_path_emd = lgl.EmbeddingLayer(subPath_in,
                                          input_size=self.num_ver,
                                          output_size=self.embedding_size,
                                          W=pair_first_emd.W)

        lstm_layer = lgl.LSTMLayer(sub_path_emd,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight

        # unsupervised train
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params_all = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_params = list(set(lstm_params_all).difference(set(sup_params)))
        lstm_updates = lg.updates.sgd(reweight_loss,
                                      lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([sub_path_init, g_init, mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss,
                                       sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([sub_path_init, g_init, mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')
        print(' -- Done!')
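The set-difference trick above (training the LSTM parameters with one loss while the supervised parameters get their own update rule) generalizes to any parameter split; a minimal sketch with hypothetical names (top_layer, frozen_params and loss are assumptions):

import lasagne
import lasagne.layers as lgl

# hypothetical: update everything trainable in `top_layer` except a frozen subset
all_params = lgl.get_all_params(top_layer, trainable=True)
frozen = set(frozen_params)                      # e.g. the shared embedding matrix
subset = [p for p in all_params if p not in frozen]
updates = lasagne.updates.sgd(loss, subset, learning_rate=0.01)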
Ejemplo n.º 22
0
    def build_network(self, V, C, W, dv, qv, tv, dmv, qmv, fv):
        # inputs
        l_docin = L.InputLayer(shape=(None, None), input_var=dv)
        l_qin = L.InputLayer(shape=(None, None), input_var=qv)
        l_docmask = L.InputLayer(shape=(None, None), input_var=dmv)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmv)
        l_featin = L.InputLayer(shape=(None, None), input_var=fv)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=V,
                                      output_size=EMBED_DIM,
                                      W=W)  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=V,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)  # B x Q x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        # question lstm
        l_q_lstm = L.LSTMLayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, \
                gradient_steps=GRAD_STEPS, precompute_input=True) # B x Q x D
        l_q_lstm = L.dropout(l_q_lstm, p=DROPOUT_RATE)
        l_q_att_in = L.ReshapeLayer(
            l_q_lstm, (qv.shape[0] * qv.shape[1], NUM_HIDDEN))  # BQ x D
        l_q_att_1 = L.DenseLayer(l_q_att_in, NUM_HIDDEN, b=None, \
                nonlinearity=lasagne.nonlinearities.tanh) # BQ x D
        l_q_att_2 = L.DenseLayer(l_q_att_1, 1, b=None,
                                 nonlinearity=None)  # BQ x 1
        l_q_att_out = L.ReshapeLayer(l_q_att_2,
                                     (qv.shape[0], qv.shape[1]))  # B x Q
        q = L.get_output(l_q_lstm)
        alphas = T.nnet.softmax(L.get_output(l_q_att_out)) * qmv  # B x Q
        alphas = alphas / alphas.sum(axis=1)[:, np.newaxis]
        rq = (alphas[:, :, np.newaxis] * q).sum(axis=1)  # B x D

        # evidence lstm
        rq_tiled = T.reshape(T.tile(rq, (1, dv.shape[1])),
                             (dv.shape[0], dv.shape[1], NUM_HIDDEN))
        l_rq_in = L.InputLayer(shape=(None, None, NUM_HIDDEN),
                               input_var=rq_tiled)  # B x N x D
        l_ev = L.ConcatLayer([l_docembed, l_rq_in, l_fembed],
                             axis=2)  # B x N x (DE+D+2)
        l_ev_lstm1 = L.LSTMLayer(l_ev, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, \
                gradient_steps=GRAD_STEPS, precompute_input=True) # B x N x D
        l_ev_lstm1 = L.dropout(l_ev_lstm1, p=DROPOUT_RATE)
        l_ev_lstm2 = L.LSTMLayer(l_ev_lstm1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, \
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                backwards=True) # B x N x D
        l_ev_lstm2 = L.dropout(l_ev_lstm2, p=DROPOUT_RATE)
        l_ev_lstm3 = L.LSTMLayer(L.ConcatLayer([l_ev_lstm1,l_ev_lstm2], axis=2), NUM_HIDDEN, \
                grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, \
                precompute_input=True) # B x N x D
        l_ev_lstm3 = L.dropout(l_ev_lstm3, p=DROPOUT_RATE)

        # crf
        l_class_in = L.ReshapeLayer(
            l_ev_lstm3, (dv.shape[0] * dv.shape[1], NUM_HIDDEN))  # BN x D
        l_class = L.DenseLayer(l_class_in, C, b=None,
                               nonlinearity=None)  # BN x C
        l_crf_in = L.ReshapeLayer(l_class,
                                  (dv.shape[0], dv.shape[1], C))  # B x N x C
        l_crf = CRFLayer(l_crf_in, C, mask_input=dmv, label_input=tv, normalize=False, \
                end_points=True) # 1
        l_crfdecode = CRFDecodeLayer(l_crf_in, C, W_sim=l_crf.W_sim, \
                W_end_points=l_crf.W_end_points, mask_input=dmv) # B x N

        # params
        self.e_net = l_crf
        self.q_net = l_q_att_out
        params = L.get_all_params([self.e_net, self.q_net], trainable=True)

        return L.get_output(l_crf), params, L.get_output(l_crfdecode,
                                                         deterministic=True)
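The question-side attention in this example is a masked softmax followed by a weighted sum (the lines computing alphas and rq). The same computation in isolation, as a hedged sketch with hypothetical tensor names and shapes in comments:

import numpy as np
import theano.tensor as T

def masked_attention(scores, mask, states):
    # scores: (batch, seq_len) unnormalized attention scores
    # mask:   (batch, seq_len) 1.0 for real tokens, 0.0 for padding
    # states: (batch, seq_len, hidden) per-token representations
    alphas = T.nnet.softmax(scores) * mask                    # zero out padded positions
    alphas = alphas / alphas.sum(axis=1)[:, np.newaxis]       # renormalize
    return (alphas[:, :, np.newaxis] * states).sum(axis=1)    # (batch, hidden)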
Ejemplo n.º 23
0
    def __init__(self, train_raw, test_raw, dim, mode, l2, l1,
                 batch_norm, dropout, batch_size, **kwargs):
                
        print("==> not used params in network class:", kwargs.keys())
        self.train_raw = train_raw
        self.test_raw = test_raw
        
        self.dim = dim
        self.mode = mode
        self.l2 = l2
        self.l1 = l1
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.batch_size = batch_size
        
        self.train_batch_gen = self.get_batch_gen(self.train_raw)
        self.test_batch_gen = self.get_batch_gen(self.test_raw)    
        
        self.input_var = T.tensor3('X')
        self.input_lens = T.ivector('L')
        self.target_var = T.imatrix('y')
        
        """
        for i in range(700//self.batch_size):
            ret=next(self.train_batch_gen)
            print len(ret[0])
            print ret[0][0].shape
            print len(ret[1])
            print type(ret[1][0])
            print "---"
        exit()
        """
                
        print("==> Building neural network")
        network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]), 
                                    input_var=self.input_var)
        
        #print "!!!!!!!!!!! WARNING: dropout on input is disabled !!!!!!!!!!!!!!!!"
        if (self.dropout > 0):
            network = layers.DropoutLayer(network, p=self.dropout)

        network = layers.LSTMLayer(incoming=network, num_units=dim,
                                   grad_clipping=10,
                                   ingate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   forgetgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   cell=lasagne.layers.Gate(W_cell=None,
                                        nonlinearity=lasagne.nonlinearities.tanh,
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal()),
                                   outgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)))
        
        if (self.dropout > 0):
            network = layers.DropoutLayer(network, p=self.dropout)
        
        network = layers.LSTMLayer(incoming=network, num_units=dim,
                                   only_return_final=False,
                                   grad_clipping=10,
                                   ingate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   forgetgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   cell=lasagne.layers.Gate(W_cell=None,
                                        nonlinearity=lasagne.nonlinearities.tanh,
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal()),
                                   outgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)))
              
        lstm_output = layers.get_output(network)
        self.params = layers.get_all_params(network, trainable=True)
        self.reg_params = layers.get_all_params(network, regularizable=True)
        
        """
        data = next(self.train_batch_gen)
        print max(data[1])
        print lstm_output.eval({self.input_var:data[0]}).shape
        exit()
        """
        
        # for each example in minibatch take the last output
        last_outputs = []
        for index in range(self.batch_size):
            last_outputs.append(lstm_output[index, self.input_lens[index]-1, :])
        last_outputs = T.stack(last_outputs)

        """
        data = next(self.train_batch_gen)
        print max(data[1])
        print last_outputs.eval({self.input_var:data[0],
            self.input_lens:data[1],
        }).shape
        exit()
        """
        
        network = layers.InputLayer(shape=(self.batch_size, self.dim), 
                                    input_var=last_outputs)
        if (self.dropout > 0):
            network = layers.DropoutLayer(network, p=self.dropout)
        network = layers.DenseLayer(incoming=network,
                                    num_units=train_raw[1][0].shape[0],
                                    nonlinearity=sigmoid)
        
        self.prediction = layers.get_output(network)
        self.det_prediction = layers.get_output(network, deterministic=True)
        self.params += layers.get_all_params(network, trainable=True)
        self.reg_params += layers.get_all_params(network, regularizable=True)
        
        self.loss_multilabel = -(self.target_var * T.log(self.prediction) + \
            (1 - self.target_var) * T.log(1 - self.prediction)).mean(axis=1)\
                                                               .mean(axis=0)
        
        if self.l2 > 0: 
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
        else: 
            self.loss_l2 = 0
        
        if self.l1 > 0: 
            self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
        else: 
            self.loss_l1 = 0
            
        self.loss = self.loss_multilabel + self.loss_l2 + self.loss_l1
              
        #updates = lasagne.updates.adadelta(self.loss, self.params,
        #                                    learning_rate=0.001)
        #updates = lasagne.updates.momentum(self.loss, self.params,
        #                                    learning_rate=0.00003)
        #updates = lasagne.updates.adam(self.loss, self.params)
        updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                       learning_rate=0.0001) # from DCGAN paper
        #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
        #                                             learning_rate=0.001,
        
        ## compiling theano functions
        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var,
                                                    self.input_lens,
                                                    self.target_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var,
                                               self.input_lens,
                                               self.target_var],
                                       outputs=[self.det_prediction, self.loss])
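The Python loop that gathers the last valid timestep per example works, but it unrolls the graph batch_size times. An equivalent, hedged alternative using a flattened index, under the same assumptions (lstm_output is (batch, time, dim) and input_lens holds the sequence lengths):

import theano.tensor as T

# flatten (batch, time, dim) -> (batch*time, dim), then pick row (len - 1) of each example
flat = lstm_output.reshape((-1, lstm_output.shape[2]))
row_index = T.arange(lstm_output.shape[0]) * lstm_output.shape[1] + (input_lens - 1)
last_outputs = flat[row_index]            # (batch, dim)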
Ejemplo n.º 24
0
    def additional_layer(self, idx_layer, emb_layer, avg=False):
        suf = '_avg' if avg else ''
        if self.name == 'char':
            if self.args.char_model == 'cnn':
                lds = L.dimshuffle(emb_layer,
                                   (0, 3, 1, 2))  # (100, 16, 26, 32)
                ls = []
                for n in self.args.ngrams:
                    lconv = L.Conv2DLayer(
                        lds,
                        self.args.conv_dim,
                        (1, n),
                        untie_biases=False,
                        # W=HeNormal('relu') if not avg else Constant(),
                        W=GlorotNormal('relu') if not avg else Constant(),
                        name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)

                    lpool = L.MaxPool2DLayer(lconv,
                                             (1, self.args.max_word_len - n +
                                              1))  # (100, 64, 26, 1)
                    lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                    lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                    ls.append(lpool)
                xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
                # additional
                # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None, name='echar_affine', num_leading_axes=2,
                # W=HeNormal() if not avg else Constant()) # (100, 26, 100)
                return xc
            elif self.args.char_model == 'lstm':
                ml = L.ExpressionLayer(
                    idx_layer,
                    lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
                ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)

                gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal())
                cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal(),
                                               W_cell=None,
                                               nonlinearity=tanh)

                lstm_in = L.reshape(
                    emb_layer,
                    (-1, self.args.max_word_len,
                     self.config['char']['emb_dim']))  # (1500, 32, 16)
                lstm_f = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    name='forward' + suf)  # (1500, 32)
                lstm_b = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    backwards=True,
                    name='backward' + suf)  # (1500, 32)
                remove_reg(lstm_f)
                remove_reg(lstm_b)
                if avg:
                    set_zero(lstm_f)
                    set_zero(lstm_b)
                xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
                if self.args.lstm_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
                elif self.args.trans_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
                else:
                    xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
                return xc

        elif self.name == 'morph':
            # idx (100, 26/161, 16)  emb (100, 26/161, 16, 32)
            if self.args.morph_model == 'max':
                xm = L.MaxPool2DLayer(
                    emb_layer,
                    (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
                # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim'])) # (100, 26/161, 32)
                xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
                # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
            elif self.args.morph_model == 'avg':
                mask = L.ExpressionLayer(
                    idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
                mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
                mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat(
                    x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 32)
                xm = L.ElemwiseMergeLayer([
                    emb_layer, mask
                ], lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
                # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim'])) # (100, 26, 32)
            return xm
        else:
            return emb_layer
Ejemplo n.º 25
0
# Recurrent layers expect input of shape
# (batch size, max sequence length, number of features)
l_in = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
# The network also needs a way to provide a mask for each sequence.  We'll
# use a separate input layer for that.  Since the mask only determines
# which indices are part of the sequence for each batch entry, they are
# supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH)
l_mask = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
# We're using a bidirectional network, which means we will combine two
# RecurrentLayers, one with the backwards=True keyword argument.
# Setting a value for grad_clipping will clip the gradients in the layer
# Setting only_return_final=True makes the layers only return their output
# for the final time step, which is all we need for this task
l_forward = layers.LSTMLayer(l_in,
                             N_HIDDEN,
                             mask_input=l_mask,
                             grad_clipping=GRAD_CLIP,
                             only_return_final=True)
l_backward = layers.LSTMLayer(l_in,
                              N_HIDDEN,
                              mask_input=l_mask,
                              grad_clipping=GRAD_CLIP,
                              only_return_final=True,
                              backwards=True)
# Now, we'll concatenate the outputs to combine them.
l_concat = layers.ConcatLayer([l_forward, l_backward])
# Our output layer is a simple dense connection, with 1 output unit
l_out = layers.DenseLayer(l_concat,
                          num_units=1,
                          nonlinearity=lasagne.nonlinearities.tanh)
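A hedged usage sketch for this toy network: building the mask, compiling a prediction function and checking the output shape. It reuses the symbolic variables the two InputLayers created for themselves; N_BATCH, MAX_LENGTH and the layers above are assumed to be in scope.

import numpy as np
import theano
from lasagne.layers import get_output

input_var = l_in.input_var        # shape (N_BATCH, MAX_LENGTH, 2)
mask_var = l_mask.input_var       # shape (N_BATCH, MAX_LENGTH)

prediction = get_output(l_out)
predict_fn = theano.function([input_var, mask_var], prediction)

X = np.zeros((N_BATCH, MAX_LENGTH, 2), dtype=theano.config.floatX)
M = np.zeros((N_BATCH, MAX_LENGTH), dtype=theano.config.floatX)
M[:, :10] = 1                     # pretend the first 10 steps of each sequence are valid
print(predict_fn(X, M).shape)     # (N_BATCH, 1)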
Ejemplo n.º 26
0
    def __init__(self, dim, mode, l2, l1, batch_norm, dropout,
                 batch_size, input_dim=76, **kwargs):
                
        print("==> not used params in network class:", kwargs.keys())
        
        self.dim = dim
        self.mode = mode
        self.l2 = l2
        self.l1 = l1
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.batch_size = batch_size
        
        self.input_var = T.tensor3('X')
        self.input_lens = T.ivector('L')
        self.target_var = T.ivector('y')
        self.weight = T.vector('w')
        
        print("==> Building neural network")
        network = layers.InputLayer((None, None, input_dim), 
                                    input_var=self.input_var)
        network = layers.LSTMLayer(incoming=network, num_units=dim,
                                   only_return_final=False,
                                   grad_clipping=10,
                                   ingate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   forgetgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)),
                                   cell=lasagne.layers.Gate(W_cell=None,
                                        nonlinearity=lasagne.nonlinearities.tanh,
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal()),
                                   outgate=lasagne.layers.Gate(
                                        W_in=Orthogonal(),
                                        W_hid=Orthogonal(),
                                        W_cell=Normal(0.1)))
        lstm_output = layers.get_output(network)
        
        self.params = layers.get_all_params(network, trainable=True)
        self.reg_params = layers.get_all_params(network, regularizable=True)
        
        # for each example in minibatch take the last output
        last_outputs = []
        for index in range(self.batch_size):
            last_outputs.append(lstm_output[index, self.input_lens[index]-1, :])
        last_outputs = T.stack(last_outputs)

        network = layers.InputLayer(shape=(self.batch_size, self.dim), 
                                    input_var=last_outputs)
        network = layers.DenseLayer(incoming=network, num_units=2,
                                    nonlinearity=softmax)
        
        self.prediction = layers.get_output(network)
        self.params += layers.get_all_params(network, trainable=True)
        self.reg_params += layers.get_all_params(network, regularizable=True)
        
        self.loss_ce = (self.weight * categorical_crossentropy(self.prediction, 
                                                self.target_var)).mean()
        if self.l2 > 0: 
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
        else: 
            self.loss_l2 = 0
        
        if self.l1 > 0: 
            self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
        else: 
            self.loss_l1 = 0
            
        self.loss = self.loss_ce + self.loss_l2 + self.loss_l1
        
        #updates = lasagne.updates.adadelta(self.loss, self.params,
        #                                    learning_rate=0.001)
        #updates = lasagne.updates.momentum(self.loss, self.params,
        #                                    learning_rate=0.00003)
        #updates = lasagne.updates.adam(self.loss, self.params)
        updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                       learning_rate=0.0001) # from DCGAN paper
        #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
        #                                             learning_rate=0.001,
        
        ## compiling theano functions
        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var,
                                                    self.input_lens,
                                                    self.target_var,
                                                    self.weight],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var,
                                               self.input_lens,
                                               self.target_var,
                                               self.weight],
                                       outputs=[self.prediction, self.loss])
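The loss above weights each example's cross-entropy by self.weight before averaging. The same idea in isolation, as a hedged sketch (probs, targets and weights are hypothetical names):

import theano.tensor as T
from lasagne.objectives import categorical_crossentropy

# probs:   (batch, nclasses) predicted probabilities
# targets: (batch,) integer class labels
# weights: (batch,) per-example importance weights
per_example = categorical_crossentropy(probs, targets)   # (batch,)
loss = (weights * per_example).mean()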
Ejemplo n.º 27
0
def fcrnn(
        input_var_list,
        early_conv_dict_list,
        late_conv_dict,
        dense_filter_size,
        final_pool_function=T.max,
        input_size_list=[128], output_size=10,
        last_late_conv_size=128,
        p_dropout=0.5,
        num_feat_type=1,
        num_lstm_unit=512,
        gradient_steps=10
        ):
    assert(len(early_conv_dict_list) == len(input_var_list) ==
           len(input_size_list))

    # early conv layers
    conv_network_list = list()
    total_stride_list = list()
    for jj, [early_conv_dict, input_var, input_size] in enumerate(zip(
            early_conv_dict_list, input_var_list, input_size_list)):
        input_network = lasagne.layers.InputLayer(
            shape=(None, num_feat_type, None, input_size), input_var=input_var)

        total_stride = 1
        network, total_stride = conv_layers(input_network, early_conv_dict,
                                            total_stride,
                                            init_input_size=input_size,
                                            p_dropout=0,
                                            base_name='early{}'.format(jj))
        total_stride_list.append(total_stride)
        conv_network_list.append(network)

    '''
    # upsampling
    conv_network_list = [cl.LocalExtend(net, axis=2, extend_size=ts)
                         for net, ts in zip(conv_network_list,
                                            total_stride_list)]
    '''
    network = layers.ConcatLayer(conv_network_list,
                                 axis=1,
                                 cropping=[None, None, 'lower', None],
                                 name='MultisourceConcatenate')

    # late conv layers (dense layers)
    network, total_stride = conv_layers(network, late_conv_dict,
                                        total_stride,
                                        init_input_size=1,
                                        p_dropout=p_dropout,
                                        base_name='late')

    # frame output layer. every frame has a value
    network = cl.Conv2DXLayer(
        lasagne.layers.dropout(network, p=p_dropout),
        num_filters=last_late_conv_size, filter_size=(dense_filter_size, 1),
        nonlinearity=lasagne.nonlinearities.sigmoid,
        W=lasagne.init.GlorotUniform()
    )
    network = layers.ReshapeLayer(network, ([0], [1], -1))
    network = layers.DimshuffleLayer(network, (0, 2, 1))

    # lstm layers
    l_forward = layers.LSTMLayer(network, output_size,
                                 grad_clipping=100,
                                 gradient_steps=gradient_steps,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)

    # l_backward = layers.LSTMLayer(l_forward, output_size,
    #                                     grad_clipping=100,
    #                                     gradient_steps=gradient_steps,
    #                                     nonlinearity=lasagne.nonlinearities.sigmoid,
    #                                     backwards=True)

    network = layers.DimshuffleLayer(l_forward, (0, 2, 1))

    # pool
    network = layers.GlobalPoolLayer(network,
                                     pool_function=final_pool_function)
    network = layers.ReshapeLayer(network, ([0], -1))

    return network
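As a final note on the pooling step: GlobalPoolLayer applies its pool_function across all trailing dimensions beyond the second, which is why the sequence is dimshuffled to (batch, channels, time) first. A small, hedged shape check with hypothetical sizes:

import theano.tensor as T
from lasagne.layers import InputLayer, DimshuffleLayer, GlobalPoolLayer, get_output_shape

l_seq = InputLayer((None, 200, 128))              # (batch, time, channels)
l_chan_first = DimshuffleLayer(l_seq, (0, 2, 1))  # (batch, channels, time)
l_pool = GlobalPoolLayer(l_chan_first, pool_function=T.max)
print(get_output_shape(l_pool))                   # (None, 128)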