Code example #1
    def _get_normalised_relevance_layer(self, layer, feeder):
        def add_epsilon(Zs):
            tmp = (T.cast(Zs >= 0, theano.config.floatX) * 2.0 - 1.0)
            return Zs + self.epsilon * tmp

        if isinstance(layer, L.DenseLayer):
            forward_layer = L.DenseLayer(layer.input_layer,
                                         layer.num_units,
                                         W=layer.W,
                                         b=layer.b,
                                         nonlinearity=None)
        elif isinstance(layer, L.Conv2DLayer):
            forward_layer = L.Conv2DLayer(layer.input_layer,
                                          num_filters=layer.num_filters,
                                          W=layer.W,
                                          b=layer.b,
                                          stride=layer.stride,
                                          filter_size=layer.filter_size,
                                          flip_filters=layer.flip_filters,
                                          untie_biases=layer.untie_biases,
                                          pad=layer.pad,
                                          nonlinearity=None)
        else:
            raise NotImplementedError()

        forward_layer = L.ExpressionLayer(forward_layer,
                                          lambda x: 1.0 / add_epsilon(x))
        feeder = L.ElemwiseMergeLayer([forward_layer, feeder],
                                      merge_function=T.mul)

        return feeder
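A minimal standalone sketch of the same pattern (not from the original project; layer sizes, variable names and the epsilon value are made up): the relevance r coming from the layer above is multiplied by 1 / (Z + epsilon * sign(Z)), where Z are the pre-activations of the re-created forward layer, using an ExpressionLayer feeding an ElemwiseMergeLayer.

import numpy as np
import theano
import theano.tensor as T
import lasagne.layers as L

epsilon = 1e-6
x = T.matrix('x')   # activations entering the dense layer
r = T.matrix('r')   # relevance arriving from the layer above

l_x = L.InputLayer((None, 3), input_var=x)
l_r = L.InputLayer((None, 2), input_var=r)
l_z = L.DenseLayer(l_x, num_units=2, nonlinearity=None)  # pre-activations Z

def add_epsilon(Zs):
    # shift Z away from zero in the direction of its sign (hypothetical epsilon)
    return Zs + epsilon * (T.cast(Zs >= 0, theano.config.floatX) * 2.0 - 1.0)

l_inv = L.ExpressionLayer(l_z, lambda Zs: 1.0 / add_epsilon(Zs))
l_norm = L.ElemwiseMergeLayer([l_inv, l_r], merge_function=T.mul)

normalise = theano.function([x, r], L.get_output(l_norm))
print(normalise(np.ones((1, 3), dtype=theano.config.floatX),
                np.ones((1, 2), dtype=theano.config.floatX)))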
Code example #2
def layer_GatedConv2DLayer(
        incoming,
        num_filters,
        filter_size,
        stride=(1, 1),
        pad=0,
        nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
        name=''):
    la = ll.Conv2DLayer(incoming,
                        num_filters=num_filters,
                        filter_size=filter_size,
                        stride=stride,
                        pad=pad,
                        nonlinearity=nonlinearity,
                        name=name + '.activation')
    lg = ll.Conv2DLayer(incoming,
                        num_filters=num_filters,
                        filter_size=filter_size,
                        stride=stride,
                        pad=pad,
                        nonlinearity=theano.tensor.nnet.nnet.sigmoid,
                        name=name + '.gate')
    lout = ll.ElemwiseMergeLayer([la, lg],
                                 T.mul,
                                 cropping=None,
                                 name=name + '.mul_merge')
    return lout
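A short usage sketch for the helper above (the input shape and layer names are made up, and the module-level imports of the original snippet are assumed): two gated convolution blocks stacked on a single-channel input. The sigmoid branch acts as a per-position gate on the activation branch, and the ElemwiseMergeLayer multiplies the two feature maps element-wise.

import theano.tensor as T
import lasagne.layers as ll

x = T.tensor4('x')                                  # (batch, 1, H, W)
net = ll.InputLayer((None, 1, 28, 28), input_var=x)
net = layer_GatedConv2DLayer(net, 32, (3, 3), pad='same', name='gated1')
net = layer_GatedConv2DLayer(net, 64, (3, 3), pad='same', name='gated2')
y = ll.get_output(net)                              # symbolic gated feature maps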
Code example #3
    def _invert_DenseLayer(self, layer, feeder):
        # Warning: they are swapped here
        feeder = self._put_rectifiers(feeder, layer)
        feeder = self._get_normalised_relevance_layer(layer, feeder)

        output_units = np.prod(L.get_output_shape(layer.input_layer)[1:])
        output_layer = L.DenseLayer(feeder, num_units=output_units)
        W = output_layer.W
        tmp_shape = np.asarray((-1, ) + L.get_output_shape(output_layer)[1:])
        x_layer = L.ReshapeLayer(layer.input_layer, tmp_shape.tolist())
        output_layer = L.ElemwiseMergeLayer(incomings=[x_layer, output_layer],
                                            merge_function=T.mul)
        output_layer.W = W
        return output_layer
Code example #4
    def _invert_Conv2DLayer(self, layer, feeder):
        # Warning: they are swapped here
        feeder = self._put_rectifiers(feeder, layer)
        feeder = self._get_normalised_relevance_layer(layer, feeder)

        f_s = layer.filter_size
        if layer.pad == 'same':
            pad = 'same'
        elif layer.pad == 'valid' or layer.pad == (0, 0):
            pad = 'full'
        else:
            raise RuntimeError("Define your padding as valid or same.")

        # By definition, flip_filters must be on for this to be a proper
        # deconvolution.
        num_filters = L.get_output_shape(layer.input_layer)[1]
        if layer.stride == (4, 4):
            # TODO: similar code in the gradient-based explainers; merge them.
            feeder = L.Upscale2DLayer(feeder, layer.stride, mode='dilate')
            output_layer = L.Conv2DLayer(feeder,
                                         num_filters=num_filters,
                                         filter_size=f_s,
                                         stride=1,
                                         pad=pad,
                                         nonlinearity=None,
                                         b=None,
                                         flip_filters=True)
            conv_layer = output_layer
            tmp = L.SliceLayer(output_layer, slice(0, -3), axis=3)
            output_layer = L.SliceLayer(tmp, slice(0, -3), axis=2)
            output_layer.W = conv_layer.W
        else:
            output_layer = L.Conv2DLayer(feeder,
                                         num_filters=num_filters,
                                         filter_size=f_s,
                                         stride=1,
                                         pad=pad,
                                         nonlinearity=None,
                                         b=None,
                                         flip_filters=True)
        W = output_layer.W

        # Do the multiplication.
        x_layer = L.ReshapeLayer(layer.input_layer,
                                 (-1, ) + L.get_output_shape(output_layer)[1:])
        output_layer = L.ElemwiseMergeLayer(incomings=[x_layer, output_layer],
                                            merge_function=T.mul)
        output_layer.W = W
        return output_layer
Code example #5
File: word2vec.py  Project: julianser/word2vec
    def model(self, query_input, batch_size, query_vocab_size,
              context_vocab_size, emb_dim_size):
        l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)
        l_embed_continuous = L.EmbeddingLayer(l_input,
                                              input_size=query_vocab_size,
                                              output_size=emb_dim_size)
        l_values_discrete = L.EmbeddingLayer(l_input,
                                             input_size=query_vocab_size,
                                             output_size=emb_dim_size)
        l_probabilities_discrete = L.NonlinearityLayer(
            l_values_discrete, nonlinearity=lasagne.nonlinearities.softmax)
        l_embed_discrete = StochasticLayer(l_probabilities_discrete,
                                           estimator='MF')
        l_merge = L.ElemwiseSumLayer([l_embed_continuous, l_embed_discrete])
        l_out = L.DenseLayer(l_merge,
                             num_units=emb_dim_size,
                             nonlinearity=lasagne.nonlinearities.softmax)

        l_merge_2 = L.ElemwiseMergeLayer([l_out, l_embed_discrete],
                                         merge_function=T.mul)
        l_final_out = L.DenseLayer(l_merge_2, num_units=context_vocab_size)
        return l_values_discrete, l_final_out
Code example #6
    def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs):
        super(EncodingFullLayer, self).__init__(incomings, **kwargs)
        #        if len(self.input_shapes[0]) == 3:
        #            batch_size, w_count, w_length = self.input_shapes[0]
        shape = tuple(self.input_shapes[0])
        #        else:
        #            shape = tuple(self.input_shapes[0])

        self.WT = None
        #        self.reset_zero()
        self.l_in = LL.InputLayer(shape=shape)
        self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, ))
        self.l_emb = LL.EmbeddingLayer(self.l_in,
                                       input_size=vocab_size,
                                       output_size=emb_size,
                                       W=W)
        self.W = self.l_emb.W
        self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe),
                                           merge_function=T.mul)
        self.l_emb_res = LL.ExpressionLayer(self.l_emb,
                                            lambda X: X.sum(2),
                                            output_shape='auto')

        #        self.l_emb_res = SumLayer(self.l_emb, axis=2)
        if np.any(WT):
            self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT)
            self.WT = self.l_emb_res.T
        params = LL.helper.get_all_params(self.l_emb_res, trainable=True)
        values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True)
        for p, v in zip(params, values):
            self.add_param(p, v.shape, name=p.name)

        zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX)
        self.set_zero = theano.function(
            [zero_vec_tensor],
            updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor))
                     for x in [self.W]])
Code example #7
def build_model(hyparams,
                vocab,
                nclasses=2,
                batchsize=None,
                invar=None,
                maskvar=None,
                maxlen=MAXLEN):

    embedding_dim = hyparams.embedding_dim
    nhidden = hyparams.nhidden
    bidirectional = hyparams.bidirectional
    pool = hyparams.pool
    grad_clip = hyparams.grad_clip
    init = hyparams.init

    net = OrderedDict()

    V = len(vocab)
    W = lasagne.init.Normal()

    gate_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        b=lasagne.init.Constant(0.)
    )
    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh
    )

    # define model
    net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar)
    net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W)
    net['fwd1'] = layer.LSTMLayer(
        net['emb'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True
    )
    if bidirectional:
        net['bwd1'] = layer.LSTMLayer(
            net['emb'],
            num_units=nhidden,
            grad_clipping=grad_clip,
            nonlinearity=lasagne.nonlinearities.tanh,
            mask_input=net['mask'],
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            learn_init=True,
            backwards=True
        )

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean)
    else:
        net['pool'] = layer.ConcatLayer([net['fwd1']])
    net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5)
    net['fwd2'] = layer.LSTMLayer(
        net['dropout1'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True,
        only_return_final=True
    )
    net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6)
    net['softmax'] = layer.DenseLayer(
        net['dropout2'],
        num_units=nclasses,
        nonlinearity=lasagne.nonlinearities.softmax
    )
    ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)}
    logstr = '========== MODEL ========== \n'
    logstr += 'vocab size: %d\n' % V
    logstr += 'embedding dim: %d\n' % embedding_dim
    logstr += 'nhidden: %d\n' % nhidden
    logstr += 'pooling: %s\n' % pool
    for lname, lyr in net.items():
        logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME)))
    logstr += '=========================== \n'
    print(logstr)
    return net
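A hypothetical driver for build_model (not from the original project): the hyparams fields are inferred from the attributes read at the top of the function, and vocab, maxlen and the symbolic inputs are made up. It assumes the module-level names the snippet relies on (lasagne.layers as layer, OrderedDict, get_output_shape) are available.

from collections import namedtuple
import theano.tensor as T

HParams = namedtuple('HParams',
                     'embedding_dim nhidden bidirectional pool grad_clip init')
hyparams = HParams(embedding_dim=50, nhidden=128, bidirectional=True,
                   pool='mean', grad_clip=100, init='normal')

invar = T.imatrix('tweets')    # integer token ids, (batch, maxlen)
maskvar = T.matrix('mask')     # 0/1 mask, (batch, maxlen)
vocab = {w: i for i, w in enumerate(['<pad>', 'the', 'cat', 'sat'])}

net = build_model(hyparams, vocab, nclasses=2,
                  invar=invar, maskvar=maskvar, maxlen=140)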
Code example #8
    def __init__(self,
                 incomings,
                 vocab_size,
                 emb_size,
                 A=lasagne.init.Normal(std=0.1),
                 C=lasagne.init.Normal(std=0.1),
                 AT=lasagne.init.Normal(std=0.1),
                 CT=lasagne.init.Normal(std=0.1),
                 nonlin=lasagne.nonlinearities.softmax,
                 RN=0.,
                 **kwargs):
        super(MemoryLayer, self).__init__(incomings, **kwargs)

        self.vocab_size, self.emb_size = vocab_size, emb_size
        self.nonlin = nonlin
        self.RN = RN
        #        self.A, self.C, self.AT, self.CT = A, C, AT, CT

        batch_size, c_count, c_length = self.input_shapes[0]
        _, q_count, _ = self.input_shapes[2]

        self.l_c_in = LL.InputLayer(shape=(batch_size, c_count, c_length))
        self.l_c_in_pe = LL.InputLayer(shape=(batch_size, c_count, c_length,
                                              self.emb_size))
        self.l_u_in = LL.InputLayer(shape=(batch_size, q_count, self.emb_size))

        self.l_c_A_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                           self.vocab_size, self.emb_size, A,
                                           AT)
        self.l_c_C_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                           self.vocab_size, self.emb_size, C,
                                           CT)
        self.A, self.C = self.l_c_A_enc.W, self.l_c_C_enc.W
        self.AT, self.CT = self.l_c_A_enc.WT, self.l_c_C_enc.WT
        # if there are also probabilities over the sentences
        if len(incomings) == 4:
            self.l_in_ac_prob = LL.InputLayer(shape=(batch_size, c_count,
                                                     emb_size))
            self.l_c_A_enc_ = LL.ElemwiseMergeLayer(
                (self.l_c_A_enc, self.l_in_ac_prob), merge_function=T.mul)
            self.l_c_C_enc_ = LL.ElemwiseMergeLayer(
                (self.l_c_C_enc, self.l_in_ac_prob), merge_function=T.mul)

        self.l_u_in_tr = LL.DimshuffleLayer(self.l_u_in, pattern=(0, 2, 1))
        if len(incomings) == 4:
            self.l_p = BatchedDotLayer((self.l_c_A_enc_, self.l_u_in_tr))
        else:
            self.l_p = BatchedDotLayer((self.l_c_A_enc, self.l_u_in_tr))

        if self.l_p.output_shape[2] == 1:
            self.l_p = LL.FlattenLayer(self.l_p, outdim=2)

        #            self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1))

        if self.nonlin == 'MaxOut':
            raise NotImplementedError
        self.l_p = LL.NonlinearityLayer(self.l_p, nonlinearity=nonlin)
        self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1, 'x'))
        #        self.l_p = LL.ReshapeLayer(self.l_p, self.l_p.output_shape + (1,))
        self.l_p = LL.ExpressionLayer(self.l_p,
                                      lambda X: X.repeat(emb_size, 2),
                                      output_shape='auto')
        ##        self.l_p = RepeatDimLayer(self.l_p, emb_size, axis=2)
        if len(incomings) == 4:
            self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc_),
                                              merge_function=T.mul)
        else:
            self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc),
                                              merge_function=T.mul)
        self.l_o = LL.ExpressionLayer(self.l_pc,
                                      lambda X: X.sum(1),
                                      output_shape='auto')
        #        self.l_o = SumLayer(self.l_pc, axis=1)
        self.l_o = LL.DimshuffleLayer(self.l_o, pattern=(0, 'x', 1))
        self.l_o_u = LL.ElemwiseMergeLayer((self.l_o, self.l_u_in),
                                           merge_function=T.add)

        params = LL.helper.get_all_params(self.l_o_u, trainable=True)
        values = LL.helper.get_all_param_values(self.l_o_u, trainable=True)
        for p, v in zip(params, values):
            self.add_param(p, v.shape, name=p.name)
Code example #9
File: common.py  Project: mborisyak/craynn
from lasagne import layers, nonlinearities

import theano.tensor as T

__all__ = [
    'take', 'minimum', 'maximum', 'concat', 'noise', 'nothing', 'dropout',
    'dense', 'select', 'batch_norm', 'elementwise', 'elementwise_sum',
    'elementwise_mean', 'flatten', 'feature_pool', 'nonlinearity'
]

get_common_nonlinearity = lambda f=None: nonlinearities.LeakyRectify(
    0.1) if f is None else f

minimum = lambda: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.minimum)
maximum = lambda: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.maximum)
concat = lambda axis=1: lambda incomings: layers.ConcatLayer(incomings,
                                                             axis=axis)

noise = lambda sigma=0.1: lambda incoming: \
  layers.GaussianNoiseLayer(incoming, sigma=sigma) if sigma is not None and sigma > 0 else incoming

nothing = lambda incoming: incoming

dense = lambda num_units, f=None: lambda incoming: \
  layers.DenseLayer(incoming, num_units=num_units, nonlinearity=(nonlinearities.LeakyRectify(0.05) if f is None else f))

dropout = lambda p=0.1, rescale=True: lambda incoming: \
  layers.DropoutLayer(incoming, p=p, rescale=rescale) if p is not None else incoming
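A short usage sketch (layer sizes are made up; assumes the definitions above are in scope): each combinator returns a function from incoming layer(s) to a new layer, so networks compose as small pipelines.

l_a = layers.InputLayer((None, 32))
l_b = layers.InputLayer((None, 32))

h_a = dense(64)(l_a)                   # DenseLayer with the leaky-rectify default
h_b = dense(64)(l_b)
merged = maximum()([h_a, h_b])         # ElemwiseMergeLayer with T.maximum
out = dropout(0.1)(dense(10)(merged))  # DenseLayer followed by DropoutLayer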
Code example #10
    def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, candmask_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_fwd_doc_1 = L.GRULayer(l_doce,
                                 NUM_HIDDEN,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce,
                                 NUM_HIDDEN,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True,
                                 backwards=True)

        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)
        l_doc_1 = L.dropout(l_doc_1, p=DROPOUT_RATE)

        l_fwd_q_c = L.GRULayer(l_qembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_c = L.GRULayer(l_qembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_fwd_q_slice_c = L.SliceLayer(l_fwd_q_c, -1, 1)
        l_bkd_q_slice_c = L.SliceLayer(l_bkd_q_c, 0, 1)
        l_q_c = L.ConcatLayer([l_fwd_q_slice_c, l_bkd_q_slice_c])  # B x DE

        qd = L.get_output(l_q_c)
        q_rep = T.reshape(
            T.tile(qd, (1, doc_var.shape[1])),
            (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN))  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                  input_var=q_rep)
        l_doc_gru_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)

        l_fwd_doc = L.GRULayer(l_doc_gru_in,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doc_gru_in,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
                p[candmask_var.nonzero()]))

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
                p[candmask_var.nonzero()]))

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final_v = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        return final, final_v, l_doc, [l_q, l_q_c]
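The inc_subtensor/alloc expression above scatters the per-position attention weights pm into a distribution over the vocabulary, with repeated token ids accumulating. A tiny standalone numeric sketch of just that step, with made-up sizes:

import numpy as np
import theano
import theano.tensor as T

vocab_size = 10
pm = T.matrix('pm')      # (B, N) attention over document positions
doc = T.imatrix('doc')   # (B, N) token ids

index = T.reshape(T.repeat(T.arange(pm.shape[0]), pm.shape[1]), pm.shape)
dist = T.inc_subtensor(T.alloc(0., pm.shape[0], vocab_size)[index, doc], pm)

f = theano.function([pm, doc], dist)
print(f(np.array([[0.2, 0.5, 0.3]], dtype=theano.config.floatX),
        np.array([[4, 7, 4]], dtype='int32')))
# one row: 0.5 at id 4 (0.2 + 0.3 accumulated) and 0.5 at id 7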
Code example #11
    def build_network(self, K, vocab_size, W_init, C_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qemb = L.EmbeddingLayer(l_qin,
                                  input_size=vocab_size,
                                  output_size=self.embed_dim,
                                  W=l_docembed.W)
        l_qembed = L.ReshapeLayer(
            l_qemb, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')
            l_qemb.params[l_qemb.W].remove('trainable')

        # char embeddings
        if self.use_subs:
            if C_init is not None:
                l_lookup = L.EmbeddingLayer(l_tokin,
                                            self.num_chars,
                                            self.sub_dim,
                                            W=C_init)  # T x L x D
            else:
                l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                            self.sub_dim)  # T x L x D
            l_fgru = L.GRULayer(l_lookup,
                                self.sub_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                only_return_final=True)
            l_bgru = L.GRULayer(l_lookup,
                                self.sub_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                backwards=True,
                                only_return_final=True)  # T x 2D
            l_fwdembed = L.DenseLayer(l_fgru,
                                      self.embed_dim,
                                      nonlinearity=None)  # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru,
                                      self.embed_dim,
                                      nonlinearity=None)  # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2
            # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)
            #l_doce = L.ElemwiseSumLayer([l_doce, l_docchar_embed], coeffs=1)
            #l_qembed = L.ElemwiseSumLayer([l_qembed, l_qchar_embed], coeffs=1)
            l_doce = L.ElemwiseMergeLayer([l_doce, l_docchar_embed],
                                          merge_function=T.mul)
            l_qembed = L.ElemwiseMergeLayer([l_qembed, l_qchar_embed],
                                            merge_function=T.mul)
        attentions = []
        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doce, l_qembed])
            attentions.append(L.get_output(l_m, deterministic=True))

        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE

            l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
            l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m],
                                             gating_fn=self.gating_fn,
                                             mask_input=self.inps[7])
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE
            if self.save_attn:
                attentions.append(L.get_output(l_m, deterministic=True))

        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

        # final layer
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)
        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)
        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doc, l_q])
            attentions.append(L.get_output(l_m, deterministic=True))

        l_prob = AttentionSumLayer([l_doc, l_q],
                                   self.inps[4],
                                   self.inps[12],
                                   mask_input=self.inps[10])
        final = L.get_output(l_prob)
        final_v = L.get_output(l_prob, deterministic=True)
        return final, final_v, l_prob, l_docembed.W, attentions
Code example #12
    def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, candmask_var, feat_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if not EMB_TRAIN: l_docembed.params[l_docembed.W].remove('trainable')

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
            l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
            l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1,
                                     l_bkd_q_slice_1])  # B x DE

            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)
            q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])),
                              (doc_var.shape[0], doc_var.shape[1],
                               2 * NUM_HIDDEN))  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE)  # B x N x DE

        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final_v = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        return final, final_v, l_doc, l_qs
Code example #13
File: feat.py  Project: EggplantElf/sclem2017-tagger
    def additional_layer(self, idx_layer, emb_layer, avg=False):
        suf = '_avg' if avg else ''
        if self.name == 'char':
            if self.args.char_model == 'cnn':
                lds = L.dimshuffle(emb_layer,
                                   (0, 3, 1, 2))  # (100, 16, 26, 32)
                ls = []
                for n in self.args.ngrams:
                    lconv = L.Conv2DLayer(
                        lds,
                        self.args.conv_dim,
                        (1, n),
                        untie_biases=False,
                        # W=HeNormal('relu') if not avg else Constant(),
                        W=GlorotNormal('relu') if not avg else Constant(),
                        name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)

                    lpool = L.MaxPool2DLayer(lconv,
                                             (1, self.args.max_word_len - n +
                                              1))  # (100, 64, 26, 1)
                    lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                    lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                    ls.append(lpool)
                xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
                # additional
                # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None, name='echar_affine', num_leading_axes=2,
                # W=HeNormal() if not avg else Constant()) # (100, 26, 100)
                return xc
            elif self.args.char_model == 'lstm':
                ml = L.ExpressionLayer(
                    idx_layer,
                    lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
                ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)

                gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal())
                cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal(),
                                               W_cell=None,
                                               nonlinearity=tanh)

                lstm_in = L.reshape(
                    emb_layer,
                    (-1, self.args.max_word_len,
                     self.config['char']['emb_dim']))  # (1500, 32, 16)
                lstm_f = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    name='forward' + suf)  # (1500, 32)
                lstm_b = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    backwards=True,
                    name='backward' + suf)  # (1500, 32)
                remove_reg(lstm_f)
                remove_reg(lstm_b)
                if avg:
                    set_zero(lstm_f)
                    set_zero(lstm_b)
                xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
                if self.args.lstm_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
                elif self.args.trans_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
                else:
                    xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
                return xc

        elif self.name == 'morph':
            # idx (100, 26/161, 16)  emb (100, 26/161, 16, 32)
            if self.args.morph_model == 'max':
                xm = L.MaxPool2DLayer(
                    emb_layer,
                    (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
                # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim'])) # (100, 26/161, 32)
                xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
                # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
            elif self.args.morph_model == 'avg':
                mask = L.ExpressionLayer(
                    idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
                mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
                mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat(
                    x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 32)
                xm = L.ElemwiseMergeLayer([
                    emb_layer, mask
                ], lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
                # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim'])) # (100, 26, 32)
            return xm
        else:
            return emb_layer
Code example #14
    def build_network(self, K, vocab_size, doc_var, query_var, cand_var,
                      docmask_var, qmask_var, candmask_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
            l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
            l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1,
                                     l_bkd_q_slice_1])  # B x DE

            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)
            q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])),
                              (doc_var.shape[0], doc_var.shape[1],
                               2 * self.nhidden))  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)

        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, cand_var)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, cand_var)

        return final, final_v, l_doc, l_qs, l_docembed.W
Code example #15
    def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

        batch_size = self.mask_context_var.shape[0]
        context_len = self.mask_context_var.shape[1]
        question_len = self.question_var.shape[1]
        context_word_len = self.context_char_var.shape[2]
        question_word_len = self.question_char_var.shape[2]

        self.batch_size = batch_size
        self.context_len = context_len
        ''' Inputs and word embeddings'''

        l_context_char = LL.InputLayer(shape=(None, None, None),
                                       input_var=self.context_char_var)
        l_question_char = LL.InputLayer(shape=(None, None, None),
                                        input_var=self.question_char_var)

        l_c_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_context_var)
        l_q_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_question_var)

        l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_context_char_var)
        l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_question_char_var)

        l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.context_var)
        l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.question_var)

        if self.train_unk:
            l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_context_unk_var)
            l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_question_unk_var)

            l_c_emb = TrainUnkLayer(l_c_emb,
                                    l_c_unk_mask,
                                    output_size=self.emb_size,
                                    W=self.word_embeddings[0])

            l_q_emb = TrainUnkLayer(l_q_emb,
                                    l_q_unk_mask,
                                    output_size=self.emb_size,
                                    W=l_c_emb.W)

        if self.negative:
            l_c_emb = TrainNAWLayer(l_c_emb,
                                    l_c_mask,
                                    output_size=self.emb_size)
        ''' Char-embeddings '''

        # (batch_size x context_len x context_word_len x emb_char_size)
        l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size)

        l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size,
                                         W=l_c_char_emb.W)

        # here I do multiplication of character embeddings with masks,
        # because I want to pad them with constant zeros

        l_c_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
        l_q_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

        l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask],
                                             T.mul)
        l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask],
                                             T.mul)

        # convolutions

        l_c_char_emb = LL.dimshuffle(
            LL.reshape(l_c_char_emb, (batch_size * context_len,
                                      context_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       pad=self.conv)
        # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)

        l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_c_char_emb = LL.reshape(
            l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

        l_q_char_emb = LL.dimshuffle(
            LL.reshape(l_q_char_emb, (batch_size * question_len,
                                      question_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       W=l_c_char_conv.W,
                                       b=l_c_char_conv.b,
                                       pad=self.conv)
        # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)

        l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_q_char_emb = LL.reshape(
            l_q_char_emb,
            (batch_size, question_len, self.num_emb_char_filters))
        ''' Concatenating both embeddings '''

        l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
        l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

        # originally I had dropout here
        ''' Highway layer allowing for interaction between embeddings '''

        l_c_P = LL.reshape(l_c_emb,
                           (batch_size * context_len,
                            self.emb_size + self.num_emb_char_filters))
        l_c_P = LL.DenseLayer(l_c_P,
                              num_units=self.rec_size,
                              b=None,
                              nonlinearity=None)

        l_c_high = HighwayLayer(l_c_P)
        l_c_emb = LL.reshape(l_c_high,
                             (batch_size, context_len, self.rec_size))

        l_q_P = LL.reshape(l_q_emb,
                           (batch_size * question_len,
                            self.emb_size + self.num_emb_char_filters))
        l_q_P = LL.DenseLayer(l_q_P,
                              num_units=self.rec_size,
                              W=l_c_P.W,
                              b=None,
                              nonlinearity=None)

        l_q_high = HighwayLayer(l_q_P,
                                W1=l_c_high.W1,
                                b1=l_c_high.b1,
                                W2=l_c_high.W2,
                                b2=l_c_high.b2)
        l_q_emb = LL.reshape(l_q_high,
                             (batch_size, question_len, self.rec_size))
        ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

        l_weighted_feat = WeightedFeatureLayer(
            [l_c_emb, l_q_emb, l_c_mask, l_q_mask])  # batch_size x context_len
        l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

        # batch_size x context_len
        l_bin_feat = LL.InputLayer(shape=(None, None),
                                   input_var=self.bin_feat_var)
        l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))
        ''' Dropout at the embeddings '''

        if emb_dropout:
            print('Using dropout after wiq calculation.')
            l_c_emb = LL.dropout(l_c_emb)
            l_q_emb = LL.dropout(l_q_emb)
        ''' Here we concatenate wiq features to embeddings'''

        # both features are concatenated to the embeddings
        # for the question we fix the features to 1
        l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
        l_q_emb = LL.pad(l_q_emb,
                         width=[(0, 2)],
                         val=L.utils.floatX(1),
                         batch_ndim=2)
        ''' Context and question encoding using the same BiLSTM for both '''

        # output shape is (batch_size x context_len x rec_size)
        l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask)

        l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask,
                                    backwards=True)

        # output shape is (batch_size x question_len x rec_size)
        l_q_enc_forw = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                           W_hid=l_c_enc_forw.W_hid_to_ingate,
                           W_cell=l_c_enc_forw.W_cell_to_ingate,
                           b=l_c_enc_forw.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                               W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                               W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                               b=l_c_enc_forw.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                            W_hid=l_c_enc_forw.W_hid_to_outgate,
                            W_cell=l_c_enc_forw.W_cell_to_outgate,
                            b=l_c_enc_forw.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                         W_hid=l_c_enc_forw.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_forw.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        l_q_enc_back = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            backwards=True,
            ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                           W_hid=l_c_enc_back.W_hid_to_ingate,
                           W_cell=l_c_enc_back.W_cell_to_ingate,
                           b=l_c_enc_back.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                               W_hid=l_c_enc_back.W_hid_to_forgetgate,
                               W_cell=l_c_enc_back.W_cell_to_forgetgate,
                               b=l_c_enc_back.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                            W_hid=l_c_enc_back.W_hid_to_outgate,
                            W_cell=l_c_enc_back.W_cell_to_outgate,
                            b=l_c_enc_back.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                         W_hid=l_c_enc_back.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_back.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        # batch_size x context_len  x 2*rec_size
        l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
        # batch_size x question_len x 2*rec_size
        l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

        def proj_init():
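            # Stacking two identity matrices means that, at initialisation,
            # projecting the concatenated [forward; backward] encoding is just
            # their element-wise sum (before the tanh is applied).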
            return np.vstack([
                np.eye(self.rec_size, dtype=theano.config.floatX),
                np.eye(self.rec_size, dtype=theano.config.floatX)
            ])

        # this is H from the paper, shape: (batch_size * context_len x
        # rec_size)
        l_c_proj = LL.reshape(l_c_enc,
                              (batch_size * context_len, 2 * self.rec_size))
        l_c_proj = LL.DenseLayer(l_c_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)

        # this is Z from the paper, shape: (batch_size * question_len x
        # rec_size)
        l_q_proj = LL.reshape(l_q_enc,
                              (batch_size * question_len, 2 * self.rec_size))
        l_q_proj = LL.DenseLayer(l_q_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)
        ''' Additional, weighted question encoding (alphas from paper) '''

        l_alpha = LL.DenseLayer(
            l_q_proj,  # batch_size * question_len x 1
            num_units=1,
            b=None,
            nonlinearity=None)

        # batch_size x question_len
        l_alpha = MaskedSoftmaxLayer(
            LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

        # batch_size x rec_size
        l_z_hat = BatchedDotLayer([
            LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
            l_alpha
        ])

        return l_c_proj, l_z_hat
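MaskedSoftmaxLayer, BatchedDotLayer, WeightedFeatureLayer and HighwayLayer above are custom layers, not part of Lasagne. As a rough sketch (an assumption about its behaviour, not the author's exact implementation), the masked softmax used for the question weights alpha can be written directly in Theano as follows; the function name is illustrative only:

import theano.tensor as T

def masked_softmax(scores, mask, eps=1e-8):
    # scores, mask: (batch_size, question_len); mask is 1 on real tokens, 0 on padding
    scores = scores - scores.max(axis=1, keepdims=True)  # numerical stability
    e = T.exp(scores) * mask                             # zero out padded positions
    return e / (e.sum(axis=1, keepdims=True) + eps)

Normalising over the masked exponentials makes the alphas sum to one over real question tokens only, so the weighted summary z_hat ignores padding.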
Code Example #16
    def build_network(self, K, vocab_size, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)
        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        # char embeddings
        if self.use_chars:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                        2 * self.char_dim)  # T x L x D
            l_fgru = L.GRULayer(l_lookup,
                                self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                only_return_final=True)
            l_bgru = L.GRULayer(l_lookup,
                                2 * self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                backwards=True,
                                only_return_final=True)  # T x 2D
            l_fwdembed = L.DenseLayer(l_fgru,
                                      self.embed_dim // 2,
                                      nonlinearity=None)  # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru,
                                      self.embed_dim // 2,
                                      nonlinearity=None)  # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

            l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)

        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D
        q = L.get_output(l_q)  # B x Q x 2D
        q = q[T.arange(q.shape[0]), self.inps[12], :]  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE
            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)  # B x Q x DE
            dd = L.get_output(l_doc_1)  # B x N x DE
            M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
            alphas = T.nnet.softmax(
                T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
            alphas_r = T.reshape(alphas, (M.shape[0], M.shape[1], M.shape[2])) \
                * self.inps[7][:, np.newaxis, :]  # B x N x Q
            alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis]  # B x N x Q
            q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                      input_var=q_rep)
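            # Gated attention: each document token representation is rescaled
            # element-wise by its attention-weighted query summary q_rep.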
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE

        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, self.inps[4])

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, self.inps[4])

        return final, final_v, l_doc, l_qs, l_docembed.W
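The attention block inside the K-1 loop (pairwise scores M, a masked softmax over query positions, and element-wise gating of the document states) can be sanity-checked in isolation. Below is a shape-level NumPy sketch of the same computation under assumed toy dimensions; it is not the Theano training graph itself and the array names are illustrative:

import numpy as np

B, N, Q, D = 2, 7, 5, 4                              # batch, doc len, query len, 2*nhidden
dd = np.random.randn(B, N, D).astype('float32')      # document states
qd = np.random.randn(B, Q, D).astype('float32')      # query states
qmask = np.ones((B, Q), dtype='float32')             # 1 for real query tokens, 0 for padding

M = np.einsum('bnd,bqd->bnq', dd, qd)                # B x N x Q pairwise scores
alphas = np.exp(M - M.max(axis=2, keepdims=True)) * qmask[:, None, :]
alphas /= alphas.sum(axis=2, keepdims=True)          # masked softmax over Q
q_rep = np.einsum('bnq,bqd->bnd', alphas, qd)        # B x N x D query summary per token
gated = dd * q_rep                                   # element-wise gating
assert gated.shape == (B, N, D)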
Code Example #17
    def __init__(
        self,
        image_shape,
        filter_shape,
        num_class,
        conv_type,
        kernel_size,
        kernel_pool_size,
        dropout_rate,
    ):
        """

        """
        self.filter_shape = filter_shape
        self.n_visible = numpy.prod(image_shape)
        self.n_layers = len(filter_shape)
        self.rng = RandomStreams(123)
        self.x = T.matrix()
        self.y = T.ivector()

        self.conv_layers = []

        NoiseLayer = layers.DropoutLayer

        dropout_rate = float(dropout_rate)

        self.l_input = layers.InputLayer((None, self.n_visible), self.x)
        this_layer = layers.ReshapeLayer(self.l_input, ([0], ) + image_shape)

        for l in range(self.n_layers):
            activation = lasagne.nonlinearities.rectify
            if len(filter_shape[l]) == 3:
                if conv_type == 'double' and filter_shape[l][1] > kernel_size:
                    this_layer = DoubleConvLayer(
                        this_layer,
                        filter_shape[l][0],
                        filter_shape[l][1:],
                        pad='same',
                        nonlinearity=activation,
                        kernel_size=kernel_size,
                        kernel_pool_size=kernel_pool_size)
                    this_layer = layers.batch_norm(this_layer)
                elif conv_type == 'maxout':
                    this_layer = layers.Conv2DLayer(this_layer,
                                                    filter_shape[l][0],
                                                    filter_shape[l][1:],
                                                    b=None,
                                                    pad='same',
                                                    nonlinearity=None)
                    this_layer = layers.FeaturePoolLayer(
                        this_layer, pool_size=kernel_pool_size**2)
                    this_layer = layers.BatchNormLayer(this_layer)
                    this_layer = layers.NonlinearityLayer(
                        this_layer, activation)

                elif conv_type == 'cyclic':
                    this_layers = []
                    this_layers.append(
                        layers.Conv2DLayer(this_layer,
                                           filter_shape[l][0],
                                           filter_shape[l][1:],
                                           b=None,
                                           pad='same',
                                           nonlinearity=None))
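                    # Each additional branch reuses the previous W rotated by a
                    # further 90 degrees, so all four convolutions share one
                    # underlying filter tensor.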
                    for _ in range(3):
                        W = this_layers[-1].W.dimshuffle(0, 1, 3,
                                                         2)[:, :, :, ::-1]
                        this_layers.append(
                            layers.Conv2DLayer(this_layer,
                                               filter_shape[l][0],
                                               filter_shape[l][1:],
                                               W=W,
                                               b=None,
                                               pad='same',
                                               nonlinearity=None))
                    this_layer = layers.ElemwiseMergeLayer(
                        this_layers, T.maximum)
                    this_layer = layers.BatchNormLayer(this_layer)
                    this_layer = layers.NonlinearityLayer(
                        this_layer, activation)

                elif conv_type == 'standard' \
                     or (conv_type == 'double' and filter_shape[l][1] <= kernel_size):
                    this_layer = layers.Conv2DLayer(this_layer,
                                                    filter_shape[l][0],
                                                    filter_shape[l][1:],
                                                    pad='same',
                                                    nonlinearity=activation)
                    this_layer = layers.batch_norm(this_layer)
                else:
                    raise NotImplementedError

                self.conv_layers.append(this_layer)

            elif len(filter_shape[l]) == 2:
                this_layer = layers.MaxPool2DLayer(this_layer, filter_shape[l])
                this_layer = NoiseLayer(this_layer, dropout_rate)
            elif len(filter_shape[l]) == 1:
                raise NotImplementedError

        self.top_conv_layer = this_layer
        this_layer = layers.GlobalPoolLayer(this_layer, T.mean)
        self.clf_layer = layers.DenseLayer(this_layer,
                                           num_class,
                                           W=lasagne.init.Constant(0.),
                                           nonlinearity=T.nnet.softmax)

        self.params = layers.get_all_params(self.clf_layer, trainable=True)

        self.params_all = layers.get_all_params(self.clf_layer)
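The 'cyclic' variant above ties the four convolutions together by rotating one weight tensor with dimshuffle(0, 1, 3, 2)[:, :, :, ::-1]. A small NumPy check of that index manipulation (illustrative arrays only, not part of the model):

import numpy as np

# (num_filters, channels, height, width) kernels with square spatial dims
W = np.arange(2 * 1 * 3 * 3).reshape(2, 1, 3, 3)

# The same transpose-and-flip used above rotates each kernel by 90 degrees.
W_rot = W.transpose(0, 1, 3, 2)[:, :, :, ::-1]
assert W_rot.shape == W.shape

# Applying the rotation four times returns the original kernels.
W4 = W
for _ in range(4):
    W4 = W4.transpose(0, 1, 3, 2)[:, :, :, ::-1]
assert np.array_equal(W4, W)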
Code Example #18
def build_model(vmap,
                nclasses=2,
                embedding_dim=50,
                nhidden=256,
                batchsize=None,
                invar=None,
                maskvar=None,
                bidirectional=True,
                pool=True,
                grad_clip=100,
                maxlen=MAXLEN):

    V = len(vmap)
    W = lasagne.init.Normal()

    # Input Layer
    # TODO: should be (batchsize, maxlen, vocab_size)
    l_in = layer.InputLayer((batchsize, maxlen, V), input_var=invar)
    l_mask = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    ASSUME = {l_in: (200, 140, 94), l_mask: (200, 140)}
    print('Input Layer')
    print('output:', get_output_shape(l_in, ASSUME))
    print('output(mask):', get_output_shape(l_mask, ASSUME))
    print()

    # Embedding Layer
    l_emb = layer.EmbeddingLayer(l_in,
                                 input_size=V,
                                 output_size=embedding_dim,
                                 W=W)
    print('Embedding Layer')
    print('output:', get_output_shape(l_emb, ASSUME))

    gate_params = layer.recurrent.Gate(W_in=lasagne.init.Orthogonal(),
                                       W_hid=lasagne.init.Orthogonal(),
                                       b=lasagne.init.Constant(0.))

    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh)

    l_fwd = layer.LSTMLayer(l_emb,
                            num_units=nhidden,
                            grad_clipping=grad_clip,
                            nonlinearity=lasagne.nonlinearities.tanh,
                            mask_input=l_mask,
                            ingate=gate_params,
                            forgetgate=gate_params,
                            cell=cell_params,
                            outgate=gate_params,
                            learn_init=True)

    print('Forward LSTM')
    print('output:', get_output_shape(l_fwd, ASSUME))

    l_concat = None
    if bidirectional:
        l_bwd = layer.LSTMLayer(l_emb,
                                num_units=nhidden,
                                grad_clipping=grad_clip,
                                nonlinearity=lasagne.nonlinearities.tanh,
                                mask_input=l_mask,
                                ingate=gate_params,
                                forgetgate=gate_params,
                                cell=cell_params,
                                outgate=gate_params,
                                learn_init=True,
                                backwards=True)
        print('Backward LSTM')
        print('output:', get_output_shape(l_bwd, ASSUME))

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        if pool:
            l_concat = layer.ElemwiseMergeLayer([l_fwd, l_bwd], tmean)
        else:
            l_concat = layer.ConcatLayer([l_fwd, l_bwd], axis=2)
    else:
        l_concat = layer.ConcatLayer([l_fwd])
    print('Concat')
    print('output:', get_output_shape(l_concat, ASSUME))

    l_concat = layer.DropoutLayer(l_concat, p=0.5)

    l_lstm2 = layer.LSTMLayer(l_concat,
                              num_units=nhidden,
                              grad_clipping=grad_clip,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              mask_input=l_mask,
                              ingate=gate_params,
                              forgetgate=gate_params,
                              cell=cell_params,
                              outgate=gate_params,
                              learn_init=True,
                              only_return_final=True)

    print('LSTM #2')
    print('output:', get_output_shape(l_lstm2, ASSUME))

    l_lstm2 = layer.DropoutLayer(l_lstm2, p=0.6)

    network = layer.DenseLayer(l_lstm2,
                               num_units=nclasses,
                               nonlinearity=lasagne.nonlinearities.softmax)

    print('Dense Layer')
    print('output:', get_output_shape(network, ASSUME))

    return network
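For reference, the two merge strategies above differ only in the output feature size: the tmean merge averages the two directions and keeps nhidden features per step, while concatenation doubles them. A quick NumPy illustration using the ASSUME batch and sequence sizes (toy random data, assumed shapes only):

import numpy as np

B, T, H = 200, 140, 256                           # batch, sequence length, nhidden
fwd = np.random.randn(B, T, H).astype('float32')  # forward LSTM outputs
bwd = np.random.randn(B, T, H).astype('float32')  # backward LSTM outputs

pooled = (fwd + bwd) / 2.0                        # what tmean computes element-wise
concat = np.concatenate([fwd, bwd], axis=2)       # the ConcatLayer alternative

assert pooled.shape == (B, T, H)
assert concat.shape == (B, T, 2 * H)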