Example #1
        def __init__(self, We_initial, params):
                self.textfile = open(params.outfile, 'w')
                We = theano.shared(We_initial)
                embsize = We_initial.shape[1]
                hidden = params.hidden
                trans = np.random.uniform(-0.01, 0.01, (26, 26)).astype('float32')
                transition = theano.shared(trans)

                input_var = T.imatrix(name='inputs')
                target_var = T.imatrix(name='targets')
                mask_var = T.fmatrix(name='masks')
                mask_var1 = T.fmatrix(name='masks1')
                length = T.iscalar()


                l_in_word = lasagne.layers.InputLayer((None, None))
                l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
                if params.emb == 1:
                        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
                else:
                        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

                l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
                l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
                concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
                l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2*hidden))
                l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

                local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var})
                local_energy = local_energy.reshape((-1, length, 25))
                local_energy = local_energy * mask_var[:, :, None]
                end_term = transition[:-1, -1]
                local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]
                length_index = T.sum(mask_var, axis=1)

                loss_train = crf_loss0(local_energy,  transition, target_var, mask_var).mean()
                prediction, corr = crf_accuracy0(local_energy, transition, target_var, mask_var)
                corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
                num_tokens = mask_var.sum(dtype=theano.config.floatX)

                network_params = lasagne.layers.get_all_params(l_local, trainable=True)
                network_params.append(transition)
                print network_params
                self.network_params = network_params
                loss_train = loss_train + params.l2*sum(lasagne.regularization.l2(x) for x in network_params)
                updates = lasagne.updates.sgd(loss_train, network_params, params.eta)
                updates = lasagne.updates.apply_momentum(updates, network_params, momentum=0.9)
                self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length], [loss_train, corr_train, num_tokens, local_energy], updates=updates, on_unused_input='ignore')
                self.eval_fn = theano.function([input_var, target_var, mask_var, mask_var1, length], [loss_train, corr_train, num_tokens, prediction], on_unused_input='ignore')
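This example's train_fn and eval_fn take two masks: mask_var marks the real (non-padding) tokens, and, judging by how it scales the end-of-sequence transition term, mask_var1 plausibly marks only the final token of each sentence. Below is a minimal sketch, under that assumption, of how such masks could be built from sentence lengths; the helper name and shapes are illustrative and not part of the original code.

# Hedged sketch (not from the original repo): build the two float32 masks used
# above from per-sentence lengths. `mask` is 1.0 over real tokens; `mask1` is
# assumed to mark only the last token of each sentence.
import numpy as np

def build_masks(lengths, max_len):
    batch = len(lengths)
    mask = np.zeros((batch, max_len), dtype='float32')
    mask1 = np.zeros((batch, max_len), dtype='float32')
    for i, n in enumerate(lengths):
        mask[i, :n] = 1.0       # real tokens
        mask1[i, n - 1] = 1.0   # final token only (assumed)
    return mask, mask1

# Example: two sentences of length 3 and 5, padded to length 5.
# mask, mask1 = build_masks([3, 5], 5)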
Example #2
    def __init__(self, params, data):

        self.get_pos_map(data)
        self.cap = params.cap
        self.lowercase = params.lowercase
        self.featuretype = params.featuretype

        chardim = params.chardim #dimension of character network layer
        worddim = params.worddim #dimension of character embedding and word LSTM layer

        if not params.nntype == "charagram":
            self.chars = self.get_character_dict(data)
            Ce = lasagne.init.Uniform(range=0.5/len(self.chars))
            Ce_np = Ce.sample((len(self.chars),params.worddim))
            Ce = theano.shared(np.asarray(Ce_np, dtype=config.floatX))

        char = T.imatrix(); charmask = T.matrix()
        word = T.imatrix(); wordmask = T.matrix()

        idxs = T.ivector()
        Y = T.matrix()

        l_in_char = lasagne.layers.InputLayer((None, None))
        if params.nntype == "charlstm":
            l_mask_char = lasagne.layers.InputLayer(shape=(None, None))
            l_emb_char = lasagne.layers.EmbeddingLayer(l_in_char, input_size=Ce.get_value().shape[0],
                                              output_size=Ce.get_value().shape[1], W=Ce)
            l_lstm_char = lasagne.layers.LSTMLayer(l_emb_char, chardim, peepholes=True, learn_init=False,
                                              mask_input=l_mask_char)
            if not params.outgate:
                l_lstm_char = lasagne_lstm_nooutput(l_emb_char, chardim, peepholes=True, learn_init=False,
                                                   mask_input=l_mask_char)
            l_We = lasagne.layers.SliceLayer(l_lstm_char, -1, 1)
            We = lasagne.layers.get_output(l_We, {l_in_char: char, l_mask_char: charmask})
        elif params.nntype == "charagram":
            char = T.matrix()
            self.featuremap = self.get_feature_map(data, params.featuretype, params.cutoff, params.lowercase)
            print "Number of features: ", len(self.featuremap)

            l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap)+1))
            if self.cap:
                l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap)+2))
            l_1 = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
            if params.numlayers == 1:
                l_We = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
            elif params.numlayers == 2:
                l_We = lasagne.layers.DenseLayer(l_1, chardim, nonlinearity=params.act)
            else:
                raise ValueError('Only 1-2 layers are supported currently.')
            We = lasagne.layers.get_output(l_We, {l_in_char:char})
        elif params.nntype == "charcnn":
            l_emb_char = lasagne.layers.EmbeddingLayer(l_in_char, input_size=Ce.get_value().shape[0],
                                              output_size=Ce.get_value().shape[1], W=Ce)
            emb = lasagne.layers.DimshuffleLayer(l_emb_char, (0, 2, 1))
            conv_params = None
            if params.conv_type == 1:
                conv_params = [(175,2),(175,3),(175,4)]
            else:
                conv_params = [(25,1),(50,2),(75,3),(100,4),(125,5),(150,6)]
            layers = []
            for num_filters, filter_size in conv_params:
                conv = lasagne.layers.Conv1DLayer(emb, num_filters, filter_size, nonlinearity=params.act)
                pl = lasagne.layers.GlobalPoolLayer(conv,theano.tensor.max)
                pl = lasagne.layers.FlattenLayer(pl)
                layers.append(pl)
            concat = lasagne.layers.ConcatLayer(layers)
            l_We = lasagne.layers.DenseLayer(concat, num_units=chardim, nonlinearity=params.act)
            We = lasagne.layers.get_output(l_We, {l_in_char: char})
        else:
            l_We = None
            We = None

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word = lasagne_embedding_layer_2(l_in_word, chardim, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True, learn_init=False,
                                              mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True, learn_init=False,
                                              mask_input=l_mask_word, backwards = True)

        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf,(-1,worddim))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb,(-1,worddim))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        l_emb = lasagne.layers.DenseLayer(concat2, num_units=worddim, nonlinearity=lasagne.nonlinearities.tanh)
        l_out = lasagne.layers.DenseLayer(l_emb, num_units=len(self.tags), nonlinearity=lasagne.nonlinearities.softmax)
        embg = lasagne.layers.get_output(l_out, {l_in_word: word, l_mask_word: wordmask})

        embg = embg[idxs]
        prediction = T.argmax(embg, axis=1)

        self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) + lasagne.layers.get_all_params(l_We, trainable=True)
        reg = 0.5*params.LC*sum(lasagne.regularization.l2(x) for x in self.all_params)

        cost = T.nnet.categorical_crossentropy(embg,Y)
        cost = T.mean(cost) + reg

        self.feedforward_function = None
        self.scoring_function = None
        self.cost_function = None
        self.train_function = None

        if params.nntype == "charlstm":
            self.feedforward_function = theano.function([char, charmask, word, wordmask, idxs], embg)
            self.scoring_function = theano.function([char, charmask, word, wordmask, idxs], prediction)
            self.cost_function = theano.function([char, charmask, word, wordmask, idxs, Y], cost)
            grads = theano.gradient.grad(cost, self.all_params)
            updates = lasagne.updates.momentum(grads, self.all_params, 0.2, momentum=0.95) #same as Ling et al.
            self.train_function = theano.function([char, charmask, word, wordmask, idxs, Y], cost, updates=updates)
        elif params.nntype == "charcnn" or params.nntype == "charagram":
            self.feedforward_function = theano.function([char, word, wordmask, idxs], embg)
            self.scoring_function = theano.function([char, word, wordmask, idxs], prediction)
            self.cost_function = theano.function([char, word, wordmask, idxs, Y], cost)
            grads = theano.gradient.grad(cost, self.all_params)
            updates = lasagne.updates.momentum(grads, self.all_params, 0.2, momentum=0.95) #same as Ling et al.
            self.train_function = theano.function([char, word, wordmask, idxs, Y], cost, updates=updates)
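In the "charagram" branch above, the character input is a plain feature matrix rather than character ids, with len(self.featuremap) + 1 columns (plus one more when self.cap is set). The original get_feature_map logic is not shown here, so the helper below is only an assumed illustration of how such a binary character n-gram feature vector might be assembled to match those input sizes.

# Hedged sketch (assumption, not the original get_feature_map code): build a
# binary character n-gram feature vector of size len(featuremap) + 1, with an
# optional extra capitalization bit, matching the input layer sizes above.
import numpy as np

def charagram_features(word, featuremap, lowercase=True, cap=False):
    is_cap = word[:1].isupper()
    if lowercase:
        word = word.lower()
    vec = np.zeros(len(featuremap) + (2 if cap else 1), dtype='float32')
    padded = "#" + word + "#"                     # assumed boundary markers
    for n in (2, 3, 4):                           # assumed n-gram orders
        for i in range(len(padded) - n + 1):
            idx = featuremap.get(padded[i:i + n])
            if idx is not None:
                vec[idx] = 1.0
    vec[len(featuremap)] = 1.0                    # extra always-on feature (assumed)
    if cap:
        vec[len(featuremap) + 1] = float(is_cap)
    return vec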
Example #3

    def __init__(self, We_initial, char_embedd_table_initial, params):

        self.eta = params.eta
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)

        g = T.imatrix()
        gmask = T.fmatrix()
        y = T.ivector()
        idxs = T.ivector()
        length = T.iscalar()
        char_input_var = T.itensor3(name='char-inputs')

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters
        #_, sent_length, _ = incoming2.output_shape

        # dropout before cnn?
        if params.dropout:
            layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5)

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for pooling (pool size should go through all time step of cnn)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                           axis=2)

        if params.dropout:
            l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

        if (params.inf == 0):
            l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                    hidden,
                                                    mask_input=l_mask_word)
            l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                    hidden,
                                                    mask_input=l_mask_word,
                                                    backwards=True)

            l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf,
                                                     (-1, hidden))
            l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb,
                                                     (-1, hidden))
            concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])

        elif (params.inf == 1):
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input,
                                                 hidden,
                                                 1,
                                                 1,
                                                 pad='same')
            l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input,
                                                 hidden,
                                                 3,
                                                 1,
                                                 pad='same')
            l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1)
            #l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad = 'same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            #concat2 = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2)
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden))
        else:
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input,
                                               hidden,
                                               3,
                                               1,
                                               pad='same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden))
            concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden)

        if params.dropout:
            concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

        #l_emb = lasagne.layers.DenseLayer(concat2, num_units=hidden, nonlinearity=lasagne.nonlinearities.tanh)
        l_out = lasagne.layers.DenseLayer(
            concat2,
            num_units=params.num_labels,
            nonlinearity=lasagne.nonlinearities.softmax)

        output = lasagne.layers.get_output(l_out, {
            l_in_word: g,
            l_mask_word: gmask,
            layer_char_input: char_input_var
        })

        output_1 = output[idxs]

        test_output = lasagne.layers.get_output(
            l_out, {
                l_in_word: g,
                l_mask_word: gmask,
                layer_char_input: char_input_var
            },
            deterministic=True)

        test_output_1 = test_output[idxs]

        model_params = lasagne.layers.get_all_params(l_out, trainable=True)
        self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)

        reg = sum(lasagne.regularization.l2(x) for x in model_params)

        cost = lasagne.objectives.categorical_crossentropy(output_1, y)
        cost = T.mean(cost) + params.L2 * reg

        #pred = T.argmax(output_1, axis=1)
        final_pred = T.argmax(test_output_1, axis=1)

        self.acc_function = theano.function(
            [g, char_input_var, gmask, y, idxs, length],
            final_pred,
            on_unused_input='warn')

        #updates = lasagne.updates.adam(cost, model_params, self.eta)
        #from adam import adam
        #updates = adam(cost, model_params, self.eta)
        updates = lasagne.updates.sgd(cost, model_params, self.eta)
        updates = lasagne.updates.apply_momentum(updates,
                                                 model_params,
                                                 momentum=0.9)
        self.train_function = theano.function(
            [g, char_input_var, gmask, y, idxs, length],
            cost,
            updates=updates,
            on_unused_input='warn')
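The character CNN above flattens the (batch, sent_len, Max_Char_Length) input to (batch * sent_length, Max_Char_Length), embeds and dimshuffles it to put the embedding dimension on the channel axis, convolves with pad='full', max-pools over the whole character axis, and reshapes back to (batch, sent_length, num_filters) before concatenating with the word embeddings. A small shape walkthrough, using assumed example sizes, shows what each step produces:

# Shape walkthrough for the char-CNN block above (sizes are illustrative only).
batch, sent_len, max_char_len = 10, 40, 35
char_embedd_dim, num_filters, conv_window = 30, 30, 3

n = batch * sent_len                              # reshape(layer_char_input, (-1, [2]))
emb_shape = (n, char_embedd_dim, max_char_len)    # embedding + dimshuffle(0, 2, 1)
conv_len = max_char_len + conv_window - 1         # Conv1DLayer with pad='full'
cnn_shape = (n, num_filters, conv_len)
pooled_shape = (n, num_filters, 1)                # MaxPool1DLayer with pool_size = conv_len
out_shape = (batch, sent_len, num_filters)        # reshape(pool_layer, (-1, length, [1]))
print(out_shape)                                  # (10, 40, 30)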
Example #4
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        self.num_labels = params.num_labels
        self.de_hidden_size = params.de_hidden_size
        self.en_hidden_size = params.en_hidden_size

        print params.de_hidden_size, hidden, params.num_labels

        self.lstm_layers_num = 1

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        target_var_in = T.imatrix(name='in_targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        length0 = T.iscalar()
        t_t = T.fscalar()
        t_t0 = T.fscalar()

        Wyy0 = np.random.uniform(
            -0.02, 0.02,
            (self.num_labels + 1, self.num_labels + 1)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                512,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                512,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * 512))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=self.num_labels,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):
            #print data[idx].shape
            p.set_value(data[idx])

        self.params = []
        self.hos = []
        self.Cos = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []

        ei, di, dt = T.imatrices(3)  #place holders
        decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(
                self.de_hidden_size + 2 * self.en_hidden_size,
                self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)
        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        input_var_shuffle = input_var.dimshuffle(1, 0)
        mask_var_shuffle = mask_var.dimshuffle(1, 0)
        target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
        target_var_shuffle = target_var.dimshuffle(1, 0)

        self.params += [self.linear, self.linear_bias,
                        self.de_lookuptable]  #concatenate
        state_below = We[input_var_shuffle.flatten()].reshape(
            (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))
        enclstm_f = LSTM(embsize, self.en_hidden_size)
        enclstm_b = LSTM(embsize, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  #append
        self.encoder_lstm_layers.append(enclstm_b)  #append
        self.params += enclstm_f.params + enclstm_b.params  #concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
        hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

        hs = T.concatenate([hs_f, hs_b], axis=2)
        Cs = T.concatenate([Cs_f, Cs_b], axis=2)

        hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),
        self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),

        Encoder = hs

        ei, di, dt = T.imatrices(3)  #place holders
        em, dm, tf, di0 = T.fmatrices(4)
        self.encoder_function = theano.function(inputs=[ei, em],
                                                outputs=Encoder,
                                                givens={
                                                    input_var: ei,
                                                    mask_var: em
                                                })

        state_below = self.de_lookuptable[
            target_var_in_shuffle.flatten()].reshape(
                (target_var_in_shuffle.shape[0],
                 target_var_in_shuffle.shape[1], self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, mask_var_shuffle,
                                              ho, Co)

        decoder_lstm_outputs = T.concatenate([Encoder, state_below], axis=2)

        linear_outputs = T.dot(decoder_lstm_outputs,
                               self.linear) + self.linear_bias[None, None, :]
        softmax_outputs, updates = theano.scan(
            fn=lambda x: T.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * T.log(pred[T.arange(input_var.shape[0]), y])

        def _step2(ctx_, state_, hs_, Cs_):

            #print ctx_.shape, state_.shape, hs_.shape, Cs_.shape

            hs, Cs = [], []
            token_idxs = T.cast(state_.argmax(axis=-1), "int32")
            msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, ctx_.shape[0], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
            state_below0 = state_below0.reshape(
                (ctx_.shape[0], self.de_hidden_size))
            state_below0 = T.concatenate([ctx_, state_below0], axis=1)

            newpred = T.dot(state_below0,
                            self.linear) + self.linear_bias[None, :]
            state_below = T.nnet.softmax(newpred)

            extra_p = T.zeros_like(hs[:, :, 0])
            state_below = T.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        ctx_0, state_0 = T.fmatrices(2)
        hs_0 = T.ftensor3()
        Cs_0 = T.ftensor3()
        state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
        self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                      [state_below_tmp, hs_tmp, Cs_tmp],
                                      name='f_next')

        hs0, Cs0 = T.as_tensor_variable(
            self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=input_var_shuffle.shape[0])

        predy = train_outputs[0].dimshuffle(1, 0, 2)
        predy = predy[:, :, :-1] * mask_var[:, :, None]
        predy0 = predy.reshape((-1, self.num_labels))

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: input_var,
            l_mask_word: mask_var
        })
        local_energy = local_energy.reshape((-1, length, self.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var})

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
        A = A.reshape((-1, length, self.num_labels))

        #predy = predy0.reshape((-1, length, 25))
        #predy = predy*mask_var[:,:,None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the ground-truth energy

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)

        #predy_f =  predy.reshape((-1, 25))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy0 + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)
        """
		f = open('F0_simple.pickle')
                PARA = pickle.load(f)
                f.close()
                l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params))


                cost = T.mean(-cost11) + params.L2*l2_term
		"""

        ##from adam import adam
        ##updates_a = adam(cost, self.params, params.eta)

        #updates_a = lasagne.updates.sgd(cost, self.params, params.eta)
        #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9)

        from momentum import momentum
        updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function(
                inputs=[ei, dt, em, em1, length0, t_t0, di0],
                outputs=[cost, ce_hinge],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore')
        else:

            self.train_fn = theano.function(
                inputs=[ei, dt, em, em1, length0, t_t0, di0],
                outputs=[cost, entropy_term],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore')

        prediction = T.argmax(predy, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, di0],
            outputs=[cost11, cost110, corr_train, num_tokens, prediction],
            on_unused_input='ignore',
            givens={
                input_var: ei,
                target_var: dt,
                mask_var: em,
                mask_var1: em1,
                length: length0,
                decoderInputs0: di0
            })
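The inner_function/theano.scan pair above accumulates the pairwise CRF energy of a (soft) label sequence under the transition matrix Wyy, with Wyy[-1, :-1] acting as the start transitions and the end transitions already folded into local_energy via mask_var1. The numpy sketch below is an illustration of the same cost11 quantity for a batch of soft predictions; it is not the original code.

# Numpy sketch of the sequence energy computed symbolically above (cost11).
import numpy as np

def sequence_energy(predy, local_energy, mask, Wyy):
    # predy: (batch, T, L) soft or one-hot labels; local_energy: (batch, T, L)
    # mask: (batch, T) of 0/1; Wyy: (L+1, L+1), last row = start, last column = end
    energy = predy[:, 0, :].dot(Wyy[-1, :-1])              # start transition term, (batch,)
    prev = predy[:, 0, :]
    for t in range(1, predy.shape[1]):
        pair = prev.dot(Wyy[:-1, :-1])                     # expected transition scores, (batch, L)
        step = np.sum(pair * predy[:, t, :], axis=1)       # (batch,)
        energy = np.where(mask[:, t] > 0, energy + step, energy)
        prev = predy[:, t, :]
    # unary (emission) energies over unmasked positions, as in cost11 above
    energy = energy + np.sum(np.sum(local_energy * predy, axis=2) * mask, axis=1)
    return energy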
Example #5

    def __init__(self, We_initial, char_embedd_table_initial, params):
        #self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)

        embsize = We_initial.shape[1]
        hidden = params.hidden

        input_init = np.random.uniform(-0.1, 0.1,
                                       (10, MAX_lENGTH, 25)).astype('float32')
        self.input_init = theano.shared(input_init)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        t_t = T.fscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)

        char_input_var = T.itensor3()

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for pooling (pool size should go through all time step of cnn)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                         axis=2)

        l_lstm_wordf = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        #print len(network_params)
        f = open(
            'POS_Bilstm_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.002_0.030_emb_1_hidden_100.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):

            p.set_value(data[idx])

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input: char_input_var
            })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        predy_init = self.input_init[:, :length, :]

        a_params = [self.input_init]

        predy = T.nnet.softmax(predy_init.reshape((-1, 25)))
        predy = predy.reshape((-1, length, 25))

        prediction = T.argmax(predy_init, axis=2)

        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        predy_f = predy.reshape((-1, 25))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        #if (params.regutype==0):
        #        ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
        #        ce_hinge = ce_hinge.reshape((-1, length))
        #        ce_hinge = T.sum(ce_hinge* mask_var, axis=1)
        #	cost = T.mean(-cost11) + lamb*T.mean(ce_hinge)
        #else:

        cost = T.mean(-cost11)

        #from adam import adam
        #updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a,
                                                   a_params,
                                                   momentum=0.9)
        #gy = T.grad(cost, predy_init)

        #if (params.regutype==0):
        self.inf_fn = theano.function(
            [input_var, char_input_var, mask_var, mask_var1, length],
            cost,
            updates=updates_a)
        #else:
        #	self.train_fn = theano.function([input_var, char_input_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore')

        #corr = T.eq(prediction, target_var)
        #corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        #num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            [input_var, char_input_var, mask_var, mask_var1, length],
            [prediction, -cost11],
            on_unused_input='ignore')

        if params.WarmStart:
            hidden_inf = params.hidden_inf
            We_inf = theano.shared(We_initial)
            char_embedd_table_inf = theano.shared(char_embedd_table_initial)

            l_in_word_a = lasagne.layers.InputLayer((None, None))
            l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))

            l_emb_word_a = lasagne.layers.EmbeddingLayer(
                l_in_word_a,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We_inf,
                name='inf_word_embedding')

            layer_char_input_a = lasagne.layers.InputLayer(
                shape=(None, None, Max_Char_Length),
                input_var=char_input_var,
                name='char-input')

            layer_char_a = lasagne.layers.reshape(layer_char_input_a,
                                                  (-1, [2]))
            layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
                layer_char_a,
                input_size=char_dic_size,
                output_size=char_embedd_dim,
                W=char_embedd_table_inf,
                name='char_embedding')

            layer_char_a = lasagne.layers.DimshuffleLayer(
                layer_char_embedding_a, pattern=(0, 2, 1))

            # first get some necessary dimensions or parameters
            conv_window = 3
            num_filters = params.num_filters
            #_, sent_length, _ = incoming2.output_shape

            # construct convolution layer
            cnn_layer_a = lasagne.layers.Conv1DLayer(
                layer_char_a,
                num_filters=num_filters,
                filter_size=conv_window,
                pad='full',
                nonlinearity=lasagne.nonlinearities.tanh,
                name='cnn')
            # infer the pool size for pooling (pool size should go through all time step of cnn)
            #_, _, pool_size = cnn_layer.output_shape

            # construct max pool layer
            pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a,
                                                         pool_size=pool_size)
            # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
            output_cnn_layer_a = lasagne.layers.reshape(
                pool_layer_a, (-1, length, [1]))

            # finally, concatenate the two incoming layers together.
            l_emb_word_a = lasagne.layers.concat(
                [output_cnn_layer_a, l_emb_word_a], axis=2)

            l_cnn_input_a = lasagne.layers.DimshuffleLayer(
                l_emb_word_a, (0, 2, 1))
            l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   1,
                                                   1,
                                                   pad='same')
            l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   3,
                                                   1,
                                                   pad='same')
            l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a],
                                                 axis=1)
            concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
            concat2_a = lasagne.layers.ReshapeLayer(concat2_a,
                                                    (-1, 2 * hidden_inf))

            ## output logit scores before the softmax operations, not probability
            l_local_a = lasagne.layers.DenseLayer(
                concat2_a,
                num_units=25,
                nonlinearity=lasagne.nonlinearities.linear)
            predy_inf = lasagne.layers.get_output(
                l_local_a, {
                    l_in_word_a: input_var,
                    l_mask_word_a: mask_var,
                    layer_char_input_a: char_input_var
                })
            predy_inf = predy_inf.reshape((-1, length, 25))

            a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)

            f = open(
                'CRF_Inf_POS_num_filters_30_dropout_1_LearningRate_0.002_1.0_emb_1_inf_1_hidden_inf_300.pickle',
                'r')
            data = pickle.load(f)
            f.close()

            for idx, p in enumerate(a_params):
                p.set_value(data[idx])

            self.start_fn = theano.function(
                [input_var, char_input_var, mask_var, length],
                predy_inf,
                on_unused_input='ignore')
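Both branches above restore parameters by indexing a pickled list in the same order that lasagne.layers.get_all_params returns them (plus any extra shared variables appended afterwards). A minimal sketch of the save side that would produce such a file follows; the function name and path are illustrative, not taken from the original project.

# Hedged sketch of how the pickled parameter lists loaded above are presumably
# produced: dump the shared-variable values in get_all_params order.
import pickle
import lasagne

def save_params(output_layer, extra_params, path):
    params = lasagne.layers.get_all_params(output_layer, trainable=True) + extra_params
    values = [p.get_value() for p in params]
    with open(path, 'wb') as f:
        pickle.dump(values, f)

# save_params(l_local, [Wyy], 'params.pickle')  # illustrative usage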
Example #6
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden
        start0 = np.random.uniform(-0.02, 0.02, (1, 25)).astype('float32')
        end0 = np.random.uniform(-0.02, 0.02, (1, 25)).astype('float32')
        start = theano.shared(start0)
        end = theano.shared(end0)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)
        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))
        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        f = open(
            'LF_LIFU_Simple_CRF_lstm_pretrain.Batchsize_10_dropout_0_LearningRate_0.1_1e-050_emb_0.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):
            p.set_value(data[idx])

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, We)
        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)
        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                   (-1, hidden))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                   (-1, hidden))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)
        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: input_var,
            l_mask_word: mask_var
        })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * mask_var[:, :, None]
        #####################
        # for the end symbol of a sequence
        ####################
        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        predy0 = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: input_var,
            l_mask_word_a: mask_var
        })
        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, 25)
        A = A.reshape((-1, length, 25))
        predy = predy0.reshape((-1, length, 25))
        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]
        masks_shuffled = mask_var.dimshuffle(1, 0)
        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)
        predy_f = predy.reshape((-1, 25))
        y_f = target_var.flatten()

        ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1) / mask_var.sum(axis=1)

        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        """
                label sequence language model score for each sequence
                """
        l_LM_in = lasagne.layers.InputLayer((None, None, 25))
        l_LM_mask = lasagne.layers.InputLayer(shape=(None, None))
        l_LM_lstm = lasagne.layers.LSTMLayer(l_LM_in,
                                             hidden,
                                             mask_input=l_LM_mask)
        l_reshape_LM = lasagne.layers.ReshapeLayer(l_LM_lstm, (-1, hidden))
        l_LM = lasagne.layers.DenseLayer(
            l_reshape_LM,
            num_units=26,
            nonlinearity=lasagne.nonlinearities.softmax)
        LM_params = lasagne.layers.get_all_params(l_LM, trainable=True)
        LM_params.append(start)
        LM_params.append(end)
        f = open('Label_LM.pickle', 'r')
        data = pickle.load(f)
        f.close()
        for idx, p in enumerate(LM_params):
            p.set_value(data[idx])

        predy_tmp = predy[:, 0, :].reshape((-1, 1, 25))
        tmp = T.ones_like(predy_tmp)
        sos = tmp * (start.dimshuffle('x', 0, 1))
        eos = tmp * (end.dimshuffle('x', 0, 1))
        y_lm_in = T.concatenate([sos, predy], axis=1)
        y_lm_out = T.concatenate([predy, eos], axis=1)
        lm_mask_var = T.concatenate([tmp[:, 0, 0].reshape((-1, 1)), mask_var],
                                    axis=1)
        LM_out = lasagne.layers.get_output(l_LM, {
            l_LM_in: y_lm_in,
            l_LM_mask: lm_mask_var
        })
        LM_out = LM_out.reshape((-1, length + 1, 26))
        LM_cost = T.sum(T.log(
            T.sum(LM_out[:, :-1, :-1] * y_lm_out[:, :-1, :], axis=2) + eps) *
                        mask_var,
                        axis=1)
        cost = T.mean(-cost11) - params.l3 * T.mean(
            entropy_term) - params.lm * T.mean(LM_cost)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a,
                                                   a_params,
                                                   momentum=0.9)
        self.train_fn = theano.function(
            [input_var, target_var, mask_var, mask_var1, length],
            [cost, T.mean(entropy_term),
             T.mean(LM_cost)],
            updates=updates_a,
            on_unused_input='ignore')

        prediction = T.argmax(predy, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            [input_var, target_var, mask_var, mask_var1, length],
            [cost11, cost110, corr_train, num_tokens, prediction],
            on_unused_input='ignore')
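The label language-model term above prepends a learned start vector and appends a learned end vector to the soft label sequence before scoring it with an LSTM over 26 symbols (25 labels plus the end symbol). A small numpy sketch of that input/output construction, with shapes assumed from the code, is given below; it is illustrative rather than the original implementation.

# Numpy sketch of the y_lm_in / y_lm_out construction above (illustrative only).
import numpy as np

def label_lm_targets(predy, start_vec, end_vec):
    # predy: (batch, T, L) soft label sequence; start_vec, end_vec: (1, L)
    batch = predy.shape[0]
    sos = np.repeat(start_vec[None, :, :], batch, axis=0)   # (batch, 1, L)
    eos = np.repeat(end_vec[None, :, :], batch, axis=0)     # (batch, 1, L)
    y_lm_in = np.concatenate([sos, predy], axis=1)          # LM input:  <s>, y_1 .. y_T
    y_lm_out = np.concatenate([predy, eos], axis=1)         # LM target: y_1 .. y_T, </s>
    return y_lm_in, y_lm_out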
Example #7

    def __init__(self, We_initial, char_embedd_table_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        We_inf = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        hidden_inf = params.hidden_inf

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        t_t = T.fscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (18, 18)).astype('float32')
        Wyy = theano.shared(Wyy0)

        char_input_var = T.itensor3()

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)
        char_embedd_table_inf = theano.shared(char_embedd_table_initial)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We,
                name='word_embedding')
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                         axis=2)

        l_lstm_wordf = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=17,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):

            p.set_value(data[idx])

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))

        l_emb_word_a = lasagne.layers.EmbeddingLayer(
            l_in_word_a,
            input_size=We_initial.shape[0],
            output_size=embsize,
            W=We_inf,
            name='inf_word_embedding')

        layer_char_input_a = lasagne.layers.InputLayer(
            shape=(None, None, Max_Char_Length),
            input_var=char_input_var,
            name='char-input')

        layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
        layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
            layer_char_a,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table_inf,
            name='char_embedding')

        layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a,
                                                      pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters
        #_, sent_length, _ = incoming2.output_shape

        # dropout before cnn?
        if params.dropout:
            layer_char_a = lasagne.layers.DropoutLayer(layer_char_a, p=0.5)

        # construct convolution layer
        cnn_layer_a = lasagne.layers.Conv1DLayer(
            layer_char_a,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
        #_, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a,
                                                     pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a,
                                                    (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word_a = lasagne.layers.concat(
            [output_cnn_layer_a, l_emb_word_a], axis=2)

        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

        if (params.inf == 0):
            l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a)
            l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a,
                                                      backwards=True)

            l_emb_word_a1 = lasagne.layers.concat(
                [l_lstm_wordf_a, l_lstm_wordb_a], axis=2)

            l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a1,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a)
            l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a1,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a,
                                                      backwards=True)

            l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                       (-1, hidden_inf))
            l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                       (-1, hidden_inf))
            concat2_a = lasagne.layers.ConcatLayer(
                [l_reshapef_a, l_reshapeb_a])

        else:
            l_cnn_input_a = lasagne.layers.DimshuffleLayer(
                l_emb_word_a, (0, 2, 1))
            l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   3,
                                                   1,
                                                   pad='same')
            l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   1,
                                                   1,
                                                   pad='same')
            #l_cnn_a = lasagne.layers.Conv1DLayer(l_cnn_a, hidden, 3, 1, pad = 'same')
            l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a],
                                                 axis=1)
            concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
            concat2_a = lasagne.layers.ReshapeLayer(concat2_a,
                                                    (-1, 2 * hidden_inf))

        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=17,
            nonlinearity=lasagne.nonlinearities.softmax)

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]
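        # inner_function is folded over time by theano.scan below: it accumulates
        # the pairwise transition energy sum_t y_{t-1}^T Wyy[:-1, :-1] y_t, leaving
        # the running energy unchanged at padded positions (mask == 0).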

        local_energy = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input: char_input_var
            })
        local_energy = local_energy.reshape((-1, length, 17))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        predy0 = lasagne.layers.get_output(
            l_local_a, {
                l_in_word_a: input_var,
                l_mask_word_a: mask_var,
                layer_char_input_a: char_input_var
            })

        predy_inf = lasagne.layers.get_output(
            l_local_a, {
                l_in_word_a: input_var,
                l_mask_word_a: mask_var,
                layer_char_input_a: char_input_var
            },
            deterministic=True)
        predy_inf = predy_inf.reshape((-1, length, 17))

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, 17)
        A = A.reshape((-1, length, 17))

        predy = predy0.reshape((-1, length, 17))
        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the ground-truth energy

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)
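        # cost11 is the CRF energy of the soft inference-network output (predy);
        # cost110 is the energy of its hard argmax decoding (A). Only cost11 enters
        # the training objective below.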

        predy_f = predy.reshape((-1, 17))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (np.e)**(-0.01 * t_t)
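        # t_t is the training-step counter passed into train_fn; with annealing
        # enabled, the regularizer weight decays exponentially over training.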

        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy_f + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

        ###from adam import adam
        ###updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a,
                                                   a_params,
                                                   momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function([
                input_var, char_input_var, target_var, mask_var, mask_var1,
                length, t_t
            ], [cost, ce_hinge],
                                            updates=updates_a,
                                            on_unused_input='ignore')
        else:
            self.train_fn = theano.function([
                input_var, char_input_var, target_var, mask_var, mask_var1,
                length, t_t
            ], [cost, entropy_term],
                                            updates=updates_a,
                                            on_unused_input='ignore')

        prediction = T.argmax(predy_inf, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function([
            input_var, char_input_var, target_var, mask_var, mask_var1, length
        ], [corr_train, num_tokens, prediction],
                                       on_unused_input='ignore')
	def __init__(self,  We_initial, char_embedd_table_initial, params):

		We = theano.shared(We_initial)
 
                # initial embedding for the InfNet
                We_inf = theano.shared(We_initial)
        	embsize = We_initial.shape[1]
        	hidden = params.hidden
		self.en_hidden_size = params.hidden_inf
		self.num_labels = 17
		self.de_hidden_size = params.de_hidden_size
		

                char_embedd_dim = params.char_embedd_dim
                char_dic_size = len(params.char_dic)
                char_embedd_table = theano.shared(char_embedd_table_initial)
                char_embedd_table_inf = theano.shared(char_embedd_table_initial)


		input_var = T.imatrix(name='inputs')
        	target_var = T.imatrix(name='targets')
		target_var_in = T.imatrix(name='targets')
        	mask_var = T.fmatrix(name='masks')
		mask_var1 = T.fmatrix(name='masks1')
                char_input_var = T.itensor3(name='char-inputs')

		length = T.iscalar()
		length0 = T.iscalar()
		t_t = T.fscalar()
		t_t0 = T.fscalar()		

                use_dropout = T.fscalar()
                use_dropout0 = T.fscalar()

		Wyy0 = np.random.uniform(-0.02, 0.02, (self.num_labels +1 , self.num_labels + 1)).astype('float32')
                Wyy = theano.shared(Wyy0)


                l_in_word = lasagne.layers.InputLayer((None, None))
                l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

		if params.emb ==1:
                        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,  input_size= We_initial.shape[0] , output_size = embsize, W =We, name='word_embedding')
                else:
                        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

                layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')

                layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
                layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')

                layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))


                # first get some necessary dimensions or parameters
                conv_window = 3
                num_filters = params.num_filters

                # construct convolution layer
                cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
                # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
                _, _, pool_size = cnn_layer.output_shape

                # construct max pool layer
                pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
                # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
                output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

                # finally, concatenate the two incoming layers together.
                incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

           

		l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
        	l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards = True)

        	concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
		
		l_reshape_concat = lasagne.layers.ReshapeLayer(concat,(-1,2*hidden))

		l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units= self.num_labels + 1, nonlinearity=lasagne.nonlinearities.linear)

		
		network_params = lasagne.layers.get_all_params(l_local, trainable=True)
                network_params.append(Wyy)

		
		print len(network_params)
		f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle','r')
		data = pickle.load(f)
		f.close()

		for idx, p in enumerate(network_params):

                        p.set_value(data[idx])


		self.params = []
		self.hos = []
                self.Cos = []
		self.encoder_lstm_layers = []
                self.decoder_lstm_layers = []
		self.lstm_layers_num = 1		

		ei, di, dt = T.imatrices(3)    #place holders
                decoderInputs0 ,em, em1, dm, tf, di0 =T.fmatrices(6)
		ci = T.itensor3()

                #### the last one is for the start symbol
                self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform(self.num_labels +1, self.de_hidden_size), borrow=True)

                self.linear = theano.shared(name="Linear", value = init_xavier_uniform(self.de_hidden_size + 2*self.en_hidden_size, self.num_labels), borrow= True)
		self.linear_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, )*0., dtype=theano.config.floatX), borrow=True)
                #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True)
		
                #self.hidden_bias = theano.shared(
                #        name="Hidden to Bias",
                #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
                #        borrow=True
                #        )

       

		input_var_shuffle = input_var.dimshuffle(1, 0)
		mask_var_shuffle = mask_var.dimshuffle(1, 0)
		target_var_in_shuffle = target_var_in.dimshuffle(1,0)
		target_var_shuffle = target_var.dimshuffle(1,0)


		self.params += [We_inf, self.linear, self.de_lookuptable, self.linear_bias] 
                
                ######[batch, sent_length, embsize] 
		state_below = We_inf[input_var_shuffle.flatten()].reshape((input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))
                
                ###### character word embedding
                layer_char_input_inf = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')
                layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2]))
                layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(layer_char_inf, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table_inf,
                                                             name='char_embedding_inf')

                layer_char_inf = lasagne.layers.DimshuffleLayer(layer_char_embedding_inf, pattern=(0, 2, 1))
                #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

                cnn_layer_inf = lasagne.layers.Conv1DLayer(layer_char_inf, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn_inf')
               
                pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size)
                output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1]))
                char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True)
                self.params += char_params          
 
                ###### [batch, sent_length, num_filters]
                #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf:char_input_var})
                char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)

       
                char_state_below = dropout_layer(char_state_below, use_dropout, trng)
                
                char_state_shuff = char_state_below.dimshuffle(1,0, 2) 
                state_below = T.concatenate([state_below, char_state_shuff], axis=2)
                
                state_below = dropout_layer(state_below, use_dropout, trng)

		enclstm_f = LSTM(embsize+num_filters, self.en_hidden_size)
                enclstm_b = LSTM(embsize+num_filters, self.en_hidden_size, True)
                self.encoder_lstm_layers.append(enclstm_f)    #append
                self.encoder_lstm_layers.append(enclstm_b)    #append
                self.params += enclstm_f.params + enclstm_b.params   #concatenate

                hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
                hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

                hs = T.concatenate([hs_f, hs_b], axis=2)
                Cs = T.concatenate([Cs_f, Cs_b], axis=2)

		hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
                Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
		#self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
                #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
                self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
                self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
		
		Encoder = hs
                	
		state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape((target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size))

		for i in range(self.lstm_layers_num):
                        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
                        self.decoder_lstm_layers += declstm,    #append
                        self.params += declstm.params    #concatenate
                        ho, Co = self.hos[i], self.Cos[i]
                        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)		
		

		decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
		linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
                softmax_outputs, updates = theano.scan(
                        fn=lambda x: T.nnet.softmax(x),
                        sequences=[linear_outputs],
                        )

		def _NLL(pred, y, m):
                        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

		"""
		costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle])
                #loss = costs.sum() / mask_var.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params)
		loss = costs.sum() / mask_var.sum()		

                updates = lasagne.updates.sgd(loss, self.params, self.eta)
                updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

		###################################################
                #### using the ground truth when training
                ##################################################
                self._train = theano.function(
                        inputs=[ei, em, di, dm, dt],
                        outputs=[loss, softmax_outputs],
                        updates=updates,
                        givens={input_var:ei, mask_var:em, target_var_in:di, decoderMask:dm, target_var:dt}
                        )
		"""
	

		def _step2(ctx_, state_, hs_, Cs_):

                        hs, Cs = [], []
                        token_idxs = T.cast(state_.argmax(axis=-1), "int32" )
                        msk_ = T.fill( (T.zeros_like(token_idxs, dtype="float32")), 1.)
                        msk_ = msk_.dimshuffle('x', 0)
                        state_below0 = self.de_lookuptable[token_idxs].reshape((1, ctx_.shape[0], self.de_hidden_size))
                        for i, lstm in enumerate(self.decoder_lstm_layers):
                                h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])    #mind msk
                                hs += h[-1],
                                Cs += C[-1],
                                state_below0 = h

                        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
			state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
                        state_below0 = T.concatenate([ctx_, state_below0], axis =1)			

                        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
                        state_below = T.nnet.softmax(newpred)
                        ##### the begin-symbol probability is 0
                        extra_p = T.zeros_like(hs[:,:,0])
                        state_below = T.concatenate([state_below, extra_p.T], axis=1)


                        return state_below, hs, Cs
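                # _step2 is scanned over the encoder states: each step feeds the
                # argmax of the previous softmax back through the decoder LSTM(s),
                # concatenates the result with the encoder context, and appends a
                # zero column so the start symbol (last slot) is never predicted.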


		hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")

                train_outputs, _ = theano.scan(
                        fn=_step2,
			sequences = [Encoder],
                        outputs_info=[decoderInputs0, hs0, Cs0],
                        n_steps=input_var_shuffle.shape[0]
                        )

                predy = train_outputs[0].dimshuffle(1, 0 , 2)
		predy = predy[:,:,:-1]*mask_var[:,:,None]
		predy0 = predy.reshape((-1, 17))
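                # The decoder emits 18 scores per step (17 labels plus the appended
                # start-symbol column); the last column is dropped and padded
                # positions are zeroed before the CRF energy is computed.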
          
 

	
		def inner_function( targets_one_step, mask_one_step,  prev_label, tg_energy):
                        """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """                 
                        new_ta_energy = T.dot(prev_label, Wyy[:-1,:-1])
                        new_ta_energy_t = tg_energy + T.sum(new_ta_energy*targets_one_step, axis =1)
			tg_energy_t = T.switch(mask_one_step, new_ta_energy_t,  tg_energy)

                        return [targets_one_step, tg_energy_t]


		local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input:char_input_var})
		local_energy = local_energy.reshape((-1, length, 17))
                local_energy = local_energy*mask_var[:,:,None]		

		#####################
                # for the end symbol of a sequence
		####################

		end_term = Wyy[:-1,-1]
                local_energy = local_energy + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None]


		#predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var})

		predy_in = T.argmax(predy0, axis=1)
                A = T.extra_ops.to_one_hot(predy_in, 17)
                A = A.reshape((-1, length, 17))		

		#predy = predy0.reshape((-1, length, 25))
		#predy = predy*mask_var[:,:,None]

		
		targets_shuffled = predy.dimshuffle(1, 0, 2)
                target_time0 = targets_shuffled[0]
		
		masks_shuffled = mask_var.dimshuffle(1, 0)		 

                initial_energy0 = T.dot(target_time0, Wyy[-1,:-1])


                initials = [target_time0, initial_energy0]
                [ _, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
                cost11 = target_energies[-1] + T.sum(T.sum(local_energy*predy, axis=2)*mask_var, axis=1)

		
                cost = T.mean(-cost11)		
  
				
		from momentum import momentum
                updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

                self.train_fn = theano.function(
                                inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
                                outputs=[cost],
                                updates=updates_a,
                                on_unused_input='ignore',
                                givens={input_var:ei, char_input_var:ci, mask_var:em, mask_var1:em1, length: length0, decoderInputs0:di0, use_dropout:use_dropout0}
                                )


	
		
		prediction = T.argmax(predy, axis=2)
		corr = T.eq(prediction, target_var)
        	corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        	num_tokens = mask_var.sum(dtype=theano.config.floatX)

		self.eval_fn = theano.function(
                                inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
                                outputs=[prediction, -cost11],
				on_unused_input='ignore',
                                givens={input_var:ei, char_input_var:ci, mask_var:em, mask_var1:em1, length: length0, decoderInputs0:di0, use_dropout:use_dropout0}
                                )        	
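Several of these examples score a soft label sequence with the same recurrence: a start transition taken from the extra row of Wyy, accumulated pairwise transitions, and the unary local energies, with padded steps masked out. The following standalone NumPy sketch mirrors that computation for a single sequence; the function and variable names are illustrative only, and the end-transition column, which the code above folds into local_energy via mask_var1, is not repeated here.

import numpy as np

def sequence_energy(y, local_energy, Wyy, mask):
    # y: [T, L] soft one-hot label vectors, local_energy: [T, L] unary scores,
    # Wyy: [L+1, L+1] transitions (last row = start transitions), mask: [T] 0/1.
    energy = y[0].dot(Wyy[-1, :-1])                    # start -> first label
    prev = y[0]
    for t in range(1, len(y)):
        pair = prev.dot(Wyy[:-1, :-1]).dot(y[t])       # label_{t-1} -> label_t
        energy += pair * mask[t]                       # padded steps contribute 0
        prev = y[t]
    energy += np.sum(np.sum(local_energy * y, axis=1) * mask)  # unary terms
    return energy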
Ejemplo n.º 9
0
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        l_local = lasagne.layers.DenseLayer(
            concat2,
            num_units=25,
            b=None,
            nonlinearity=lasagne.nonlinearities.linear)
        ### the above is for the unary term energy
        """
                if params.emb ==1:		
                        f = open('F.pickle')
                else:
                        f = open('F0_new.pickle')

                para = pickle.load(f)
                f.close()
                """
        f_params = lasagne.layers.get_all_params(l_local, trainable=True)
        """
                for idx, p in enumerate(f_params):
                        p.set_value(para[idx])
		"""
        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)
        d_params = lasagne.layers.get_all_params(l_local, trainable=True)
        d_params.append(Wyy)
        self.d_params = d_params

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize,
                                                 l_emb_word.W)
        #l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a,  input_size=We_initial.shape[0] , output_size = embsize, W =We)
        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                   (-1, hidden))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                   (-1, hidden))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)

        #a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        #self.a_params = a_params
        """		
                if params.emb ==1:	
                        f = open('F.pickle')
                else:
                        f = open('F0_new.pickle')
                PARA = pickle.load(f)
                f.close()
               
                for idx, p in enumerate(a_params):
                        p.set_value(PARA[idx])		
	        """
        l_local_a_inf = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)

        y_in = T.ftensor3()
        y = T.imatrix()
        g = T.imatrix()
        gmask = T.fmatrix()
        y_mask = T.fmatrix()
        length = T.iscalar()

        predy0 = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        })
        predy = predy0.reshape((-1, length, 25))

        predy0_inf = lasagne.layers.get_output(l_local_a_inf, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        })
        predy_inf = predy0_inf.reshape((-1, length, 25))

        #predy = predy * gmask[:,:,None]
        #newpredy = T.concatenate([predy, y0] , axis=2)
        # n , L, 46, 46
        # predy0: n, L, 25

        # energy loss
        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
            return [targets_one_step, tg_energy_t]

        # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: g,
            l_mask_word: gmask
        })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * gmask[:, :, None]
        targets_shuffled = y_in.dimshuffle(1, 0, 2)
        masks_shuffled = gmask.dimshuffle(1, 0)
        # initials should be energies_shuffles[0, :, -1, :]

        target_time0 = targets_shuffled[0]
        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
        length_index = T.sum(gmask, axis=1) - 1
        length_index = T.cast(length_index, 'int32')
        """for ground-truth energy"""
        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]
        pos_cost = target_energies[-1] + T.sum(
            T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(
                pos_end_target, Wyy[:-1, -1])
        check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)
        """for cost-augmented InfNet"""
        negtargets_shuffled = predy.dimshuffle(1, 0, 2)
        negtarget_time0 = negtargets_shuffled[0]
        neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])
        neginitials = [negtarget_time0, neginitial_energy0]
        [_, negtarget_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials,
            sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])
        neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
        neg_cost = negtarget_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(
                neg_end_target, Wyy[:-1, -1])
        """for InfNet"""
        negtargets_inf_shuffled = predy_inf.dimshuffle(1, 0, 2)
        negtarget_inf_time0 = negtargets_inf_shuffled[0]
        neginitial_inf_energy0 = T.dot(negtarget_inf_time0, Wyy[-1, :-1])
        neginitials_inf = [negtarget_inf_time0, neginitial_inf_energy0]
        [_, negtarget_inf_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials_inf,
            sequences=[negtargets_inf_shuffled[1:], masks_shuffled[1:]])
        neg_inf_end_target = predy_inf[T.arange(length_index.shape[0]),
                                       length_index]
        neg_inf_cost = negtarget_inf_energies[-1] + T.sum(
            T.sum(local_energy * predy_inf, axis=2) * gmask, axis=1) + T.dot(
                neg_inf_end_target, Wyy[:-1, -1])

        y_f = y.flatten()
        predy_f = predy.reshape((-1, 25))

        ce_hinge = lasagne.objectives.categorical_crossentropy(
            predy_f + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * gmask, axis=1)

        predy_inf_f = predy_inf.reshape((-1, 25))

        ce_hinge_inf = lasagne.objectives.categorical_crossentropy(
            predy_inf_f + eps, y_f)
        ce_hinge_inf = ce_hinge_inf.reshape((-1, length))
        ce_hinge_inf = T.sum(ce_hinge_inf * gmask, axis=1)

        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * gmask, axis=1)

        delta0 = T.sum(abs((y_in - predy)), axis=2) * gmask
        delta0 = T.sum(delta0, axis=1)

        hinge_cost_inf = neg_inf_cost - pos_cost

        if (params.margin_type == 1):
            hinge_cost0 = 1 + neg_cost - pos_cost
        elif (params.margin_type == 2):
            hinge_cost0 = neg_cost - pos_cost
        elif (params.margin_type == 0):
            hinge_cost0 = delta0 + neg_cost - pos_cost
        elif (params.margin_type == 3):
            hinge_cost0 = delta0 * (1.0 + neg_cost - pos_cost)

        #g_cost =  T.mean(T.maximum(-hinge_cost0, 0.0))
        if (params.regu_type == 0):
            g_cost = T.mean(-hinge_cost0) + 10 * T.mean(
                -hinge_cost_inf) + T.mean(ce_hinge) + T.mean(ce_hinge_inf)
        else:
            g_cost = T.mean(-hinge_cost0) + 10 * T.mean(-hinge_cost_inf)
        #g_cost_later = T.mean(-hinge_cost0)

        d_cost = T.mean(T.maximum(hinge_cost0, 0.0)) + params.Lambda * T.mean(
            T.maximum(hinge_cost_inf, 0.0))

        #hinge_cost = hinge_cost0 * T.gt(hinge_cost0, 0)
        #d_cost = T.sum(hinge_cost)
        #d_cost0 = d_cost
        ###l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params))

        #hinge_cost_g = hinge_cost0 * T.lt(hinge_cost0, 0)
        #d_cost0_g = T.mean(hinge_cost_g)
        """select different regulizer"""
        ###g_cost = -d_cost0 + params.l2* sum(lasagne.regularization.l2(x) for x in a_params) + params.l3*T.mean(ce_hinge)
        #g_cost = -d_cost0_g

        #g_cost_final = -T.mean(hinge_cost_g) + params.l2* sum(lasagne.regularization.l2(x) for x in a_params)

        #d_cost = d_cost

        #g_cost = -T.mean(hinge_cost_g)
        #d_cost = T.mean(hinge_cost0)

        a_params = lasagne.layers.get_all_params([l_local_a, l_local_a_inf],
                                                 trainable=True)
        updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
        updates_g = lasagne.updates.apply_momentum(updates_g,
                                                   a_params,
                                                   momentum=0.9)
        #updates_g = lasagne.updates.adam(g_cost, a_params, 0.001)
        #updates_g_later = lasagne.updates.adam(g_cost_later, a_params, 0.0006)

        self.train_g = theano.function(
            [g, gmask, y, y_in, length],
            [g_cost, d_cost, pos_cost, neg_cost, delta0, check],
            updates=updates_g,
            on_unused_input='ignore')
        #self.train_g_later = theano.function([g, gmask, y, y_in, length], [g_cost, d_cost, pos_cost, neg_cost, delta0, check], updates=updates_g_later, on_unused_input='ignore')

        #updates_d = lasagne.updates.sgd(d_cost, d_params, params.eta)
        #updates_d = lasagne.updates.apply_momentum(updates_d, d_params, momentum=0.9)

        updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
        self.train_d = theano.function(
            [g, gmask, y, y_in, length],
            [d_cost, g_cost, pos_cost, neg_cost, delta0, check],
            updates=updates_d,
            on_unused_input='ignore')
        """build the function for the test time inference"""
        pred = T.argmax(predy_inf, axis=2)
        pg = T.eq(pred, y)
        pg = pg * gmask
        acc_inf = 1.0 * T.sum(pg) / T.sum(gmask)

        pred = T.argmax(predy, axis=2)
        pg = T.eq(pred, y)
        pg = pg * gmask
        acc_cost = 1.0 * T.sum(pg) / T.sum(gmask)

        self.test_time = theano.function([g, gmask, y, length],
                                         [acc_inf, acc_cost])
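The example above trains the energy parameters (d_params) and the inference networks (a_params) against each other with hinge losses. Below is a minimal NumPy sketch of the margin_type == 0 objective for one batch, assuming pos_energy, neg_energy and delta have already been computed, and ignoring the second (test-time) inference network and the cross-entropy terms; all names here are illustrative only.

import numpy as np

def margin_losses(pos_energy, neg_energy, delta):
    # pos_energy: CRF energy of the gold sequences, neg_energy: energy of the
    # cost-augmented inference-network outputs, delta: per-sequence Hamming-style
    # distance between prediction and gold (all 1-D arrays of batch size).
    hinge = delta + neg_energy - pos_energy        # margin_type == 0 above
    d_cost = np.maximum(hinge, 0.0).mean()         # minimized w.r.t. the energy
    g_cost = (-hinge).mean()                       # minimized w.r.t. the inference net
    return d_cost, g_cost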
Ejemplo n.º 10
0
	def __init__(self,  We_initial, char_embedd_table_initial, params):
		self.textfile = open(params.outfile, 'w')
		We = theano.shared(We_initial)
        	embsize = We_initial.shape[1]
        	hidden = params.hidden


                char_embedd_dim = params.char_embedd_dim
                char_dic_size = len(params.char_dic)
                char_embedd_table = theano.shared(char_embedd_table_initial)

		trans = np.random.uniform(-0.01, 0.01, (18, 18)).astype('float32')
		transition = theano.shared(trans)


		input_var = T.imatrix(name='inputs')
        	target_var = T.imatrix(name='targets')
        	mask_var = T.fmatrix(name='masks')
		mask_var1 = T.fmatrix(name='masks1')
		length = T.iscalar()
                char_input_var = T.itensor3(name='char-inputs')

                l_in_word = lasagne.layers.InputLayer((None, None))
                l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

		if params.emb ==1:
                        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,  input_size= We_initial.shape[0] , output_size = embsize, W =We, name='word_embedding')
                else:
                        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)


                layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')

                layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
                layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')

                layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))


                # first get some necessary dimensions or parameters
                conv_window = 3
                num_filters = params.num_filters
                #_, sent_length, _ = incoming2.output_shape

                # dropout before cnn?
                if params.dropout:
                     layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5)

                # construct convolution layer
                cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
                # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
                _, _, pool_size = cnn_layer.output_shape
               
                # construct max pool layer
                pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
                # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
                output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

                # finally, concatenate the two incoming layers together.
                incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)
                if params.dropout:
                     incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)


		l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5.)
        	l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5., backwards = True)

        	concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
	
	        if params.dropout:
                     concat = lasagne.layers.DropoutLayer(concat, p=0.5)
            
		l_reshape_concat = lasagne.layers.ReshapeLayer(concat,(-1,2*hidden))

		l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units= 17, nonlinearity=lasagne.nonlinearities.linear)


        	#bi_lstm_crf = CRFLayer(concat, params.num_labels, mask_input= l_mask_word)


		local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input:char_input_var})
		local_energy = local_energy.reshape((-1, length, 17))
                local_energy = local_energy*mask_var[:,:,None]		

		end_term = transition[:-1,-1]
                local_energy = local_energy + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None]
		
                local_energy_eval = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input:char_input_var}, deterministic=True)
                local_energy_eval = local_energy_eval.reshape((-1, length, 17))
                local_energy_eval = local_energy_eval*mask_var[:,:,None]
                local_energy_eval = local_energy_eval + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None]


        	#energies_train = lasagne.layers.get_output(bi_lstm_crf, {l_in_word: input_var, l_mask_word: mask_var})

        	loss_train = crf_loss0(local_energy,  transition, target_var, mask_var).mean()

        	prediction, corr = crf_accuracy0(local_energy_eval, transition, target_var, mask_var)


		##loss_train = crf_loss(energies_train, target_var, mask_var).mean()

                ##prediction, corr = crf_accuracy(energies_train, target_var)


        	corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        	num_tokens = mask_var.sum(dtype=theano.config.floatX)



        	network_params = lasagne.layers.get_all_params(l_local, trainable=True)
		network_params.append(transition)

        	print network_params
		self.network_params = network_params

		loss_train = loss_train + params.L2*sum(lasagne.regularization.l2(x) for x in network_params)

                #updates = lasagne.updates.adam(loss_train, network_params, params.eta)
        	updates = lasagne.updates.sgd(loss_train, network_params, params.eta)
                updates = lasagne.updates.apply_momentum(updates, network_params, momentum=0.9)

        	self.train_fn = theano.function([input_var, char_input_var, target_var, mask_var, mask_var1, length], loss_train, updates=updates, on_unused_input='ignore')

        	self.eval_fn = theano.function([input_var, char_input_var, target_var, mask_var, mask_var1, length], [corr_train, num_tokens, prediction], on_unused_input='ignore')
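crf_loss0 and crf_accuracy0 are defined elsewhere in the repository and are not shown here. As a rough illustration only, a linear-chain CRF negative log-likelihood with a start row in the transition matrix can be computed for a single sequence as below; this is a hedged sketch with illustrative names, and the repository's implementation may differ (note that the end-transition column is already added into local_energy at the last real token via mask_var1 before crf_loss0 is called).

import numpy as np

def logsumexp(x, axis=None):
    m = np.max(x, axis=axis, keepdims=True)
    return (m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))).squeeze(axis)

def crf_nll(local_energy, transition, targets, mask):
    # local_energy: [T, L] unary scores, transition: [L+1, L+1] with the last row
    # holding start transitions, targets: [T] int labels, mask: [T] 0/1 flags.
    n = int(mask.sum())
    # score of the gold label path
    gold = transition[-1, targets[0]] + local_energy[0, targets[0]]
    for t in range(1, n):
        gold += transition[targets[t - 1], targets[t]] + local_energy[t, targets[t]]
    # log partition function via the forward recursion
    alpha = transition[-1, :-1] + local_energy[0]
    for t in range(1, n):
        alpha = local_energy[t] + logsumexp(alpha[:, None] + transition[:-1, :-1], axis=0)
    return logsumexp(alpha) - gold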
Ejemplo n.º 11
0
    def __init__(self, We_initial, params):
        self.eta = params.eta
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden
        g = T.imatrix()
        gmask = T.fmatrix()
        y = T.ivector()
        idxs = T.ivector()

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)
        if params.dropout:
            l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)
        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        if params.dropout:
            concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

        l_out = lasagne.layers.DenseLayer(
            concat2, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)
        output = lasagne.layers.get_output(l_out, {
            l_in_word: g,
            l_mask_word: gmask
        })
        output_1 = output[idxs]
        test_output = lasagne.layers.get_output(l_out, {
            l_in_word: g,
            l_mask_word: gmask
        },
                                                deterministic=True)
        test_output_1 = test_output[idxs]
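        # idxs is assumed to hold the flattened positions of the real (unpadded)
        # tokens, so the loss and accuracy below are computed only over those rows.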
        model_params = lasagne.layers.get_all_params(l_out, trainable=True)
        self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)
        reg = sum(lasagne.regularization.l2(x) for x in model_params)
        cost = lasagne.objectives.categorical_crossentropy(output_1, y)
        cost = T.mean(cost) + params.L2 * reg

        final_pred = T.argmax(test_output_1, axis=1)
        y1 = T.ones_like(y)
        SUM = T.sum(y1)
        acc = 1.0 * T.sum(T.eq(final_pred, y)) / SUM
        self.acc_function = theano.function([g, gmask, y, idxs],
                                            acc,
                                            on_unused_input='warn')
        #updates = lasagne.updates.adam(cost, model_params, self.eta)
        updates = lasagne.updates.sgd(cost, model_params, self.eta)
        updates = lasagne.updates.apply_momentum(updates,
                                                 model_params,
                                                 momentum=0.9)
        self.train_function = theano.function([g, gmask, y, idxs], [cost, acc],
                                              updates=updates,
                                              on_unused_input='warn')
Ejemplo n.º 12
0
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden
        hidden_inf = params.hidden_inf

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        t_t = T.fscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'POS_CRF_lstm_pretrain.Batchsize_10_dropout_0_LearningRate_0.1_1e-050_emb_0.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):

            p.set_value(data[idx])

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))

        ##if params.small:

        ##	We_small_init = np.random.uniform(-0.1, 0.1, (We_initial.shape[0], hidden_inf)).astype('float32')
        ##	We_small = theano.shared(We_small_init)
        ##	l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a,  input_size= We_small_init.shape[0] , output_size = hidden_inf, W =We_small)
        ##else:
        l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize,
                                                 l_emb_word.W)

        if (params.inf == 0):
            l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a)
            l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a,
                                                      backwards=True)

            l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                       (-1, hidden_inf))
            l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                       (-1, hidden_inf))
            concat2_a = lasagne.layers.ConcatLayer(
                [l_reshapef_a, l_reshapeb_a])
        else:
            l_cnn_input_a = lasagne.layers.DimshuffleLayer(
                l_emb_word_a, (0, 2, 1))
            l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   1,
                                                   1,
                                                   pad='same')
            l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   3,
                                                   1,
                                                   pad='same')
            l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a],
                                                 axis=1)
            #l_cnn_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same')
            concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
            #concat2_a = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2)
            concat2_a = lasagne.layers.ReshapeLayer(concat2_a,
                                                    (-1, 2 * hidden_inf))

        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: input_var,
            l_mask_word: mask_var
        })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        predy0 = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: input_var,
            l_mask_word_a: mask_var
        })

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, 25)
        A = A.reshape((-1, length, 25))

        predy = predy0.reshape((-1, length, 25))
        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

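        # Run the CRF recurrence over time steps: cost11 is the total energy of the
        # relaxed (soft) predictions, i.e. the transition terms accumulated by scan
        # plus the masked local terms (which already include the end-of-sequence term).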
        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the energy of the hard (argmax one-hot) predictions

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)

        predy_f = predy.reshape((-1, 25))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

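        # The inference network is trained to maximize the energy (minimize -cost11),
        # plus a regularizer weighted by lamb (optionally annealed with t_t):
        # regutype 0 adds a token-level cross-entropy against the gold tags, any other
        # value subtracts an entropy bonus on the predicted distributions.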
        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy_f + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)
        """
		f = open('F0_simple.pickle')
                PARA = pickle.load(f)
                f.close()
                l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params))


                cost = T.mean(-cost11) + params.L2*l2_term
		"""

        #from adam import adam
        #updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a,
                                                   a_params,
                                                   momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function(
                [input_var, target_var, mask_var, mask_var1, length, t_t],
                [cost, ce_hinge],
                updates=updates_a,
                on_unused_input='ignore')
        else:
            self.train_fn = theano.function(
                [input_var, target_var, mask_var, mask_var1, length, t_t],
                [cost, entropy_term],
                updates=updates_a,
                on_unused_input='ignore')

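        # Evaluation: token-level accuracy of the inference network's argmax tags,
        # returned together with the soft energy (cost11) and the energy of the hard
        # argmax labels (cost110).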
        prediction = T.argmax(predy, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            [input_var, target_var, mask_var, mask_var1, length],
            [cost11, cost110, corr_train, num_tokens, prediction],
            on_unused_input='ignore')
Ejemplo n.º 13
0
    def __init__(self, We_initial, params):

        self.eta = params.eta
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        g = T.imatrix()
        gmask = T.fmatrix()
        y = T.ivector()
        idxs = T.ivector()


        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)
        #l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
        #l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        if params.dropout:
            l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)
	
	
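        # params.inf selects the feature extractor: 0 = a BiLSTM over the word
        # embeddings, 1 = a CNN with window sizes 1 and 3 concatenated, anything
        # else = a single width-3 CNN followed by a dense layer.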
        if (params.inf == 0):
            l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
            l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)

            l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
            l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
            concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        elif (params.inf == 1):
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same')
            l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
            l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1)
            #l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            #concat2 = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis=2)
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden))
        else:
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden))
            concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden)

        if params.dropout:
            concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

        #l_emb = lasagne.layers.DenseLayer(concat2, num_units=hidden, nonlinearity=lasagne.nonlinearities.tanh)
        l_out = lasagne.layers.DenseLayer(concat2, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.softmax)


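        # get_output without deterministic=True keeps dropout active for the training
        # cost; the deterministic pass below is used for prediction. idxs is assumed to
        # index the rows of the flattened (batch * length) output that correspond to
        # real (non-padding) tokens.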
        output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask})
        output_1 = output[idxs]

        test_output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask}, deterministic=True)
        test_output_1 = test_output[idxs]

        model_params = lasagne.layers.get_all_params(l_out, trainable=True)
        self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)

        reg = sum(lasagne.regularization.l2(x) for x in model_params)

        cost = lasagne.objectives.categorical_crossentropy(output_1, y)
        cost = T.mean(cost) + params.L2 * reg

        #pred = T.argmax(output_1, axis=1)
        final_pred = T.argmax(test_output_1, axis=1)

        y1 = T.ones_like(y)
        SUM = T.sum(y1)
        acc = T.sum(T.eq(final_pred, y))

        ###acc = 1.0 * T.sum(T.eq(final_pred, y))/SUM

        self.acc_function = theano.function([g, gmask, y, idxs], [acc, SUM], on_unused_input='warn')

        ##from adam import adam
        ##updates = adam(cost, model_params, self.eta)

        #updates = lasagne.updates.adam(cost, model_params, self.eta)
        updates = lasagne.updates.sgd(cost, model_params, self.eta)
        updates = lasagne.updates.apply_momentum(updates, model_params, momentum=0.9)
        self.train_function = theano.function([g, gmask, y, idxs], [cost, acc], updates=updates, on_unused_input='warn')
    def __init__(self, We_initial, params):
        #self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)

        embsize = We_initial.shape[1]
        hidden = params.hidden

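        # input_init holds the relaxed (real-valued) label scores that are optimized
        # directly at test time (gradient-based inference); its shape assumes a batch
        # size of 10 and a module-level constant MAX_lENGTH giving the maximum
        # sentence length.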
        input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, params.num_labels)).astype('float32')
        self.input_init = theano.shared(input_init)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (params.num_labels + 1, params.num_labels)).astype('float32')
        Wyy = theano.shared(Wyy0)
 

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

		
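        # Load the parameters of a pretrained BiLSTM-CRF tagger (including the
        # transition matrix Wyy appended above); only the relaxed labels in
        # input_init are updated below, the energy network stays fixed.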
        #print len(network_params)
        f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):
            p.set_value(data[idx])
        def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
            """
            :param targets_one_step: [batch_size, t]
            :param prev_label: [batch_size, t]
            :param tg_energy: [batch_size]
            :return:
            """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var})
        local_energy = local_energy.reshape((-1, length, params.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

                
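        # A softmax over the free label scores gives per-token distributions (predy);
        # prediction is simply the argmax of the raw scores.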
        predy_init = self.input_init[:, :length, :]

        a_params = [self.input_init]

        predy = T.nnet.softmax(predy_init.reshape((-1, params.num_labels)))
        predy = predy.reshape((-1, length, params.num_labels))

        prediction = T.argmax(predy_init, axis=2)

        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)
				
		
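        # NOTE: lamb is computed here but never used below; t_t is not defined in this
        # constructor, so this branch presumably relies on a module-level t_t or is
        # leftover from an annealed variant.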
        predy_f = predy.reshape((-1, params.num_labels))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        cost = T.mean(-cost11)

        #from adam import adam
        #updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

        self.inf_fn = theano.function([input_var, mask_var, mask_var1, length], cost, updates=updates_a)
        self.eval_fn = theano.function([input_var, mask_var, mask_var1, length], [prediction, -cost11], on_unused_input='ignore')
Ejemplo n.º 15
0
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden
        start0 = np.random.uniform(-0.02, 0.02, (1, 26)).astype('float32')
        end0 = np.zeros((1, 26)).astype('float32')
        end0[0, -1] = 1.0
        start = theano.shared(start0)
        end = theano.shared(end0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)
        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        l_local = lasagne.layers.DenseLayer(
            concat2, num_units=25, nonlinearity=lasagne.nonlinearities.linear)
        f_params = lasagne.layers.get_all_params(l_local, trainable=True)

        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)
        d_params = lasagne.layers.get_all_params(l_local, trainable=True)
        d_params.append(Wyy)
        self.d_params = d_params

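        # Inference network: a dropout-regularized BiLSTM tagger that shares the word
        # embeddings with the energy network; its parameters are collected in a_params,
        # while d_params holds the energy network's parameters plus the transition
        # matrix Wyy.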
        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize,
                                                 l_emb_word.W)

        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                   (-1, hidden))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                   (-1, hidden))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)
        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params

        y_in = T.ftensor3()
        y = T.imatrix()
        g = T.imatrix()
        gmask = T.fmatrix()
        y_mask = T.fmatrix()
        length = T.iscalar()
        # shape: n, L, 1
        #y1 = T.ftensor3()
        # shape: n, 1, 46

        predy0 = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        })
        predy = predy0.reshape((-1, length, 25))
        predy = predy * gmask[:, :, None]

        #newpredy = T.concatenate([predy, y0] , axis=2)
        # n , L, 46, 46
        # predy0: n, L, 25
        # energy loss
        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
            :param targets_one_step: [batch_size, t]
            :param prev_label: [batch_size, t]
            :param tg_energy: [batch_size]
            :return:
            """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step,
                                                axis=1)
            # respect the mask so padded steps do not change the accumulated energy
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
            return [targets_one_step, tg_energy_t]

        # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels),
        # but scan requires the iterable dimension to be first,
        # so we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels).
        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: g,
            l_mask_word: gmask
        })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * gmask[:, :, None]

        targets_shuffled = y_in.dimshuffle(1, 0, 2)
        masks_shuffled = gmask.dimshuffle(1, 0)
        target_time0 = targets_shuffled[0]
        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        length_index = T.sum(gmask, axis=1) - 1
        length_index = T.cast(length_index, 'int32')

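        # Label language model: an LSTM over label vectors of size 26 (25 tags plus an
        # end symbol), with a learned start vector; its parameters are loaded from
        # Label_LM.pickle and it scores both the gold and the predicted label
        # sequences below.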
        l_LM_in = lasagne.layers.InputLayer((None, None, 26))
        l_LM_mask = lasagne.layers.InputLayer(shape=(None, None))
        l_LM_lstm = lasagne.layers.LSTMLayer(l_LM_in,
                                             2 * hidden,
                                             mask_input=l_LM_mask)
        l_reshape_LM = lasagne.layers.ReshapeLayer(l_LM_lstm, (-1, 2 * hidden))
        l_LM = lasagne.layers.DenseLayer(
            l_reshape_LM,
            num_units=26,
            nonlinearity=lasagne.nonlinearities.softmax)

        LM_params = lasagne.layers.get_all_params(l_LM, trainable=True)
        LM_params.append(start)

        f = open('Label_LM.pickle', 'r')
        data = pickle.load(f)
        f.close()
        for idx, p in enumerate(LM_params):
            p.set_value(data[idx])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])

        pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]
        """add ground truth labels LM cost"""
        pos_predy_tmp0 = y_in[:, :, 0].reshape((-1, length, 1))
        pos_tmp0 = T.zeros_like(pos_predy_tmp0)
        pos_predy_lm = T.concatenate([y_in, pos_tmp0], axis=2)

        pos_predy_tmp = pos_predy_lm[:, 0, :].reshape((-1, 1, 26))
        pos_tmp = T.ones_like(pos_predy_tmp)

        sos = pos_tmp * (start.dimshuffle('x', 0, 1))
        eos = pos_tmp * (end.dimshuffle('x', 0, 1))
        pos_y_lm_in = T.concatenate([sos, pos_predy_lm], axis=1)
        pos_y_lm_out = T.concatenate([pos_predy_lm, eos], axis=1)

        pos_lm_mask_var = T.concatenate(
            [pos_tmp[:, 0, 0].reshape((-1, 1)), gmask], axis=1)
        pos_LM_out = lasagne.layers.get_output(l_LM, {
            l_LM_in: pos_y_lm_in,
            l_LM_mask: pos_lm_mask_var
        })
        pos_LM_out = pos_LM_out.reshape((-1, length + 1, 26))
        pos_LM_cost = T.sum(T.log(
            T.sum(pos_LM_out[:, :-1, :] * pos_y_lm_out[:, :-1, :], axis=2) +
            eps) * gmask,
                            axis=1)

        pos_cost = target_energies[-1] + T.sum(
            T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(
                pos_end_target, Wyy[:-1, -1]) + params.lm * pos_LM_cost
        check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)

        negtargets_shuffled = predy.dimshuffle(1, 0, 2)
        negtarget_time0 = negtargets_shuffled[0]
        neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])
        """predict label language cost"""

        neg_predy_tmp0 = predy[:, :, 0].reshape((-1, length, 1))
        neg_tmp0 = T.zeros_like(neg_predy_tmp0)
        neg_predy_lm = T.concatenate([predy, neg_tmp0], axis=2)

        neg_predy_tmp = neg_predy_lm[:, 0, :].reshape((-1, 1, 26))
        neg_tmp = T.ones_like(neg_predy_tmp)

        sos = neg_tmp * (start.dimshuffle('x', 0, 1))
        eos = neg_tmp * (end.dimshuffle('x', 0, 1))

        neg_y_lm_in = T.concatenate([sos, neg_predy_lm], axis=1)
        neg_y_lm_out = T.concatenate([neg_predy_lm, eos], axis=1)

        neg_lm_mask_var = T.concatenate(
            [neg_tmp[:, 0, 0].reshape((-1, 1)), gmask], axis=1)
        neg_LM_out = lasagne.layers.get_output(l_LM, {
            l_LM_in: neg_y_lm_in,
            l_LM_mask: neg_lm_mask_var
        })
        neg_LM_out = neg_LM_out.reshape((-1, length + 1, 26))
        neg_LM_cost = T.sum(T.log(
            T.sum(neg_LM_out[:, :-1, :] * neg_y_lm_out[:, :-1, :], axis=2) +
            eps) * gmask,
                            axis=1)

        neginitials = [negtarget_time0, neginitial_energy0]
        [_, negtarget_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials,
            sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])

        neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
        neg_cost = negtarget_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(
                neg_end_target, Wyy[:-1, -1]) + params.lm * neg_LM_cost

        y_f = y.flatten()
        predy_f = predy.reshape((-1, 25))

        ce_hinge = lasagne.objectives.categorical_crossentropy(
            predy_f + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * gmask, axis=1)

        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * gmask, axis=1)

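        # Structured hinge loss: delta0 is the L1 distance between the gold one-hot
        # labels and the predicted distributions (the margin), and the loss is the
        # margin plus the predicted energy minus the gold energy, clipped at zero.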
        delta0 = T.sum(abs((y_in - predy)), axis=2) * gmask
        delta0 = T.sum(delta0, axis=1)
        hinge_cost = delta0 + neg_cost - pos_cost
        hinge_cost = hinge_cost * T.gt(hinge_cost, 0)
        d_cost = T.mean(hinge_cost)
        d_cost0 = d_cost
        """select different regulizer"""
        g_cost = -d_cost0 + params.l2 * sum(
            lasagne.regularization.l2(x)
            for x in a_params) + params.l3 * T.mean(ce_hinge)
        ###g_cost = -d_cost0 + params.L2* sum(lasagne.regularization.l2(x) for x in a_params) - params.L31*T.mean(entropy_term)
        d_cost = d_cost0 + params.l2 * sum(
            lasagne.regularization.l2(x) for x in d_params)

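        # Alternating training: train_g updates the inference network (a_params) so as
        # to maximize the hinge term (its cost is -d_cost0 plus L2 and cross-entropy
        # regularizers), while train_d updates the energy network (d_params) with Adam
        # to minimize it, pushing the gold energy above the predicted energy by the
        # margin delta0.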
        self.a_params = a_params
        updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
        updates_g = lasagne.updates.apply_momentum(updates_g,
                                                   a_params,
                                                   momentum=0.9)
        self.train_g = theano.function(
            [g, gmask, y, y_in, length],
            [g_cost, d_cost0, pos_cost, neg_cost, delta0, check],
            updates=updates_g,
            on_unused_input='ignore')
        updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
        self.train_d = theano.function(
            [g, gmask, y, y_in, length],
            [d_cost, d_cost0, pos_cost, neg_cost, delta0, check],
            updates=updates_d,
            on_unused_input='ignore')

        # test the model and re-tune the inference network
        predy_test = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        },
                                               deterministic=True)
        predy_test = predy_test.reshape((-1, length, 25))
        pred = T.argmax(predy_test, axis=2)
        pg = T.eq(pred, y)
        pg = pg * gmask
        acc = 1.0 * T.sum(pg) / T.sum(gmask)

        negtargets_shuffled_test = predy_test.dimshuffle(1, 0, 2)
        negtarget_time0_test = negtargets_shuffled_test[0]

        neginitial_energy0_test = T.dot(negtarget_time0_test, Wyy[-1, :-1])
        neginitials_test = [negtarget_time0_test, neginitial_energy0_test]
        [_, negtarget_energies_test], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials_test,
            sequences=[negtargets_shuffled_test[1:], masks_shuffled[1:]])
        end_test_target = predy_test[T.arange(length_index.shape[0]),
                                     length_index]
        neg_cost_test = negtarget_energies_test[-1] + T.sum(
            T.sum(local_energy * predy_test, axis=2) * gmask, axis=1) + T.dot(
                end_test_target, Wyy[:-1, -1])

        test_cost = -T.mean(neg_cost_test) + params.l3 * T.mean(
            ce_hinge) - params.lm * T.mean(neg_LM_cost)
        test_updates = lasagne.updates.sgd(test_cost, a_params, params.eta)
        test_updates = lasagne.updates.apply_momentum(test_updates,
                                                      a_params,
                                                      momentum=0.9)
        self.test_time_turning = theano.function([g, gmask, y, length],
                                                 test_cost,
                                                 updates=test_updates,
                                                 on_unused_input='ignore')
        self.test_time1 = theano.function([g, gmask, y, y_in, length], [
            acc,
            T.mean(neg_cost),
            T.mean(pos_cost), params.l3 * T.mean(ce_hinge)
        ],
                                          on_unused_input='ignore')
        self.test_time = theano.function([g, gmask, y, length], acc)
        self.test_time2 = theano.function([g, gmask, length], pred)
Ejemplo n.º 16
0
    def __init__(self, params, data):

        self.get_pos_map(data)
        self.cap = params.cap
        self.lowercase = params.lowercase
        self.featuretype = params.featuretype

        chardim = params.chardim  #dimension of character network layer
        worddim = params.worddim  #dimension of character embedding and word LSTM layer

        if not params.nntype == "charagram":
            self.chars = self.get_character_dict(data)
            Ce = lasagne.init.Uniform(range=0.5 / len(self.chars))
            Ce_np = Ce.sample((len(self.chars), params.worddim))
            Ce = theano.shared(np.asarray(Ce_np, dtype=config.floatX))

        char = T.imatrix()
        charmask = T.matrix()
        word = T.imatrix()
        wordmask = T.matrix()

        idxs = T.ivector()
        Y = T.matrix()

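        # Three ways to build word representations from characters, selected by
        # params.nntype: "charlstm" (character LSTM), "charagram" (character n-gram
        # features through one or two dense layers), or "charcnn" (character
        # convolutions); the resulting We is used as the word embedding matrix below.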
        l_in_char = lasagne.layers.InputLayer((None, None))
        if params.nntype == "charlstm":
            l_mask_char = lasagne.layers.InputLayer(shape=(None, None))
            l_emb_char = lasagne.layers.EmbeddingLayer(
                l_in_char,
                input_size=Ce.get_value().shape[0],
                output_size=Ce.get_value().shape[1],
                W=Ce)
            l_lstm_char = lasagne.layers.LSTMLayer(l_emb_char,
                                                   chardim,
                                                   peepholes=True,
                                                   learn_init=False,
                                                   mask_input=l_mask_char)
            if not params.outgate:
                l_lstm_char = lasagne_lstm_nooutput(l_emb_char,
                                                    chardim,
                                                    peepholes=True,
                                                    learn_init=False,
                                                    mask_input=l_mask_char)
            l_We = lasagne.layers.SliceLayer(l_lstm_char, -1, 1)
            We = lasagne.layers.get_output(l_We, {
                l_in_char: char,
                l_mask_char: charmask
            })
        elif params.nntype == "charagram":
            char = T.matrix()
            self.featuremap = self.get_feature_map(data, params.featuretype,
                                                   params.cutoff,
                                                   params.lowercase)
            print "Number of features: ", len(self.featuremap)

            l_in_char = lasagne.layers.InputLayer(
                (None, len(self.featuremap) + 1))
            if self.cap:
                l_in_char = lasagne.layers.InputLayer(
                    (None, len(self.featuremap) + 2))
            l_1 = lasagne.layers.DenseLayer(l_in_char,
                                            chardim,
                                            nonlinearity=params.act)
            if params.numlayers == 1:
                l_We = lasagne.layers.DenseLayer(l_in_char,
                                                 chardim,
                                                 nonlinearity=params.act)
            elif params.numlayers == 2:
                l_We = lasagne.layers.DenseLayer(l_1,
                                                 chardim,
                                                 nonlinearity=params.act)
            else:
                raise ValueError('Only 1-2 layers are supported currently.')
            We = lasagne.layers.get_output(l_We, {l_in_char: char})
        elif params.nntype == "charcnn":
            l_emb_char = lasagne.layers.EmbeddingLayer(
                l_in_char,
                input_size=Ce.get_value().shape[0],
                output_size=Ce.get_value().shape[1],
                W=Ce)
            emb = lasagne.layers.DimshuffleLayer(l_emb_char, (0, 2, 1))
            conv_params = None
            if params.conv_type == 1:
                conv_params = [(175, 2), (175, 3), (175, 4)]
            else:
                conv_params = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5),
                               (150, 6)]
            layers = []
            for num_filters, filter_size in conv_params:
                conv = lasagne.layers.Conv1DLayer(emb,
                                                  num_filters,
                                                  filter_size,
                                                  nonlinearity=params.act)
                pl = lasagne.layers.GlobalPoolLayer(conv, theano.tensor.max)
                pl = lasagne.layers.FlattenLayer(pl)
                layers.append(pl)
            concat = lasagne.layers.ConcatLayer(layers)
            l_We = lasagne.layers.DenseLayer(concat,
                                             num_units=chardim,
                                             nonlinearity=params.act)
            We = lasagne.layers.get_output(l_We, {l_in_char: char})
        else:
            l_We = None
            We = None

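        # Word-level tagger: the character-derived embeddings feed a BiLSTM whose
        # forward and backward states are concatenated and mapped through a tanh layer
        # to a softmax over the POS tags.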
        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word = lasagne_embedding_layer_2(l_in_word, chardim, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                worddim,
                                                peepholes=True,
                                                learn_init=False,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                worddim,
                                                peepholes=True,
                                                learn_init=False,
                                                mask_input=l_mask_word,
                                                backwards=True)

        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, worddim))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, worddim))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        l_emb = lasagne.layers.DenseLayer(
            concat2,
            num_units=worddim,
            nonlinearity=lasagne.nonlinearities.tanh)
        l_out = lasagne.layers.DenseLayer(
            l_emb,
            num_units=len(self.tags),
            nonlinearity=lasagne.nonlinearities.softmax)
        embg = lasagne.layers.get_output(l_out, {
            l_in_word: word,
            l_mask_word: wordmask
        })

        embg = embg[idxs]
        prediction = T.argmax(embg, axis=1)

        self.all_params = lasagne.layers.get_all_params(
            l_out, trainable=True) + lasagne.layers.get_all_params(
                l_We, trainable=True)
        reg = 0.5 * params.LC * sum(
            lasagne.regularization.l2(x) for x in self.all_params)

        cost = T.nnet.categorical_crossentropy(embg, Y)
        cost = T.mean(cost) + reg

        self.feedforward_function = None
        self.scoring_function = None
        self.cost_function = None
        self.train_function = None

        if params.nntype == "charlstm":
            self.feedforward_function = theano.function(
                [char, charmask, word, wordmask, idxs], embg)
            self.scoring_function = theano.function(
                [char, charmask, word, wordmask, idxs], prediction)
            self.cost_function = theano.function(
                [char, charmask, word, wordmask, idxs, Y], cost)
            grads = theano.gradient.grad(cost, self.all_params)
            updates = lasagne.updates.momentum(
                grads, self.all_params, 0.2,
                momentum=0.95)  #same as Ling et al.
            self.train_function = theano.function(
                [char, charmask, word, wordmask, idxs, Y],
                cost,
                updates=updates)
        elif params.nntype == "charcnn" or params.nntype == "charagram":
            self.feedforward_function = theano.function(
                [char, word, wordmask, idxs], embg)
            self.scoring_function = theano.function(
                [char, word, wordmask, idxs], prediction)
            self.cost_function = theano.function(
                [char, word, wordmask, idxs, Y], cost)
            grads = theano.gradient.grad(cost, self.all_params)
            updates = lasagne.updates.momentum(
                grads, self.all_params, 0.2,
                momentum=0.95)  #same as Ling et al.
            self.train_function = theano.function(
                [char, word, wordmask, idxs, Y], cost, updates=updates)