Example No. 1
def test_accuracy_instance():
    from .metrics import accuracy_instance

    predictions_var, targets_var = T.imatrices('predictions', 'targets')
    accuracy_var = accuracy_instance(predictions_var, targets_var, \
        nb_classes=5, nb_samples_per_class=10, batch_size=16)
    accuracy_fn = theano.function([predictions_var, targets_var], accuracy_var)

    # Generate sample data
    targets = np.kron(np.arange(5), np.ones((16, 10))).astype('int32')
    predictions = np.zeros((16, 50)).astype('int32')

    indices = np.zeros((16, 5)).astype('int32')
    accuracy = np.zeros((16, 10))

    for i in range(16):
        for j in range(50):
            correct = np.random.binomial(1, 0.5)
            predictions[i, j] = correct * targets[i, j] + \
                (1 - correct) * ((targets[i, j] + 1) % 5)
            accuracy[i, indices[i, targets[i, j]]] += correct
            indices[i, targets[i, j]] += 1
    numpy_accuracy = np.mean(accuracy, axis=0) / 5
    theano_accuracy = accuracy_fn(predictions, targets)

    assert np.allclose(theano_accuracy, numpy_accuracy)
Example No. 2
def test_array():
    x = T.imatrices("x")
    y = T.log(x)[[0, 1], 0]
    f = theano.function(outputs=y, inputs=[x])
    print f([[1, 2, 3, 4, 5],
             [5, 4, 3, 2, 1]
    ])
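The snippets on this page assume the usual Theano/NumPy imports. As a reference only, a minimal self-contained sketch of the example above (using Python 3 print syntax; variable names are illustrative) might look like this:

import numpy as np
import theano
import theano.tensor as T

x = T.imatrices("x")                        # a single int32 matrix variable
y = T.log(x)[[0, 1], 0]                     # log of column 0 in rows 0 and 1
f = theano.function(inputs=[x], outputs=y)

data = np.array([[1, 2, 3, 4, 5],
                 [5, 4, 3, 2, 1]], dtype=np.int32)
print(f(data))                              # roughly [0.  1.6094], i.e. log(1) and log(5)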
Example No. 3
def test_batch_size():
    input_var_1, input_var_2 = T.tensor3s('input1', 'input2')
    target_var_1, target_var_2 = T.imatrices('target1', 'target2')
    # First model with `batch_size=16`
    output_var_1, _, params1 = memory_augmented_neural_network(
        input_var_1,
        target_var_1,
        batch_size=16,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)
    # Second model with `batch_size=1`
    output_var_2, _, params2 = memory_augmented_neural_network(
        input_var_2,
        target_var_2,
        batch_size=1,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)

    for (param1, param2) in zip(params1, params2):
        param2.set_value(param1.get_value())

    posterior_fn1 = theano.function([input_var_1, target_var_1], output_var_1)
    posterior_fn2 = theano.function([input_var_2, target_var_2], output_var_2)

    # Input has shape (batch_size, timesteps, vocabulary_size + actions_vocabulary_size + 3)
    test_input = np.random.rand(16, 50, 20 * 20)
    test_target = np.random.randint(5, size=(16, 50)).astype('int32')

    test_output1 = posterior_fn1(test_input, test_target)
    test_output2 = np.zeros_like(test_output1)

    for i in range(16):
        test_output2[i] = posterior_fn2(test_input[i][np.newaxis, :, :],
                                        test_target[i][np.newaxis, :])

    assert np.allclose(test_output1, test_output2)
Example No. 4
def test_batch_size():
    input_var_1, input_var_2 = T.tensor3s('input1', 'input2')
    target_var_1, target_var_2 = T.imatrices('target1', 'target2')
    # First model with `batch_size=16`
    output_var_1, _, params1 = memory_augmented_neural_network(
        input_var_1, target_var_1,
        batch_size=16,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)
    # Second model with `batch_size=1`
    output_var_2, _, params2 = memory_augmented_neural_network(
        input_var_2, target_var_2,
        batch_size=1,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)

    for (param1, param2) in zip(params1, params2):
        param2.set_value(param1.get_value())

    posterior_fn1 = theano.function([input_var_1, target_var_1], output_var_1)
    posterior_fn2 = theano.function([input_var_2, target_var_2], output_var_2)

    # Input has shape (batch_size, timesteps, vocabulary_size + actions_vocabulary_size + 3)
    test_input = np.random.rand(16, 50, 20 * 20)
    test_target = np.random.randint(5, size=(16, 50)).astype('int32')

    test_output1 = posterior_fn1(test_input, test_target)
    test_output2 = np.zeros_like(test_output1)

    for i in range(16):
        test_output2[i] = posterior_fn2(test_input[i][np.newaxis, :, :], test_target[i][np.newaxis, :])

    assert np.allclose(test_output1, test_output2)
Example No. 5
    def test_clone(self):
        # Data for unit testing
        X_unit = ['abcdef', 'abcdef', 'qwerty']
        X_unit = [[ord(c) for c in w] for w in X_unit]
        X_unit = np.array(X_unit, dtype='int8')
        n_alerts_unit, l_alerts_unit = X_unit.shape
        mask_unit = np.ones(X_unit.shape, dtype='int8')

        # Dimensions
        n_alerts = None
        l_alerts = None
        n_alphabet = 2**7  # All ASCII chars
        num_units = 10

        # Symbolic variables
        input_var, input_var2 = T.imatrices('inputs', 'inputs2')
        mask_var, mask_var2 = T.matrices('masks', 'masks2')
        target_var = T.dvector('targets')

        # build net for testing
        l_in = InputLayer(shape=(n_alerts, l_alerts),
                          input_var=input_var,
                          name='INPUT-LAYER')
        l_emb = EmbeddingLayer(l_in,
                               n_alphabet,
                               n_alphabet,
                               W=np.eye(n_alphabet),
                               name='EMBEDDING-LAYER')
        l_emb.params[l_emb.W].remove('trainable')  # Fix weight
        l_mask = InputLayer(shape=(n_alerts, l_alerts),
                            input_var=mask_var,
                            name='MASK-INPUT-LAYER')
        l_lstm = LSTMLayer(l_emb,
                           num_units=num_units,
                           name='LSTM-LAYER',
                           mask_input=l_mask)
        l_slice = SliceLayer(l_lstm, indices=-1, axis=1,
                             name="SLICE-LAYER")  # Only last timestep

        net = l_slice

        # clone
        l_in2 = InputLayer(shape=(n_alerts, l_alerts),
                           input_var=input_var2,
                           name='INPUT-LAYER2')
        l_mask2 = InputLayer(shape=(n_alerts, l_alerts),
                             input_var=mask_var2,
                             name='MASK-INPUT-LAYER2')
        net2 = lstm_rnn_tied_weights.clone(net, l_in2, l_mask2)

        self.assertNotEqual(repr(net), repr(net2))

        pred_unit = layers.get_output(net,
                                      inputs={
                                          l_in: input_var,
                                          l_mask: mask_var
                                      }).eval({
                                          input_var: X_unit,
                                          mask_var: mask_unit
                                      })

        pred_unit2 = layers.get_output(net2,
                                       inputs={
                                           l_in2: input_var2,
                                           l_mask2: mask_var2
                                       }).eval({
                                           input_var2: X_unit,
                                           mask_var2: mask_unit
                                       })

        self.assert_array_equal(pred_unit, pred_unit2)
Example No. 6
        def __init__(self, We_initial, char_embedd_table_initial, params):

		We = theano.shared(We_initial)
 
                # initial embedding for the InfNet
                We_inf = theano.shared(We_initial)
        	embsize = We_initial.shape[1]
        	hidden = params.hidden
		self.en_hidden_size = params.hidden_inf
		self.num_labels = 17
		self.de_hidden_size = params.de_hidden_size
		

                char_embedd_dim = params.char_embedd_dim
                char_dic_size = len(params.char_dic)
                char_embedd_table = theano.shared(char_embedd_table_initial)
                char_embedd_table_inf = theano.shared(char_embedd_table_initial)


		input_var = T.imatrix(name='inputs')
        	target_var = T.imatrix(name='targets')
		target_var_in = T.imatrix(name='targets')
        	mask_var = T.fmatrix(name='masks')
		mask_var1 = T.fmatrix(name='masks1')
                char_input_var = T.itensor3(name='char-inputs')

		length = T.iscalar()
		length0 = T.iscalar()
		t_t = T.fscalar()
		t_t0 = T.fscalar()		

                use_dropout = T.fscalar()
                use_dropout0 = T.fscalar()

		Wyy0 = np.random.uniform(-0.02, 0.02, (self.num_labels +1 , self.num_labels + 1)).astype('float32')
                Wyy = theano.shared(Wyy0)


                l_in_word = lasagne.layers.InputLayer((None, None))
                l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

		if params.emb ==1:
                        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,  input_size= We_initial.shape[0] , output_size = embsize, W =We, name='word_embedding')
                else:
                        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

                layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')

                layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
                layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')

                layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))


                # first get some necessary dimensions or parameters
                conv_window = 3
                num_filters = params.num_filters

                # construct convolution layer
                cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
                # infer the pool size for pooling (pool size should go through all time step of cnn)
                _, _, pool_size = cnn_layer.output_shape

                # construct max pool layer
                pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
                # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
                output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

                # finally, concatenate the two incoming layers together.
                incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

           

		l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
        	l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards = True)

        	concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
		
		l_reshape_concat = lasagne.layers.ReshapeLayer(concat,(-1,2*hidden))

		l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units= self.num_labels + 1, nonlinearity=lasagne.nonlinearities.linear)

		
		network_params = lasagne.layers.get_all_params(l_local, trainable=True)
                network_params.append(Wyy)

		
		print len(network_params)
		f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle','r')
		data = pickle.load(f)
		f.close()

		for idx, p in enumerate(network_params):

                        p.set_value(data[idx])


		self.params = []
		self.hos = []
                self.Cos = []
		self.encoder_lstm_layers = []
                self.decoder_lstm_layers = []
		self.lstm_layers_num = 1		

		ei, di, dt = T.imatrices(3)    #place holders
                decoderInputs0 ,em, em1, dm, tf, di0 =T.fmatrices(6)
		ci = T.itensor3()

                #### the last one is for the start symbol
                self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform(self.num_labels +1, self.de_hidden_size), borrow=True)

                self.linear = theano.shared(name="Linear", value = init_xavier_uniform(self.de_hidden_size + 2*self.en_hidden_size, self.num_labels), borrow= True)
		self.linear_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, )*0., dtype=theano.config.floatX), borrow=True)
                #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True)
		
                #self.hidden_bias = theano.shared(
                #        name="Hidden to Bias",
                #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
                #        borrow=True
                #        )

       

		input_var_shuffle = input_var.dimshuffle(1, 0)
		mask_var_shuffle = mask_var.dimshuffle(1, 0)
		target_var_in_shuffle = target_var_in.dimshuffle(1,0)
		target_var_shuffle = target_var.dimshuffle(1,0)


		self.params += [We_inf, self.linear, self.de_lookuptable, self.linear_bias] 
                
                ######[batch, sent_length, embsize] 
		state_below = We_inf[input_var_shuffle.flatten()].reshape((input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))
                
                ###### character word embedding
                layer_char_input_inf = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')
                layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2]))
                layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(layer_char_inf, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table_inf,
                                                             name='char_embedding_inf')

                layer_char_inf = lasagne.layers.DimshuffleLayer(layer_char_embedding_inf, pattern=(0, 2, 1))
                #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

                cnn_layer_inf = lasagne.layers.Conv1DLayer(layer_char_inf, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn_inf')
               
                pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size)
                output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1]))
                char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True)
                self.params += char_params          
 
                ###### [batch, sent_length, num_filters]
                #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf:char_input_var})
                char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)

       
                char_state_below = dropout_layer(char_state_below, use_dropout, trng)
                
                char_state_shuff = char_state_below.dimshuffle(1,0, 2) 
                state_below = T.concatenate([state_below, char_state_shuff], axis=2)
                
                state_below = dropout_layer(state_below, use_dropout, trng)

		enclstm_f = LSTM(embsize+num_filters, self.en_hidden_size)
                enclstm_b = LSTM(embsize+num_filters, self.en_hidden_size, True)
                self.encoder_lstm_layers.append(enclstm_f)    #append
                self.encoder_lstm_layers.append(enclstm_b)    #append
                self.params += enclstm_f.params + enclstm_b.params   #concatenate

                hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
                hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

                hs = T.concatenate([hs_f, hs_b], axis=2)
                Cs = T.concatenate([Cs_f, Cs_b], axis=2)

		hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
                Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
		#self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
                #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
                self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
                self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
		
		Encoder = hs
                	
		state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape((target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size))

		for i in range(self.lstm_layers_num):
                        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
                        self.decoder_lstm_layers += declstm,    #append
                        self.params += declstm.params    #concatenate
                        ho, Co = self.hos[i], self.Cos[i]
                        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)		
		

		decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
		linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
                softmax_outputs, updates = theano.scan(
                        fn=lambda x: T.nnet.softmax(x),
                        sequences=[linear_outputs],
                        )

		def _NLL(pred, y, m):
                        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

		"""
		costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle])
                #loss = costs.sum() / mask_var.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params)
		loss = costs.sum() / mask_var.sum()		

                updates = lasagne.updates.sgd(loss, self.params, self.eta)
                updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

		###################################################
                #### using the ground truth when training
                ##################################################
                self._train = theano.function(
                        inputs=[ei, em, di, dm, dt],
                        outputs=[loss, softmax_outputs],
                        updates=updates,
                        givens={input_var:ei, mask_var:em, target_var_in:di, decoderMask:dm, target_var:dt}
                        )
		"""
	

		def _step2(ctx_, state_, hs_, Cs_):

                        hs, Cs = [], []
                        token_idxs = T.cast(state_.argmax(axis=-1), "int32" )
                        msk_ = T.fill( (T.zeros_like(token_idxs, dtype="float32")), 1.)
                        msk_ = msk_.dimshuffle('x', 0)
                        state_below0 = self.de_lookuptable[token_idxs].reshape((1, ctx_.shape[0], self.de_hidden_size))
                        for i, lstm in enumerate(self.decoder_lstm_layers):
                                h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])    #mind msk
                                hs += h[-1],
                                Cs += C[-1],
                                state_below0 = h

                        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
			state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
                        state_below0 = T.concatenate([ctx_, state_below0], axis =1)			

                        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
                        state_below = T.nnet.softmax(newpred)
                        ##### the begin symbol probability is 0
                        extra_p = T.zeros_like(hs[:,:,0])
                        state_below = T.concatenate([state_below, extra_p.T], axis=1)


                        return state_below, hs, Cs


		hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")

                train_outputs, _ = theano.scan(
                        fn=_step2,
			sequences = [Encoder],
                        outputs_info=[decoderInputs0, hs0, Cs0],
                        n_steps=input_var_shuffle.shape[0]
                        )

                predy = train_outputs[0].dimshuffle(1, 0 , 2)
		predy = predy[:,:,:-1]*mask_var[:,:,None]
		predy0 = predy.reshape((-1, 17))
          
 

	
		def inner_function( targets_one_step, mask_one_step,  prev_label, tg_energy):
                        """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """                 
                        new_ta_energy = T.dot(prev_label, Wyy[:-1,:-1])
                        new_ta_energy_t = tg_energy + T.sum(new_ta_energy*targets_one_step, axis =1)
			tg_energy_t = T.switch(mask_one_step, new_ta_energy_t,  tg_energy)

                        return [targets_one_step, tg_energy_t]


		local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input:char_input_var})
		local_energy = local_energy.reshape((-1, length, 17))
                local_energy = local_energy*mask_var[:,:,None]		

		#####################
                # for the end symbol of a sequence
		####################

		end_term = Wyy[:-1,-1]
                local_energy = local_energy + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None]


		#predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var})

		predy_in = T.argmax(predy0, axis=1)
                A = T.extra_ops.to_one_hot(predy_in, 17)
                A = A.reshape((-1, length, 17))		

		#predy = predy0.reshape((-1, length, 25))
		#predy = predy*mask_var[:,:,None]

		
		targets_shuffled = predy.dimshuffle(1, 0, 2)
                target_time0 = targets_shuffled[0]
		
		masks_shuffled = mask_var.dimshuffle(1, 0)		 

                initial_energy0 = T.dot(target_time0, Wyy[-1,:-1])


                initials = [target_time0, initial_energy0]
                [ _, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
                cost11 = target_energies[-1] + T.sum(T.sum(local_energy*predy, axis=2)*mask_var, axis=1)

		
                cost = T.mean(-cost11)		
  
				
		from momentum import momentum
                updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

                self.train_fn = theano.function(
                                inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
                                outputs=[cost],
                                updates=updates_a,
                                on_unused_input='ignore',
                                givens={input_var:ei, char_input_var:ci, mask_var:em, mask_var1:em1, length: length0, decoderInputs0:di0, use_dropout:use_dropout0}
                                )


	
		
		prediction = T.argmax(predy, axis=2)
		corr = T.eq(prediction, target_var)
        	corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        	num_tokens = mask_var.sum(dtype=theano.config.floatX)

		self.eval_fn = theano.function(
                                inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
                                outputs=[prediction, -cost11],
				on_unused_input='ignore',
                                givens={input_var:ei, char_input_var:ci, mask_var:em, mask_var1:em1, length: length0, decoderInputs0:di0, use_dropout:use_dropout0}
                                )        	
Example No. 7
    def __init__(self,
                 hidden_size=4,
                 nclasses=3,
                 num_embeddings=1000,
                 embedding_dim=2,
                 window_size=7,
                 memory_size=6,
                 n_memory_slots=8):

        questions, docs = T.imatrices('questions', 'docs')
        y_true_matrix = T.imatrix('y_true')

        n_question_slots = int(n_memory_slots / 4)  # TODO derive this from an arg
        n_doc_slots = n_memory_slots - n_question_slots
        n_instances = questions.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            'emb': (num_embeddings + 1, embedding_dim),
            'Wg_q': (window_size * embedding_dim, n_question_slots),
            'Wg_d': (window_size * embedding_dim, n_doc_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_q': (hidden_size, n_question_slots),
            'We_d': (hidden_size, n_doc_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size,
            'w_q': (n_question_slots,),
            'w_d': (n_doc_slots,),
            'M_q': (memory_size, n_question_slots),
            # TODO can we set M0 to zeros without having issues with cosine_dist?
            'M_d': (memory_size, n_doc_slots)  # TODO can we set M0 to zeros without having issues with cosine_dist?
        }

        zeros = {
            # attr: shape
            'bh': hidden_size,
            'bg_q': n_question_slots,
            'bg_d': n_doc_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_q': n_question_slots,
            'be_d': n_doc_slots,
            'b': nclasses
        }

        def random_shared(shape):
            return theano.shared(
                0.2 * numpy.random.uniform(-1.0, 1.0, shape).astype(theano.config.floatX))

        def zeros_shared(shape):
            return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(randoms[key]))

        for key in zeros:
            # create an attribute with associated shape and values = 0
            setattr(self, key, zeros_shared(zeros[key]))

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

        for key in 'h0 w_q M_q w_d M_d'.split():
            setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))

        self.names = zeros.keys() + randoms.keys()
        self.params = [eval('self.' + name) for name in 'bh'.split()]

        def recurrence(i, h_tm1, w_q, M_q, w_d=None, M_d=None, is_question=True):
            """
            notes
            Headers from paper in all caps
            mem = n_question_slots if is_question else n_doc_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_q: attention weights for question memory
            :param M_q: question memory
            :param w_d: attention weights for docs memory
            :param M_d: docs memory
            :param is_question: we use different parts of memory when working with a question
            :return: [y_t = model outputs,
                      i + 1 = increment index,
                      h_t, w_t, M_t (see above)]
            """
            if not is_question:
                assert w_d is not None and M_d is not None

            # get representation of word window
            idxs = questions if is_question else docs  # [instances, bucket_width]
            pad = T.zeros((idxs.shape[0], self.window_size // 2), dtype='int32')
            padded = T.concatenate([pad, idxs, pad], axis=1)
            window = padded[:, i:i + window_size]  # [instances, window_size]
            x_t = self.emb[window].flatten(ndim=2)  # [instances, window_size * embedding_dim]

            # EXTERNAL MEMORY READ
            # eqn 15

            if is_question:
                M_read = M_q  # [instances, memory_size, n_question_slots]
                w_read = w_q  # [instances, n_question_slots]
            else:
                M_read = T.concatenate([M_q, M_d], axis=2)  # [instances, memory_size, n_doc_slots]
                w_read = T.concatenate([w_q, w_d], axis=1)  # [instances, n_doc_slots]

            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            def get_attention(Wg, bg, M, w):
                g_t = T.nnet.sigmoid(T.dot(x_t, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.log(1 + T.exp(beta))
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g_t) * w + g_t * w_hat  # [instances, mem]

            w_q = get_attention(self.Wg_q, self.bg_q, M_q, w_q)  # [instances, n_question_slots]
            if not is_question:
                w_d = get_attention(self.Wg_d, self.bg_d, M_d, w_d)  # [instances, n_doc_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h_t = T.dot(x_t, self.Wx) + T.dot(c, self.Wh) + self.bh  # [instances, hidden_size]

            # eqn 10
            y_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v_t = T.dot(h_t, self.Wv) + self.bv  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f_t = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u_t = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v_t = v_t.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f_t + T.batched_dot(v_t, u_t)  # [instances, memory_size, mem]

            M_q = update_memory(self.We_q, self.be_q, w_q, M_q)
            attention_and_memory = [w_q, M_q]
            if not is_question:
                M_d = update_memory(self.We_d, self.be_d, w_d, M_d)
                attention_and_memory += [w_d, M_d]
            return [y_t, i + 1, h_t] + attention_and_memory

        outputs_info = [None, T.constant(0, dtype='int32'), self.h0, self.w_q, self.M_q]
        ask_question = partial(recurrence, is_question=True)
        answer_question = partial(recurrence, is_question=False)

        [_, _, h, w, M], _ = theano.scan(fn=ask_question,
                                         outputs_info=outputs_info,
                                         n_steps=questions.shape[1],
                                         name='ask_scan')

        outputs_info[2:] = [param[-1, :, :] for param in (h, w, M)]

        output, _ = theano.scan(fn=answer_question,
                                outputs_info=outputs_info + [self.w_d, self.M_d],
                                n_steps=docs.shape[1],
                                name='train_scan')

        y_dist = output[0].dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_pred = y_dist.argmax(axis=1)
        y_true = y_true_matrix.ravel()
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)

        losses = T.nnet.categorical_crossentropy(y_dist, y_true)
        loss = lasagne.objectives.aggregate(losses, weights)

        self.test = theano.function(inputs=[questions, docs, y_true_matrix],
                                    outputs=[T.grad(loss, self.bh)])

        updates = lasagne.updates.adadelta(loss, self.params, learning_rate=.001)

        # theano functions
        self.predict = theano.function(inputs=[questions, docs],
                                       outputs=y_pred)

        self.train = theano.function(inputs=[questions, docs, y_true_matrix],
                                     outputs=[y_pred, loss],
                                     updates=updates,
                                     allow_input_downcast=True)

        normalized_embeddings = self.emb / T.sqrt((self.emb ** 2).sum(axis=1)).dimshuffle(0, 'x')
        self.normalize = theano.function(inputs=[],
                                         updates={self.emb: normalized_embeddings})
Example No. 8
    def __init__(self,
                 hidden_size=4,
                 nclasses=3,
                 num_embeddings=1000,
                 embedding_dim=2,
                 window_size=7,
                 memory_size=6,
                 n_memory_slots=8):

        questions, docs = T.imatrices('questions', 'docs')
        y_true_matrix = T.imatrix('y_true')

        n_question_slots = int(n_memory_slots /
                               4)  # TODO derive this from an arg
        n_doc_slots = n_memory_slots - n_question_slots
        n_instances = questions.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            'emb': (num_embeddings + 1, embedding_dim),
            'Wg_q': (window_size * embedding_dim, n_question_slots),
            'Wg_d': (window_size * embedding_dim, n_doc_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_q': (hidden_size, n_question_slots),
            'We_d': (hidden_size, n_doc_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size,
            'w_q': (n_question_slots, ),
            'w_d': (n_doc_slots, ),
            'M_q': (memory_size, n_question_slots),
            # TODO can we set M0 to zeros without having issues with cosine_dist?
            'M_d': (
                memory_size, n_doc_slots
            )  # TODO can we set M0 to zeros without having issues with cosine_dist?
        }

        zeros = {
            # attr: shape
            'bh': hidden_size,
            'bg_q': n_question_slots,
            'bg_d': n_doc_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_q': n_question_slots,
            'be_d': n_doc_slots,
            'b': nclasses
        }

        def random_shared(shape):
            return theano.shared(0.2 * numpy.random.uniform(
                -1.0, 1.0, shape).astype(theano.config.floatX))

        def zeros_shared(shape):
            return theano.shared(numpy.zeros(shape,
                                             dtype=theano.config.floatX))

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(randoms[key]))

        for key in zeros:
            # create an attribute with associated shape and values = 0
            setattr(self, key, zeros_shared(zeros[key]))

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param),
                            repeats=n_instances,
                            axis=0)

        for key in 'h0 w_q M_q w_d M_d'.split():
            setattr(self, key,
                    repeat_for_each_instance(self.__getattribute__(key)))

        self.names = zeros.keys() + randoms.keys()
        self.params = [eval('self.' + name) for name in 'bh'.split()]

        def recurrence(i,
                       h_tm1,
                       w_q,
                       M_q,
                       w_d=None,
                       M_d=None,
                       is_question=True):
            """
            notes
            Headers from paper in all caps
            mem = n_question_slots if is_question else n_doc_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_q: attention weights for question memory
            :param M_q: question memory
            :param w_d: attention weights for docs memory
            :param M_d: docs memory
            :param is_question: we use different parts of memory when working with a question
            :return: [y_t = model outputs,
                      i + 1 = increment index,
                      h_t, w_t, M_t (see above)]
            """
            if not is_question:
                assert w_d is not None and M_d is not None

            # get representation of word window
            idxs = questions if is_question else docs  # [instances, bucket_width]
            pad = T.zeros((idxs.shape[0], self.window_size // 2),
                          dtype='int32')
            padded = T.concatenate([pad, idxs, pad], axis=1)
            window = padded[:, i:i + window_size]  # [instances, window_size]
            x_t = self.emb[window].flatten(
                ndim=2)  # [instances, window_size * embedding_dim]

            # EXTERNAL MEMORY READ
            # eqn 15

            if is_question:
                M_read = M_q  # [instances, memory_size, n_question_slots]
                w_read = w_q  # [instances, n_question_slots]
            else:
                M_read = T.concatenate(
                    [M_q, M_d],
                    axis=2)  # [instances, memory_size, n_doc_slots]
                w_read = T.concatenate([w_q, w_d],
                                       axis=1)  # [instances, n_doc_slots]

            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            def get_attention(Wg, bg, M, w):
                g_t = T.nnet.sigmoid(T.dot(x_t, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.log(1 + T.exp(beta))
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g_t) * w + g_t * w_hat  # [instances, mem]

            w_q = get_attention(self.Wg_q, self.bg_q, M_q,
                                w_q)  # [instances, n_question_slots]
            if not is_question:
                w_d = get_attention(self.Wg_d, self.bg_d, M_d,
                                    w_d)  # [instances, n_doc_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h_t = T.dot(x_t, self.Wx) + T.dot(
                c, self.Wh) + self.bh  # [instances, hidden_size]

            # eqn 10
            y_t = T.nnet.softmax(T.dot(h_t, self.W) +
                                 self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v_t = T.dot(h_t, self.Wv) + self.bv  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f_t = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u_t = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v_t = v_t.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f_t + T.batched_dot(
                    v_t, u_t)  # [instances, memory_size, mem]

            M_q = update_memory(self.We_q, self.be_q, w_q, M_q)
            attention_and_memory = [w_q, M_q]
            if not is_question:
                M_d = update_memory(self.We_d, self.be_d, w_d, M_d)
                attention_and_memory += [w_d, M_d]
            return [y_t, i + 1, h_t] + attention_and_memory

        outputs_info = [
            None,
            T.constant(0, dtype='int32'), self.h0, self.w_q, self.M_q
        ]
        ask_question = partial(recurrence, is_question=True)
        answer_question = partial(recurrence, is_question=False)

        [_, _, h, w, M], _ = theano.scan(fn=ask_question,
                                         outputs_info=outputs_info,
                                         n_steps=questions.shape[1],
                                         name='ask_scan')

        outputs_info[2:] = [param[-1, :, :] for param in (h, w, M)]

        output, _ = theano.scan(fn=answer_question,
                                outputs_info=outputs_info +
                                [self.w_d, self.M_d],
                                n_steps=docs.shape[1],
                                name='train_scan')

        y_dist = output[0].dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_pred = y_dist.argmax(axis=1)
        y_true = y_true_matrix.ravel()
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)

        losses = T.nnet.categorical_crossentropy(y_dist, y_true)
        loss = lasagne.objectives.aggregate(losses, weights)

        self.test = theano.function(inputs=[questions, docs, y_true_matrix],
                                    outputs=[T.grad(loss, self.bh)])

        updates = lasagne.updates.adadelta(loss,
                                           self.params,
                                           learning_rate=.001)

        # theano functions
        self.predict = theano.function(inputs=[questions, docs],
                                       outputs=y_pred)

        self.train = theano.function(inputs=[questions, docs, y_true_matrix],
                                     outputs=[y_pred, loss],
                                     updates=updates,
                                     allow_input_downcast=True)

        normalized_embeddings = self.emb / T.sqrt(
            (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
        self.normalize = theano.function(
            inputs=[], updates={self.emb: normalized_embeddings})
Example No. 9
def test_dot():
    x = T.imatrices("x")
    y = T.dot(x, [[1,2], [1,2], [1,2]])
    f = theano.function(inputs=[x], outputs=y)
    print f([[1, 2, 3], [2, 3, 4]])
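For comparison, the same product can be checked with plain NumPy (a hypothetical cross-check, not part of the original test): the (2, 3) input times the fixed (3, 2) matrix yields a (2, 2) result.

import numpy as np

a = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.int32)   # same input as above
b = np.array([[1, 2], [1, 2], [1, 2]])                 # constant matrix from test_dot
print(a.dot(b))   # [[ 6 12]
                  #  [ 9 18]]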
Example No. 10
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        self.num_labels = params.num_labels
        self.de_hidden_size = params.de_hidden_size
        self.en_hidden_size = params.en_hidden_size

        print params.de_hidden_size, hidden, params.num_labels

        self.lstm_layers_num = 1

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        target_var_in = T.imatrix(name='in_targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        length0 = T.iscalar()
        t_t = T.fscalar()
        t_t0 = T.fscalar()

        Wyy0 = np.random.uniform(
            -0.02, 0.02,
            (self.num_labels + 1, self.num_labels + 1)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                512,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                512,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * 512))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=self.num_labels,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):
            #print data[idx].shape
            p.set_value(data[idx])

        self.params = []
        self.hos = []
        self.Cos = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []

        ei, di, dt = T.imatrices(3)  #place holders
        decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(
                self.de_hidden_size + 2 * self.en_hidden_size,
                self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)
        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        input_var_shuffle = input_var.dimshuffle(1, 0)
        mask_var_shuffle = mask_var.dimshuffle(1, 0)
        target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
        target_var_shuffle = target_var.dimshuffle(1, 0)

        self.params += [self.linear, self.linear_bias,
                        self.de_lookuptable]  #concatenate
        state_below = We[input_var_shuffle.flatten()].reshape(
            (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))
        enclstm_f = LSTM(embsize, self.en_hidden_size)
        enclstm_b = LSTM(embsize, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  #append
        self.encoder_lstm_layers.append(enclstm_b)  #append
        self.params += enclstm_f.params + enclstm_b.params  #concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
        hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

        hs = T.concatenate([hs_f, hs_b], axis=2)
        Cs = T.concatenate([Cs_f, Cs_b], axis=2)

        hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),
        self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),

        Encoder = hs

        ei, di, dt = T.imatrices(3)  #place holders
        em, dm, tf, di0 = T.fmatrices(4)
        self.encoder_function = theano.function(inputs=[ei, em],
                                                outputs=Encoder,
                                                givens={
                                                    input_var: ei,
                                                    mask_var: em
                                                })

        state_below = self.de_lookuptable[
            target_var_in_shuffle.flatten()].reshape(
                (target_var_in_shuffle.shape[0],
                 target_var_in_shuffle.shape[1], self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, mask_var_shuffle,
                                              ho, Co)

        decoder_lstm_outputs = T.concatenate([Encoder, state_below], axis=2)

        linear_outputs = T.dot(decoder_lstm_outputs,
                               self.linear) + self.linear_bias[None, None, :]
        softmax_outputs, updates = theano.scan(
            fn=lambda x: T.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * T.log(pred[T.arange(input_var.shape[0]), y])

        def _step2(ctx_, state_, hs_, Cs_):

            #print ctx_.shape, state_.shape, hs_.shape, Cs_.shape

            hs, Cs = [], []
            token_idxs = T.cast(state_.argmax(axis=-1), "int32")
            msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, ctx_.shape[0], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
            state_below0 = state_below0.reshape(
                (ctx_.shape[0], self.de_hidden_size))
            state_below0 = T.concatenate([ctx_, state_below0], axis=1)

            newpred = T.dot(state_below0,
                            self.linear) + self.linear_bias[None, :]
            state_below = T.nnet.softmax(newpred)

            extra_p = T.zeros_like(hs[:, :, 0])
            state_below = T.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        ctx_0, state_0 = T.fmatrices(2)
        hs_0 = T.ftensor3()
        Cs_0 = T.ftensor3()
        state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
        self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                      [state_below_tmp, hs_tmp, Cs_tmp],
                                      name='f_next')

        hs0, Cs0 = T.as_tensor_variable(
            self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=input_var_shuffle.shape[0])

        predy = train_outputs[0].dimshuffle(1, 0, 2)
        predy = predy[:, :, :-1] * mask_var[:, :, None]
        predy0 = predy.reshape((-1, self.num_labels))

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: input_var,
            l_mask_word: mask_var
        })
        local_energy = local_energy.reshape((-1, length, self.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var})

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
        A = A.reshape((-1, length, self.num_labels))

        #predy = predy0.reshape((-1, length, 25))
        #predy = predy*mask_var[:,:,None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the ground-truth energy

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)

        #predy_f =  predy.reshape((-1, 25))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy0 + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)
        """
		f = open('F0_simple.pickle')
                PARA = pickle.load(f)
                f.close()
                l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params))


                cost = T.mean(-cost11) + params.L2*l2_term
		"""

        ##from adam import adam
        ##updates_a = adam(cost, self.params, params.eta)

        #updates_a = lasagne.updates.sgd(cost, self.params, params.eta)
        #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9)

        from momentum import momentum
        updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function(
                inputs=[ei, dt, em, em1, length0, t_t0, di0],
                outputs=[cost, ce_hinge],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore')
        else:

            self.train_fn = theano.function(
                inputs=[ei, dt, em, em1, length0, t_t0, di0],
                outputs=[cost, entropy_term],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore')

        prediction = T.argmax(predy, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, di0],
            outputs=[cost11, cost110, corr_train, num_tokens, prediction],
            on_unused_input='ignore',
            givens={
                input_var: ei,
                target_var: dt,
                mask_var: em,
                mask_var1: em1,
                length: length0,
                decoderInputs0: di0
            })
Ejemplo n.º 11
0
    def __init__(self,
                 hidden_size=100,
                 nclasses=73,
                 num_embeddings=11359,
                 embedding_dim=100,
                 window_size=1,
                 memory_size=40,
                 n_memory_slots=8,
                 go_code=1,
                 depth=2,
                 load_dir=None):

        articles, titles = T.imatrices('articles', 'titles')
        n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
        n_title_slots = n_memory_slots - n_article_slots
        n_instances = articles.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            # 'emb': (num_embeddings + 1, embedding_dim),
            'M_a': (memory_size, n_article_slots),
            'M_t': (memory_size, n_title_slots),
            'w_a': (n_article_slots, ),
            'w_t': (n_title_slots, ),
            'Wg_a': (window_size * embedding_dim, n_article_slots),
            'Wg_t': (window_size * embedding_dim, n_title_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_a': (hidden_size, n_article_slots),
            'We_t': (hidden_size, n_title_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size
        }

        zeros = {
            # attr: shape
            'bg_a': n_article_slots,
            'bg_t': n_title_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_a': n_article_slots,
            'be_t': n_title_slots,
            'bh': hidden_size,
            'b': nclasses,
        }

        for l in range(depth):
            randoms['gru' + str(l)] = (1, embedding_dim)

        def random_shared(name):
            shape = randoms[name]
            return theano.shared(
                0.2 *
                np.random.normal(size=shape).astype(theano.config.floatX),
                name=name)

        def zeros_shared(name):
            shape = zeros[name]
            return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                                 name=name)

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(key))

        for key in zeros:
            # create an attribute with associated shape and values equal to 0
            setattr(self, key, zeros_shared(key))

        self.names = randoms.keys() + zeros.keys()
        # self.names.remove('emb')  # no need to save or update embeddings
        scan_vars = 'h0 w_a M_a w_t M_t'.split()

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param),
                            repeats=n_instances,
                            axis=0)

        for key in scan_vars:
            setattr(self, key,
                    repeat_for_each_instance(self.__getattribute__(key)))
            self.names.remove(key)

        if load_dir is not None:
            with open(os.path.join(load_dir, 'params.pkl')) as handle:
                params = pickle.load(handle)
                self.__dict__.update(params)

        def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
            """
            notes
            Headers from paper in all caps
            mem = n_article_slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param args: gru_weights, maybe w_t, maybe M_t
                   gru_weights: weights with which to initialize GRULayer on each time step
                   w_t: attention weights for titles memory
                   M_t: titles memory
            :param kwargs: is_training, is_article
                   is_training:
                   is_article: we use different parts of memory when working with an article
            :return: [y = model outputs,
                      i + 1 = increment index,
                      h, w, M (see above)]
            """
            is_training = kwargs['is_training']
            is_article = kwargs['is_article']
            gru_weights = args[:depth]
            if len(args) > depth:
                w_t = args[depth]
                M_t = args[depth + 1]

            i_type = T.iscalar if is_article or is_training else T.ivector
            assert i.type == i_type

            if not is_article:
                assert w_t is not None and M_t is not None

            word_idxs = i
            if is_article or is_training:
                # get representation of word window
                document = articles if is_article else titles  # [instances, bucket_width]
                word_idxs = document[:, i:i + 1]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

            input = InputLayer(shape=(None, 1), input_var=word_idxs)
            embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
            gru = GRULayer(incoming=embed,
                           num_units=embedding_dim,
                           hid_init=self.gru0)
            for weight in gru_weights:
                gru = GRULayer(incoming=gru,
                               num_units=embedding_dim,
                               hid_init=weight)
            x_i = get_output(gru).flatten(ndim=2)
            x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

            gru_weights = []

            if is_article:
                M_read = M_a  # [instances, memory_size, n_article_slots]
                w_read = w_a  # [instances, n_article_slots]
            else:
                M_read = T.concatenate(
                    [M_a, M_t],
                    axis=2)  # [instances, memory_size, n_title_slots]
                w_read = T.concatenate([w_a, w_t],
                                       axis=1)  # [instances, n_title_slots]

            # eqn 15
            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            # EXTERNAL MEMORY READ
            def get_attention(Wg, bg, M, w):
                g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.nnet.softplus(beta)
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g) * w + g * w_hat  # [instances, mem]
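            # content-based addressing: a key and a sharpness beta are computed
            # from h_{t-1}, matched against memory by cosine similarity, and the
            # result is interpolated with the previous weights through the gate g
            # (computed from the current input x_i).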

            w_a = get_attention(self.Wg_a, self.bg_a, M_a,
                                w_a)  # [instances, n_article_slots]
            if not is_article:
                w_t = get_attention(self.Wg_t, self.bg_t, M_t,
                                    w_t)  # [instances, n_title_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h = T.dot(c, self.Wh) + T.dot(
                x_i, self.Wx) + self.bh  # [instances, hidden_size]

            # eqn 10
            y = T.nnet.softmax(T.dot(h, self.W) +
                               self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v = T.tanh(T.dot(h, self.Wv) +
                           self.bv)  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f + T.batched_dot(v, u) * (
                    1 - f)  # [instances, memory_size, mem]
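            # erase-then-add memory update: each slot is decayed by f (the erase
            # vector gated by the write weights) and the new content vector v is
            # written into the freed capacity.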

            M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            attention_and_memory = [w_a, M_a]
            if not is_article:
                M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
                attention_and_memory += [w_t, M_t]

            y_max = y.argmax(axis=1).astype(int32)
            next_idxs = i + 1 if is_training or is_article else y_max
            return [y, y_max, next_idxs, h] + attention_and_memory

        read_article = partial(recurrence, is_training=True, is_article=True)
        # for read_article, it actually doesn't matter whether is_training is true

        i0 = T.constant(0, dtype=int32, name='first_value_of_i')
        gru_weights = [eval('self.gru' + str(l)) for l in range(depth)]
        outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a
                        ] + gru_weights

        [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                            outputs_info=outputs_info,
                                            n_steps=articles.shape[1],
                                            name='read_scan')

        produce_title = partial(recurrence, is_training=True, is_article=False)
        outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
        outputs_info.extend([self.w_t, self.M_t])
        bucket_width = titles.shape[
            1] - 1  # subtract 1 because <go> is omitted in y_true
        [y, y_max, _, _, _, _, _,
         _], _ = theano.scan(fn=produce_title,
                             outputs_info=outputs_info,
                             n_steps=bucket_width,
                             name='train_scan')

        # loss and updates
        y_clip = T.clip(y, .01, .99)
        y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
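        # each target token is weighted inversely to its frequency in the batch,
        # and padding tokens (index 0) are masked out entirely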
        losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
        loss = objectives.aggregate(losses, weights, mode='sum')
        updates = adadelta(loss, self.params())

        self.learn = theano.function(inputs=[articles, titles],
                                     outputs=[y_max.T, loss],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     name='learn')

        produce_title_test = partial(recurrence,
                                     is_training=False,
                                     is_article=False)

        self.test = theano.function(inputs=[articles, titles],
                                    outputs=[y_max.T],
                                    on_unused_input='ignore')

        outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
        [_, y_max, _, _, _, _, _,
         _], _ = theano.scan(fn=produce_title_test,
                             outputs_info=outputs_info,
                             n_steps=bucket_width,
                             name='test_scan')

        self.predict = theano.function(inputs=[articles, titles],
                                       outputs=y_max.T,
                                       name='infer')
    def __init__(self, We, char_embedd_table_initial, params):

        lstm_layers_num = 1
        emb_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = params.en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        char_input_var = tensor.itensor3(name='char-inputs')
        ci = tensor.itensor3()

        use_dropout = tensor.fscalar()
        use_dropout0 = tensor.fscalar()

        self.lookuptable = theano.shared(We)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(
                self.de_hidden_size + 2 * self.en_hidden_size,
                self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)
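        # self.linear maps the concatenation of the decoder state (de_hidden_size)
        # and the bidirectional encoder state (2 * en_hidden_size) to label scores.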

        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
        self.params += [
            self.lookuptable, self.linear, self.linear_bias,
            self.de_lookuptable
        ]  #the initial hidden state of decoder lstm is zeros
        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1], emb_size))

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for pooling (pool size should go through all time step of cnn)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(
            pool_layer, (-1, encoderInputs.shape[0], [1]))

        char_params = lasagne.layers.get_all_params(output_cnn_layer,
                                                    trainable=True)
        self.params += char_params

        char_state_below = lasagne.layers.get_output(output_cnn_layer)

        char_state_below = dropout_layer(char_state_below, use_dropout, trng)

        char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
        state_below = tensor.concatenate([state_below, char_state_shuff],
                                         axis=2)
        state_below = dropout_layer(state_below, use_dropout, trng)

        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(emb_size + num_filters, self.en_hidden_size)
            enclstm_b = LSTM(emb_size + num_filters, self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
            Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
            #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
            #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
            self.hos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            self.Cos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
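            # the decoder states are initialized to zeros here; the commented
            # lines above show the alternative of projecting the final encoder
            # states through a learned hidden_decode transform.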
            state_below = hs

        Encoder = state_below

        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        ##### Here we include the representation from the decoder
        decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                                  axis=2)
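        # the decoder hidden states are concatenated with the encoder outputs so
        # the linear projection can condition on both when scoring each label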

        ei, di, dt = tensor.imatrices(3)  #place holders
        em, dm, tf, di0 = tensor.fmatrices(4)
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs,
                                    self.linear) + self.linear_bias[None,
                                                                    None, :]
        softmax_outputs, _ = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)
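        # masked NLL averaged over the unmasked target tokens plus an L2 penalty
        # over all trainable parameters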

        #updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        #self._train = theano.function(
        #	inputs=[ei, em, di, dm, dt],
        #	outputs=[loss, softmax_outputs],
        #	updates=updates,
        #	givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt}
        #	)

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use the previous prediction as the next input
        def _step2(ctx_, state_, hs_, Cs_):

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1.)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            state_below0 = state_below0.reshape(
                (encoderInputs.shape[1], self.de_hidden_size))
            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0,
                                 self.linear) + self.linear_bias[None, :]
            state_below = tensor.nnet.softmax(newpred)
            ##### the begin symbol probability is 0
            extra_p = tensor.zeros_like(hs[:, :, 0])
            state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs
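        # _step2 runs one decoding step: embed the previous argmax prediction,
        # pass it through the decoder LSTM stack, concatenate with the encoder
        # context ctx_, project to label scores, and append a zero probability for
        # the begin symbol (the extra row of de_lookuptable) so it is never predicted.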

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        #from adam import adam
        #train_updates = adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        from momentum import momentum
        train_updates = momentum(train_loss,
                                 self.params,
                                 params.eta,
                                 momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, ci, em, di0, dm, dt, use_dropout0],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                char_input_var: ci,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt,
                use_dropout: use_dropout0
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, ci, em, di0, use_dropout0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          char_input_var: ci,
                                          encoderMask: em,
                                          decoderInputs0: di0,
                                          use_dropout: use_dropout0
                                      })
Ejemplo n.º 13
0
    def __init__(self,
                 hidden_size=100,
                 nclasses=73,
                 num_embeddings=11359,
                 embedding_dim=100,
                 window_size=1,  # TODO: do we want some kind of window?
                 memory_size=40,
                 n_memory_slots=8,
                 go_code=1,
                 load_dir=None):

        articles, titles = T.imatrices('articles', 'titles')
        n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
        n_title_slots = n_memory_slots - n_article_slots
        n_instances = articles.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            'emb': (num_embeddings + 1, embedding_dim),
            'M_a': (memory_size, n_article_slots),
            'M_t': (memory_size, n_title_slots),
            'w_a': (n_article_slots,),
            'w_t': (n_title_slots,),
            'Wg_a': (window_size * embedding_dim, n_article_slots),
            'Wg_t': (window_size * embedding_dim, n_title_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_a': (hidden_size, n_article_slots),
            'We_t': (hidden_size, n_title_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size
        }

        zeros = {
            # attr: shape
            'bg_a': n_article_slots,
            'bg_t': n_title_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_a': n_article_slots,
            'be_t': n_title_slots,
            'bh': hidden_size,
            'b': nclasses,
        }

        def random_shared(name):
            shape = randoms[name]
            return theano.shared(
                0.2 * numpy.random.normal(size=shape).astype(theano.config.floatX),
                name=name)

        def zeros_shared(name):
            shape = zeros[name]
            return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX), name=name)

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(key))

        for key in zeros:
            # create an attribute with associated shape and values equal to 0
            setattr(self, key, zeros_shared(key))
        if load_dir is not None:
            print('!!!!!!!!!!!!!!')
            self.load(load_dir)

        self.names = randoms.keys() + zeros.keys()
        self.names.remove('emb')  # no need to save or update embeddings
        scan_vars = 'h0 w_a M_a w_t M_t'.split()

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

        for key in scan_vars:
            setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))
            self.names.remove(key)

        def recurrence(i,
                       h_tm1,
                       w_a,
                       M_a,
                       w_t=None,
                       M_t=None,
                       is_training=True,
                       is_article=True):
            """
            notes
            Headers from paper in all caps
            mem = n_article_slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param w_t: attention weights for titles memory
            :param M_t: titles memory
            :param is_training:
            :param is_article: we use different parts of memory when working with an article
            :return: [y = model outputs,
                      i + 1 = increment index,
                      h, w, M (see above)]
            """
            # i_type = T.iscalar if is_article or is_training else T.ivector
            # assert i.type == i_type
            #
            # if not is_article:
            #     assert w_t is not None and M_t is not None
            #
            # word_idxs = i
            # if is_article or is_training:
            #     # get representation of word window
            #     document = articles if is_article else titles  # [instances, bucket_width]
            #     word_idxs = document[:, i]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]
            #
            # if is_article:
            #     M_read = M_a  # [instances, memory_size, n_article_slots]
            #     w_read = w_a  # [instances, n_article_slots]
            # else:
            #     M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_title_slots]
            #     w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_title_slots]
            #
            # # eqn 15
            # c = T.batched_dot(M_read, w_read)  # [instances, memory_size]
            #
            # # EXTERNAL MEMORY READ
            # def get_attention(Wg, bg, M, w):
            #     g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]
            #
            #     # eqn 11
            #     k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]
            #
            #     # eqn 13
            #     beta = T.dot(h_tm1, self.Wb) + self.bb
            #     beta = T.nnet.softplus(beta)
            #     beta = T.addbroadcast(beta, 1)  # [instances, 1]
            #
            #     # eqn 12
            #     w_hat = T.nnet.softmax(beta * cosine_dist(M, k))
            #
            #     # eqn 14
            #     return (1 - g) * w + g * w_hat  # [instances, mem]
            #
            # w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
            # if not is_article:
            #     w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]
            #
            # # MODEL INPUT AND OUTPUT
            # # eqn 9
            # h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]
            #
            # # eqn 10
            # y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]
            #
            # # EXTERNAL MEMORY UPDATE
            # def update_memory(We, be, w_update, M_update):
            #     # eqn 17
            #     e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            #     f = 1. - w_update * e  # [instances, mem]
            #
            #     # eqn 16
            #     v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]
            #
            #     # need to add broadcast layers for memory update
            #     f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            #     u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            #     v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]
            #
            #     # eqn 19
            #     return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]
            #
            # M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            # attention_and_memory = [w_a, M_a]
            # if not is_article:
            #     M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            #     attention_and_memory += [w_t, M_t]
            #
            # y_max = y.argmax(axis=1).astype(int32)
            # next_idxs = i + 1 if is_training or is_article else y_max
            # return [y, y_max, next_idxs, h] + attention_and_memory + self.params()
            return self.params()
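            # the memory read/write logic above is commented out; this stub only
            # returns the parameter list so the graph built in self.test below
            # still compiles.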

        read_article = partial(recurrence, is_article=True)
        i0 = T.constant(0, dtype=int32, name='first_value_of_i')
        outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a]

        self.test = theano.function([articles, titles],
                                    recurrence(*outputs_info[2:]),
                                    on_unused_input='ignore')
Ejemplo n.º 14
0
    def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359, embedding_dim=100, window_size=1,
                 memory_size=40, n_memory_slots=8, go_code=1, depth=2, load_dir=None):

        articles, titles = T.imatrices('articles', 'titles')
        n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
        n_title_slots = n_memory_slots - n_article_slots
        n_instances = articles.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            # 'emb': (num_embeddings + 1, embedding_dim),
            'M_a': (memory_size, n_article_slots),
            'M_t': (memory_size, n_title_slots),
            'w_a': (n_article_slots,),
            'w_t': (n_title_slots,),
            'Wg_a': (window_size * embedding_dim, n_article_slots),
            'Wg_t': (window_size * embedding_dim, n_title_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_a': (hidden_size, n_article_slots),
            'We_t': (hidden_size, n_title_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size
        }

        zeros = {
            # attr: shape
            'bg_a': n_article_slots,
            'bg_t': n_title_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_a': n_article_slots,
            'be_t': n_title_slots,
            'bh': hidden_size,
            'b': nclasses,
        }

        for l in range(depth):
            randoms['gru' + str(l)] = (1, embedding_dim)

        def random_shared(name):
            shape = randoms[name]
            return theano.shared(
                0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
                name=name)

        def zeros_shared(name):
            shape = zeros[name]
            return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(key))

        for key in zeros:
            # create an attribute with associated shape and values equal to 0
            setattr(self, key, zeros_shared(key))

        self.names = randoms.keys() + zeros.keys()
        # self.names.remove('emb')  # no need to save or update embeddings
        scan_vars = 'h0 w_a M_a w_t M_t'.split()

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

        for key in scan_vars:
            setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))
            self.names.remove(key)

        if load_dir is not None:
            with open(os.path.join(load_dir, 'params.pkl')) as handle:
                params = pickle.load(handle)
                self.__dict__.update(params)

        def recurrence(i,
                       h_tm1,
                       w_a,
                       M_a,
                       *args,
                       **kwargs):
            """
            notes
            Headers from paper in all caps
            mem = n_article_slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param args: gru_weights, maybe w_t, maybe M_t
                   gru_weights: weights with which to initialize GRULayer on each time step
                   w_t: attention weights for titles memory
                   M_t: titles memory
            :param kwargs: is_training, is_article
                   is_training:
                   is_article: we use different parts of memory when working with an article
            :return: [y = model outputs,
                      i + 1 = increment index,
                      h, w, M (see above)]
            """
            is_training = kwargs['is_training']
            is_article = kwargs['is_article']
            gru_weights = args[:depth]
            if len(args) > depth:
                w_t = args[depth]
                M_t = args[depth + 1]

            i_type = T.iscalar if is_article or is_training else T.ivector
            assert i.type == i_type

            if not is_article:
                assert w_t is not None and M_t is not None

            word_idxs = i
            if is_article or is_training:
                # get representation of word window
                document = articles if is_article else titles  # [instances, bucket_width]
                word_idxs = document[:, i:i+1]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

            input = InputLayer(shape=(None, 1),
                               input_var=word_idxs)
            embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
            gru = GRULayer(incoming=embed, num_units=embedding_dim, hid_init=self.gru0)
            for weight in gru_weights:
                gru = GRULayer(incoming=gru, num_units=embedding_dim,
                               hid_init=weight)
            x_i = get_output(gru).flatten(ndim=2)
            x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

            gru_weights = []

            if is_article:
                M_read = M_a  # [instances, memory_size, n_article_slots]
                w_read = w_a  # [instances, n_article_slots]
            else:
                M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_title_slots]
                w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_title_slots]

            # eqn 15
            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            # EXTERNAL MEMORY READ
            def get_attention(Wg, bg, M, w):
                g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.nnet.softplus(beta)
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g) * w + g * w_hat  # [instances, mem]

            w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
            if not is_article:
                w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

            # eqn 10
            y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

            M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            attention_and_memory = [w_a, M_a]
            if not is_article:
                M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
                attention_and_memory += [w_t, M_t]

            y_max = y.argmax(axis=1).astype(int32)
            next_idxs = i + 1 if is_training or is_article else y_max
            return [y, y_max, next_idxs, h] + attention_and_memory

        read_article = partial(recurrence, is_training=True, is_article=True)
        # for read_article, it actually doesn't matter whether is_training is true

        i0 = T.constant(0, dtype=int32, name='first_value_of_i')
        gru_weights = [eval('self.gru' + str(l)) for l in range(depth)]
        outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

        [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                            outputs_info=outputs_info,
                                            n_steps=articles.shape[1],
                                            name='read_scan')

        produce_title = partial(recurrence, is_training=True, is_article=False)
        outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
        outputs_info.extend([self.w_t, self.M_t])
        bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
        [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                      outputs_info=outputs_info,
                                                      n_steps=bucket_width,
                                                      name='train_scan')

        # loss and updates
        y_clip = T.clip(y, .01, .99)
        y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
        losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
        loss = objectives.aggregate(losses, weights, mode='sum')
        updates = adadelta(loss, self.params())

        self.learn = theano.function(inputs=[articles, titles],
                                     outputs=[y_max.T, loss],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     name='learn')

        produce_title_test = partial(recurrence, is_training=False, is_article=False)

        self.test = theano.function(inputs=[articles, titles],
                                    outputs=[y_max.T],
                                    on_unused_input='ignore')

        outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
        [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                      outputs_info=outputs_info,
                                                      n_steps=bucket_width,
                                                      name='test_scan')

        self.predict = theano.function(inputs=[articles, titles],
                                       outputs=y_max.T,
                                       name='infer')
Ejemplo n.º 15
0
def zeros_shared(shape):
    return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))


emb = T.constant(np.tile(np.arange(20).reshape(-1, 1), embedding_dim))

for (attributes, initializer) in ((weights, random_shared),
                                  (biases, zeros_shared)):
    for key in attributes:
        exec key + '= initializer(attributes[key])'

names = weights.keys() + biases.keys()
params = map(eval, names)

questions, docs = T.imatrices(2)  # as many columns as the context-window size, as many rows as there are words in the sentence
is_question = True
inputs = questions if is_question else docs


def repeat_for_each_instance(param):
    """ repeat param along new axis once for each instance """
    return T.repeat(T.shape_padleft(param), repeats=inputs.shape[0], axis=0)


h0, w0, M0 = map(repeat_for_each_instance, [h0, w0, M0])
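# each shared parameter is tiled along a new leading axis so every instance in
# the input batch gets its own copy of h0, w0 and M0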


def recurrence(i, h_tm1, w_previous, M_previous, is_question):
    # get representation of word window
    idxs = questions if is_question else docs  # [instances, bucket_width]
Ejemplo n.º 16
0
    def __init__(self, We, params):

        lstm_layers_num = 1
        en_hidden_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        self.lookuptable = theano.shared(We)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(name="Linear",
                                    value=init_xavier_uniform(
                                        self.de_hidden_size, self.num_labels),
                                    borrow=True)

        self.hidden_decode = theano.shared(name="Hidden to Decode",
                                           value=init_xavier_uniform(
                                               2 * en_hidden_size,
                                               self.de_hidden_size),
                                           borrow=True)

        self.hidden_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        self.params += [
            self.linear, self.de_lookuptable, self.hidden_decode,
            self.hidden_bias
        ]  #concatenate

        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1],
             self.en_hidden_size))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(self.en_hidden_size)
            enclstm_b = LSTM(self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
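            # project the final bidirectional encoder states into the decoder's
            # hidden size to initialize each decoder layer's (h, C)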
            self.hos += tensor.tanh(
                tensor.dot(hs[-1], self.hidden_decode) + self.hidden_bias),
            self.Cos += tensor.tanh(
                tensor.dot(Cs[-1], self.hidden_decode) + self.hidden_bias),
            state_below = hs

        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        decoder_lstm_outputs = state_below

        ei, di, dt = tensor.imatrices(3)  #place holders
        em, dm, tf, di0 = tensor.fmatrices(4)
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum()

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use the previous prediction as the next input
        def _step2(state_, hs_, Cs_):

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)

            newpred = tensor.dot(state_below0, self.linear).reshape(
                (encoderInputs.shape[1], self.num_labels))
            state_below = tensor.nnet.softmax(newpred)

            return state_below, hs, Cs

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum()

        train_updates = lasagne.updates.adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0
                                      })
Ejemplo n.º 17
0
	def __init__(self, voca_size, hidden_size, lstm_layers_num, learning_rate=0.2):
		self.voca_size = voca_size
		self.hidden_size = hidden_size
		self.lstm_layers_num = lstm_layers_num
		self.learning_rate = learning_rate
		self._train = None
		self._utter = None
		self.params = []
		self.encoder_lstm_layers = []
		self.decoder_lstm_layers = []
		self.hos = []
		self.Cos = []

		encoderInputs, encoderMask = tensor.imatrices(2)
		decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

		self.lookuptable = theano.shared(
			name="Encoder LookUpTable",
			value=utils.init_norm(self.voca_size, self.hidden_size),
			borrow=True
			)
		self.linear = theano.shared(
			name="Linear",
			value=utils.init_norm(self.hidden_size, self.voca_size),
			borrow=True
			)
		self.params += [self.lookuptable, self.linear]    #concatenate
		
		#(max_sent_size, batch_size, hidden_size)
		state_below = self.lookuptable[encoderInputs.flatten()].reshape((encoderInputs.shape[0], encoderInputs.shape[1], self.hidden_size))
		for _ in range(self.lstm_layers_num):
			enclstm = LSTM(self.hidden_size)
			self.encoder_lstm_layers += enclstm,    #append
			self.params += enclstm.params    #concatenate
			hs, Cs = enclstm.forward(state_below, encoderMask)
			self.hos += hs[-1],
			self.Cos += Cs[-1],
			state_below = hs

		state_below = self.lookuptable[decoderInputs.flatten()].reshape((decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
		for i in range(self.lstm_layers_num):
			declstm = LSTM(self.hidden_size)
			self.decoder_lstm_layers += declstm,    #append
			self.params += declstm.params    #concatenate
			ho, Co = self.hos[i], self.Cos[i]
			state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)
		decoder_lstm_outputs = state_below

		ei, em, di, dm, dt = tensor.imatrices(5)    #place holders
		#####################################################
		#####################################################
		linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
		softmax_outputs, updates = theano.scan(
			fn=lambda x: tensor.nnet.softmax(x),
			sequences=[linear_outputs],
			)

		def _NLL(pred, y, m):
			return -m * tensor.log(pred[tensor.arange(decoderInputs.shape[1]), y])
		costs, updates = theano.scan(fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
		loss = costs.sum() / decoderMask.sum()

		gparams = [tensor.grad(loss, param) for param in self.params]
		updates = [(param, param - self.learning_rate*gparam) for param, gparam in zip(self.params, gparams)]

		self._train = theano.function(
			inputs=[ei, em, di, dm, dt],
			outputs=[loss, costs],
			updates=updates,
			givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt}
			)
		#####################################################
		#####################################################
		hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0")
		token_idxs = tensor.fill( tensor.zeros_like(decoderInputs, dtype="int32"), utils.idx_start)
		msk = tensor.fill( (tensor.zeros_like(decoderInputs, dtype="int32")), 1)

		def _step(token_idxs, hs_, Cs_):
			hs, Cs = [], []
			state_below = self.lookuptable[token_idxs].reshape((decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
			for i, lstm in enumerate(self.decoder_lstm_layers):
				h, C = lstm.forward(state_below, msk, hs_[i], Cs_[i])    #mind msk
				hs += h[-1],
				Cs += C[-1],
				state_below = h
			hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
			next_token_idx = tensor.cast( tensor.dot(state_below, self.linear).argmax(axis=-1), "int32" )
			return next_token_idx, hs, Cs

		outputs, updates = theano.scan(
			fn=_step,
			outputs_info=[token_idxs, hs0, Cs0],
			n_steps=utils.max_sent_size
			)
		listof_token_idx = outputs[0]
		self._utter = theano.function(
			inputs=[ei, em, di],
			outputs=listof_token_idx,
			givens={encoderInputs:ei, encoderMask:em, decoderInputs:di}
			#givens={encoderInputs:ei, encoderMask:em}
			)
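A minimal, hypothetical usage sketch for the constructor above. The snippet does not show the class name or module, so the `Seq2Seq` name and the `seq2seq` import below are assumptions, as is the availability of the `LSTM` layer and the `utils` helpers (`init_norm`, `idx_start`, `max_sent_size`) it relies on; vocabulary size, sequence length and batch size are toy values.

import numpy as np
from seq2seq import Seq2Seq  # hypothetical module and class name for the __init__ above

voca_size, T, B = 1000, 20, 8
model = Seq2Seq(voca_size=voca_size, hidden_size=128, lstm_layers_num=1, learning_rate=0.2)

# every compiled-function input is an int32 matrix of shape (max_sent_size, batch_size)
enc_in   = np.random.randint(0, voca_size, size=(T, B)).astype('int32')
enc_mask = np.ones((T, B), dtype='int32')
dec_in   = np.random.randint(0, voca_size, size=(T, B)).astype('int32')
dec_mask = np.ones((T, B), dtype='int32')
dec_tgt  = np.random.randint(0, voca_size, size=(T, B)).astype('int32')

loss, costs = model._train(enc_in, enc_mask, dec_in, dec_mask, dec_tgt)  # one plain-SGD step
tokens = model._utter(enc_in, enc_mask, dec_in)  # greedy decoding, int32 token indices

The same call pattern applies to the later copy of this model (the 4-space-indented version at the end of this section).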
def load_model(model_selected):
    input_var = T.tensor3('input')
    target_var, mislabel = T.imatrices('target', 'mis_label')

    generator = modified_Generator(
        data_folder_train=train_data_dir,
        data_folder_valid=valid_data_dir,
        batch_size=1,
        num_feed_train=1,  # number of training samples from each class as baseline
        nb_classes=nb_classes,
        nb_samples_per_class=nb_samples_per_class,
        img_size=img_size,
        max_rotation=0,
        max_shift=0,
        var=0,
        amount=0,
        max_iter=None)

    # Model
    output_var, output_var_flatten, params1 = memory_augmented_neural_network(
        input_var,
        target_var,
        batch_size=generator.batch_size,
        nb_class=generator.nb_classes,
        memory_shape=memory_shape,
        controller_size=controller_size,
        input_size=img_size[0] * img_size[1],
        nb_reads=nb_reads)
    accuracies = accuracy_instance(
        T.argmax(output_var, axis=2),
        target_var,
        mislabel,
        nb_classes=generator.nb_classes,
        nb_samples_per_class=generator.nb_samples_per_class,
        batch_size=generator.batch_size)
    cost = T.mean(
        T.nnet.categorical_crossentropy(output_var_flatten,
                                        target_var.flatten()))

    posterior_fn = theano.function([input_var, target_var], output_var)
    accuracy_fn = theano.function([input_var, target_var, mislabel],
                                  accuracies)
    cost_fn = theano.function([input_var, target_var], cost)

    # load the best trained model
    with open(model_selected, 'rb') as f:
        loaded_params = cPickle.load(f)

    # set the parameters
    for i in range(len(loaded_params)):
        params1[i].set_value(loaded_params[i])

    d = dict([])
    all_acc = all_mis = np.zeros((0, generator.nb_samples_per_class))
    all_loss, accs = [], np.zeros(generator.nb_samples_per_class)
    all_names = all_classes = np.zeros(
        (0, generator.nb_samples_per_class * generator.nb_classes),
        dtype=np.int32)
    for i, (test_input, test_target, image_names) in generator:
        test_output = np.argmax(posterior_fn(test_input, test_target), axis=2)
        test_mislabel = count_mislabel(generator, image_names)
        acc1, _, mis = accuracy_fn(test_input, test_target, test_mislabel)
        d = update_dict(d, acc1, image_names, generator.num_feed_train)
        all_acc = np.concatenate(
            (all_acc, acc1.reshape([-1, generator.nb_samples_per_class])))
        all_mis = np.concatenate(
            (all_mis, mis.reshape([-1, generator.nb_samples_per_class])))
        cls_label, image_name = prepare_img_names_to_save(image_names)

        all_names = np.concatenate(
            (all_names,
             image_name.reshape(
                 -1, generator.nb_samples_per_class * generator.nb_classes)),
            axis=0)
        all_classes = np.concatenate(
            (all_classes,
             cls_label.reshape(
                 -1, generator.nb_samples_per_class * generator.nb_classes)),
            axis=0)

        loss = cost_fn(test_input, test_target)
        all_loss.append(loss)
        accs += acc1
        if i > 0 and not (i % DISPLAY_FREQ):
            print('Episode %05d: %.6f' % (i, loss))
            print(accs / 100.)
            accs = np.zeros(generator.nb_samples_per_class)
            # save the loss, accuracy, name, class and mislabel records every MODEL_FREQ episodes
            if i > 0 and not (i % MODEL_FREQ):
                mislabel_count = prepare_dict_to_save(d)
                h5f = h5py.File(test_path + '/test_Results.h5', 'w')
                h5f.create_dataset('all_acc_episode', data=all_acc)
                h5f.create_dataset('loss_episode', data=all_loss)
                h5f.create_dataset('names_episode', data=all_names)
                h5f.create_dataset('classes_episode', data=all_classes)
                h5f.create_dataset('mis_episode', data=all_mis)
                h5f.create_dataset('mislabel_count', data=mislabel_count)
                h5f.close()
                print(
                    '****************************************************************************************'
                )
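For reference, a short sketch of reading back the results file written above. It assumes only that the HDF5 file created by this function exists; the literal filename below is a placeholder for `test_path + '/test_Results.h5'`.

import h5py
import numpy as np

with h5py.File('test_Results.h5', 'r') as h5f:
    all_acc   = np.array(h5f['all_acc_episode'])   # per-episode accuracy rows
    all_loss  = np.array(h5f['loss_episode'])      # per-episode cross-entropy loss
    names     = np.array(h5f['names_episode'])     # image names per episode
    classes   = np.array(h5f['classes_episode'])   # class labels per episode
    mis       = np.array(h5f['mis_episode'])       # mislabel flags per episode
    mislabels = np.array(h5f['mislabel_count'])    # aggregated mislabel counts

print(all_acc.shape, all_loss.shape)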
    def __init__(self, We, params):

        lstm_layers_num = 1
        en_hidden_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        self.lookuptable = theano.shared(We)

        #### the extra (last) row is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size,
                                      self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
        self.params += [self.linear, self.linear_bias, self.de_lookuptable
                        ]  #the initial hidden state of decoder lstm is zeros
        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1],
             self.en_hidden_size))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(self.en_hidden_size)
            enclstm_b = LSTM(self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
            Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
            #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
            #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
            self.hos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            self.Cos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            state_below = hs

        Encoder = state_below

        ei, di, dt = tensor.imatrices(3)  #placeholders
        em, dm, tf, di0 = tensor.fmatrices(4)

        self.encoder_function = theano.function(inputs=[ei, em],
                                                outputs=Encoder,
                                                givens={
                                                    encoderInputs: ei,
                                                    encoderMask: em
                                                })

        #####################################################
        #####################################################
        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        ##### concatenate the encoder representation onto the decoder LSTM outputs
        decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                                  axis=2)

        linear_outputs = tensor.dot(decoder_lstm_outputs,
                                    self.linear) + self.linear_bias[None,
                                                                    None, :]
        softmax_outputs, _ = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always feed the previous prediction back as the next input
        def _step2(ctx_, state_, hs_, Cs_):
            ### ctx_: b x h
            ### state_ : b x h
            ### hs_ : 1 x b x h    the first dimension is the number of the decoder layers
            ### Cs_ : 1 x b x h    the first dimension is the number of the decoder layers

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, ctx_.shape[0], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            state_below0 = state_below0.reshape(
                (ctx_.shape[0], self.de_hidden_size))
            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0,
                                 self.linear) + self.linear_bias[None, :]
            state_below = tensor.nnet.softmax(newpred)

            ##### the begin (start) symbol probability is 0
            extra_p = tensor.zeros_like(hs[:, :, 0])
            state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        ctx_0, state_0 = tensor.fmatrices(2)
        hs_0 = tensor.ftensor3()
        Cs_0 = tensor.ftensor3()

        state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
        self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                      [state_below_tmp, hs_tmp, Cs_tmp],
                                      name='f_next')

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        ##from adam import adam
        ##train_updates = adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        from momentum import momentum
        train_updates = momentum(train_loss,
                                 self.params,
                                 params.eta,
                                 momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0
                                      })
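A hypothetical training/decoding call for the model above. The class name `Seq2SeqLabeler`, the `models` module, and all shapes are assumptions for illustration; `We` stands in for a pre-trained embedding matrix, the mask/target matrices follow the (timesteps, batch) layout used throughout this example, `params` carries only the fields the constructor reads, and `di0` is assumed to be a one-hot start-symbol row per batch element (length num_labels + 1, matching the extra probability column appended in _step2).

import numpy as np
from argparse import Namespace
from models import Seq2SeqLabeler  # hypothetical module and class name for the __init__ above

vocab, emb_dim, num_labels = 5000, 100, 25
We = np.random.randn(vocab, emb_dim).astype('float32')      # stand-in embedding matrix
params = Namespace(eta=0.01, num_labels=num_labels, de_hidden_size=50,
                   lstm_layers_num=1, L2=1e-6)               # only the fields read above
model = Seq2SeqLabeler(We, params)

T, B = 15, 4
ei  = np.random.randint(0, vocab, size=(T, B)).astype('int32')        # encoder word ids
em  = np.ones((T, B), dtype='float32')                                # encoder mask
dm  = np.ones((T, B), dtype='float32')                                # decoder mask
dt  = np.random.randint(0, num_labels, size=(T, B)).astype('int32')   # gold label ids
di0 = np.zeros((B, num_labels + 1), dtype='float32')                  # initial decoder output
di0[:, num_labels] = 1.                                               # assumed start-symbol slot

loss, pred = model._train2(ei, em, di0, dm, dt)   # one momentum step, feeding back own predictions
labels = model._utter(ei, em, di0)                # greedy label sequence of shape (T, B)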
Example No. 20
0
    def __init__(self,
                 voca_size,
                 hidden_size,
                 lstm_layers_num,
                 learning_rate=0.2):
        self.voca_size = voca_size
        self.hidden_size = hidden_size
        self.lstm_layers_num = lstm_layers_num
        self.learning_rate = learning_rate
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs, encoderMask = tensor.imatrices(2)
        decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

        self.lookuptable = theano.shared(name="Encoder LookUpTable",
                                         value=utils.init_norm(
                                             self.voca_size, self.hidden_size),
                                         borrow=True)
        self.linear = theano.shared(name="Linear",
                                    value=utils.init_norm(
                                        self.hidden_size, self.voca_size),
                                    borrow=True)
        self.params += [self.lookuptable, self.linear]  #concatenate

        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1], self.hidden_size))
        for _ in range(self.lstm_layers_num):
            enclstm = LSTM(self.hidden_size)
            self.encoder_lstm_layers += enclstm,  #append
            self.params += enclstm.params  #concatenate
            hs, Cs = enclstm.forward(state_below, encoderMask)
            self.hos += hs[-1],
            self.Cos += Cs[-1],
            state_below = hs

        state_below = self.lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)
        decoder_lstm_outputs = state_below

        ei, em, di, dm, dt = tensor.imatrices(5)  #placeholders
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(decoderInputs.shape[1]),
                                        y])

        costs, updates = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum()

        gparams = [tensor.grad(loss, param) for param in self.params]
        updates = [(param, param - self.learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, costs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })
        #####################################################
        #####################################################
        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        token_idxs = tensor.fill(
            tensor.zeros_like(decoderInputs, dtype="int32"), utils.idx_start)
        msk = tensor.fill((tensor.zeros_like(decoderInputs, dtype="int32")), 1)

        def _step(token_idxs, hs_, Cs_):
            hs, Cs = [], []
            state_below = self.lookuptable[token_idxs].reshape(
                (decoderInputs.shape[0], decoderInputs.shape[1],
                 self.hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below, msk, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below = h
            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            next_token_idx = tensor.cast(
                tensor.dot(state_below, self.linear).argmax(axis=-1), "int32")
            return next_token_idx, hs, Cs

        outputs, updates = theano.scan(fn=_step,
                                       outputs_info=[token_idxs, hs0, Cs0],
                                       n_steps=utils.max_sent_size)
        listof_token_idx = outputs[0]
        self._utter = theano.function(
            inputs=[ei, em, di],
            outputs=listof_token_idx,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs: di
            }
            #givens={encoderInputs:ei, encoderMask:em}
        )
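For intuition, the greedy decoder compiled by the scan above does, step by step, roughly the following. This is a pure-NumPy sketch with a stand-in recurrence instead of the real LSTM, simplified to one token per batch element per step (the original keeps a full (timesteps, batch) index matrix); the shapes and the start index are illustrative, not taken from the original code.

import numpy as np

max_sent_size, batch, hidden, vocab, idx_start = 10, 2, 8, 20, 1
lookup = np.random.randn(vocab, hidden)   # stand-in for self.lookuptable
linear = np.random.randn(hidden, vocab)   # stand-in for self.linear

def fake_lstm(x, h):
    # stand-in for LSTM.forward: any recurrence of input and previous state
    return np.tanh(x + h)

tokens = np.full((batch,), idx_start, dtype=np.int32)   # start symbol everywhere
h = np.zeros((batch, hidden))
decoded = []
for _ in range(max_sent_size):
    x = lookup[tokens]                           # embed the previous tokens
    h = fake_lstm(x, h)                          # recurrent state update
    tokens = h.dot(linear).argmax(axis=-1)       # greedy pick of the next token
    decoded.append(tokens)
decoded = np.stack(decoded)                      # (max_sent_size, batch)
print(decoded)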